diff --git a/notebooks/01.1-exploration-re3data.ipynb b/notebooks/01.1-exploration-re3data.ipynb index 6c14da2..820ed23 100644 --- a/notebooks/01.1-exploration-re3data.ipynb +++ b/notebooks/01.1-exploration-re3data.ipynb @@ -1,20 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Information to check\n", - "- names\n", - "- description\n", - "- url\n", - "- subjects & keywords\n", - "- content type\n", - "- repo type\n", - "- policies\n", - "\n" - ] - }, { "cell_type": "code", "execution_count": 1, @@ -69,327 +54,353 @@ " \n", " \n", " \n", - " openaire_id\n", - " re3data_id\n", - " repository_name\n", - " additional_name\n", - " repository_url\n", - " repository_id\n", + " orgIdentifier\n", + " repositoryName\n", + " repositoryName.language\n", + " additionalName\n", + " repositoryURL\n", + " repositoryIdentifier\n", + " repositoryContact\n", " description\n", + " description.language\n", " type\n", " size\n", - " update_date\n", - " start_date\n", - " end_date\n", + " startDate\n", + " endDate\n", + " repositoryLanguage\n", " subject\n", - " mission_statement\n", - " content_type\n", - " provider_type\n", + " missionStatementURL\n", + " contentType\n", + " providerType\n", " keyword\n", " institution\n", " policy\n", - " database_access\n", - " database_license\n", - " data_access\n", - " data_license\n", - " data_upload\n", - " data_upload_license\n", + " databaseAccess\n", + " databaseLicense\n", + " dataAccess\n", + " dataLicense\n", + " dataUploadType\n", + " dataUploadLicense\n", " software\n", " versioning\n", " api\n", - " pid_system\n", - " citation_guideline_url\n", - " aid_system\n", - " enhanced_publication\n", - " quality_management\n", + " pidSystem\n", + " citationGuidelineURL\n", + " aidSystem\n", + " enhancedPublication\n", + " qualityManagement\n", " certificate\n", - " metadata_standard\n", + " metadataStandard\n", " syndication\n", " remarks\n", - " entry_date\n", - " last_update\n", + " entryDate\n", + " lastUpdate\n", " \n", " \n", " \n", " \n", " 0\n", - " re3data_____::91780fe96da5ba32f804e43359c154ba\n", " r3d100000001\n", " Odum Institute Archive Dataverse\n", + " eng\n", " []\n", " https://dataverse.unc.edu/dataverse/odum\n", " []\n", + " [\"https://dataverse.unc.edu/dataverse/odum#\", ...\n", " The Odum Institute Archive Dataverse contains ...\n", + " eng\n", " [disciplinary]\n", - " 13 dataverses; 3.050 datasets\n", - " 2020-12-04\n", + " {\"size\": \"13 dataverses; 3.050 datasets\", \"upd...\n", " NaN\n", " NaN\n", - " [1 Humanities and Social Sciences, 111 Social ...\n", - " false\n", - " [Databases, Plain text, Scientific and statist...\n", + " [\"eng\"]\n", + " [{'name': '1 Humanities and Social Sciences', ...\n", + " NaN\n", + " [{'name': 'Databases', 'scheme': 'parse'}, {'n...\n", " [dataProvider]\n", " [FAIR, Middle East, crime, demography, economy...\n", - " [[Odum Institute for Research in Social Scienc...\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", - " false\n", - " true\n", + " [{'institutionName': 'Odum Institute for Resea...\n", + " [{\"policyName\": \"Collection Development Policy...\n", + " {\"databaseAccessType\": \"open\", \"databaseAcces...\n", + " [{\"databaseLicenseName\": \"CC0\", \"databaseLicen...\n", + " [{\"dataAccessType\": \"embargoed\", \"dataAccessRe...\n", + " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...\n", + " restricted\n", + " []\n", + " [\"DataVerse\"]\n", " NaN\n", - " false\n", - " true\n", - " true\n", - " true\n", + " {}\n", + " [\"DOI\"]\n", + " NaN\n", + " []\n", " unknown\n", " yes\n", - " true\n", - " true\n", - " false\n", + " [\"other\"]\n", + " [{\"metadataStandardName\": \"DDI - Data Document...\n", + " {}\n", " Odum Dataverse is covered by Thomson Reuters D...\n", " 2013-06-10\n", " 2021-07-06\n", " \n", " \n", " 1\n", - " re3data_____::cc3ea05c863cd49af75f7f54e0e86f09\n", " r3d100000002\n", " Access to Archival Databases\n", - " [AAD]\n", + " eng\n", + " [{'additionalName': 'AAD', 'additionalNameLang...\n", " https://aad.archives.gov/aad/\n", " [RRID:SCR_010479, RRID:nlx_157752]\n", + " [\"https://www.archives.gov/contact\"]\n", " You will find in the Access to Archival Databa...\n", + " eng\n", " [disciplinary]\n", - " NaN\n", - " NaN\n", + " {\"size\": \"\", \"updatedp\": \"\"}\n", " 1985\n", " NaN\n", - " [1 Humanities and Social Sciences, 102 History...\n", - " true\n", - " [Images, Standard office documents, Structured...\n", + " [\"eng\", \"spa\"]\n", + " [{'name': '1 Humanities and Social Sciences', ...\n", + " https://www.archives.gov/publications/general-...\n", + " [{'name': 'Images', 'scheme': 'parse'}, {'name...\n", " [dataProvider]\n", " [US History]\n", - " [[The U.S. National Archives and Records Admin...\n", - " true\n", - " true\n", - " false\n", - " true\n", - " true\n", - " true\n", - " false\n", - " true\n", + " [{'institutionName': 'The U.S. National Archiv...\n", + " [{\"policyName\": \"Contribution Policy\", \"policy...\n", + " {\"databaseAccessType\": \"open\", \"databaseAcces...\n", + " []\n", + " [{\"dataAccessType\": \"open\", \"dataAccessRestric...\n", + " [{\"dataLicenseName\": \"Copyrights\", \"dataLicens...\n", + " restricted\n", + " []\n", + " [\"unknown\"]\n", " no\n", - " true\n", - " true\n", - " true\n", - " true\n", + " {\"api\": \"https://www.archives.gov/developer#to...\n", + " [\"none\"]\n", + " https://aad.archives.gov/aad/help/getting-star...\n", + " []\n", " unknown\n", " unknown\n", - " false\n", - " false\n", - " true\n", + " []\n", + " []\n", + " {\"syndication\": \"http://www.archives.gov/socia...\n", " NaN\n", " 2012-07-04\n", " 2021-05-25\n", " \n", " \n", " 2\n", - " re3data_____::a2f73fbe91311f4356d0d7957c441773\n", " r3d100000004\n", " Datenbank Gesprochenes Deutsch\n", - " [DGD, DGD2 (formerly), Database for Spoken Ger...\n", + " deu\n", + " [{'additionalName': 'DGD', 'additionalNameLang...\n", " https://dgd.ids-mannheim.de/\n", " []\n", + " [\"dgd@ids-mannheim.de\"]\n", " The \"Database for Spoken German (DGD)\" is a co...\n", + " eng\n", " [disciplinary]\n", - " 34 corpora\n", - " 2020-02-03\n", + " {\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"}\n", " 2012\n", " NaN\n", - " [1 Humanities and Social Sciences, 104 Linguis...\n", - " true\n", - " [Audiovisual data, Standard office documents, ...\n", + " [\"deu\"]\n", + " [{'name': '1 Humanities and Social Sciences', ...\n", + " https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext...\n", + " [{'name': 'Audiovisual data', 'scheme': 'parse...\n", " [dataProvider, serviceProvider]\n", " [Australian German, FOLK, German dialects, Pfe...\n", - " [[Institut für Deutsche Sprache, Archiv für Ge...\n", - " true\n", - " true\n", - " false\n", - " true\n", - " true\n", - " true\n", - " false\n", - " true\n", + " [{'institutionName': 'Institut für Deutsche Sp...\n", + " [{\"policyName\": \"Erfurter Aufruf zur Sicherung...\n", + " {\"databaseAccessType\": \"restricted\", \"databas...\n", + " []\n", + " [{\"dataAccessType\": \"restricted\", \"dataAccessR...\n", + " [{\"dataLicenseName\": \"other\", \"dataLicenseURL\"...\n", + " restricted\n", + " []\n", + " [\"other\"]\n", " yes\n", - " false\n", - " true\n", - " true\n", - " true\n", + " {}\n", + " [\"none\"]\n", + " http://agd.ids-mannheim.de/konditionen.shtml\n", + " []\n", " unknown\n", " unknown\n", - " true\n", - " false\n", - " false\n", + " [\"RatSWD\"]\n", + " []\n", + " {}\n", " NaN\n", " 2012-07-20\n", " 2020-08-27\n", " \n", " \n", " 3\n", - " re3data_____::0394b97eb11f19785cbca1ec830429da\n", " r3d100000005\n", " UNC Dataverse\n", - " [University of North Carolina Dataverse]\n", + " eng\n", + " [{'additionalName': 'University of North Carol...\n", " https://dataverse.unc.edu/\n", " []\n", + " [\"https://dataverse.unc.edu/\", \"odumarchive@un...\n", " UNC Dataverse is an open-source repository sof...\n", + " eng\n", " [institutional]\n", - " 186 dataverses; 25.272 studies; 229.442 files\n", - " 2020-11-30\n", + " {\"size\": \"186 dataverses; 25.272 studies; 229....\n", " 2011\n", " NaN\n", - " [1 Humanities and Social Sciences, 111 Social ...\n", - " true\n", - " [Archived data, Plain text, Raw data, Scientif...\n", + " [\"eng\"]\n", + " [{'name': '1 Humanities and Social Sciences', ...\n", + " https://odum.unc.edu/about/mission-vision/\n", + " [{'name': 'Archived data', 'scheme': 'parse'},...\n", " [dataProvider, serviceProvider]\n", " [FAIR, census, demographic survey, demography,...\n", - " [[Odum Institute for Research in Social Scienc...\n", - " true\n", - " true\n", - " false\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", + " [{'institutionName': 'Odum Institute for Resea...\n", + " [{\"policyName\": \"Collection Development Policy...\n", + " {\"databaseAccessType\": \"open\", \"databaseAcces...\n", + " []\n", + " [{\"dataAccessType\": \"open\", \"dataAccessRestric...\n", + " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...\n", + " restricted\n", + " [{\"dataUploadLicenseName\": \"Data Deposit Form\"...\n", + " [\"DataVerse\"]\n", " yes\n", - " true\n", - " true\n", - " true\n", - " true\n", + " {\"api\": \"https://guides.dataverse.org/en/lates...\n", + " [\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"]\n", + " https://dataverse.org/best-practices/data-cita...\n", + " []\n", " unknown\n", " yes\n", - " false\n", - " true\n", - " false\n", - " The Odum Institute houses one of the oldest an...\n", + " []\n", + " [{\"metadataStandardName\": \"DDI - Data Document...\n", + " {}\n", + " UNC Dataverse is covered by Clarivate Data Cit...\n", " 2012-07-23\n", - " 2020-11-30\n", + " 2021-08-11\n", " \n", " \n", " 4\n", - " re3data_____::a48f09c562b247a9919acfe195549b47\n", " r3d100000006\n", " Archaeology Data Service\n", - " [ADS]\n", + " eng\n", + " [{'additionalName': 'ADS', 'additionalNameLang...\n", " https://archaeologydataservice.ac.uk/\n", " [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg]\n", + " [\"help@archaeologydataservice.ac.uk\", \"https:/...\n", " The ADS is an accredited digital repository fo...\n", + " eng\n", " [disciplinary]\n", - " 1837 results\n", - " 2020-05-20\n", + " {\"size\": \"1837 results\", \"updatedp\": \"2020-05-...\n", " 1996-10-01\n", " NaN\n", - " [1 Humanities and Social Sciences, 101 Ancient...\n", - " true\n", - " [Archived data, Audiovisual data, Databases, I...\n", + " [\"eng\"]\n", + " [{'name': '1 Humanities and Social Sciences', ...\n", + " https://archaeologydataservice.ac.uk/about/our...\n", + " [{'name': 'Archived data', 'scheme': 'parse'},...\n", " [dataProvider, serviceProvider]\n", " [FAIR, archaeology, cultural heritage, prehist...\n", - " [[Arts and Humanities Research Council, [AHRC]...\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", - " true\n", + " [{'institutionName': 'Arts and Humanities Rese...\n", + " [{\"policyName\": \"ADS Guides to good practice\",...\n", + " {\"databaseAccessType\": \"open\", \"databaseAcces...\n", + " [{\"databaseLicenseName\": \"CC\", \"databaseLicens...\n", + " [{\"dataAccessType\": \"open\", \"dataAccessRestric...\n", + " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...\n", + " restricted\n", + " [{\"dataUploadLicenseName\": \"Guidelines for Dep...\n", + " [\"other\"]\n", " yes\n", - " true\n", - " true\n", - " true\n", - " true\n", + " {\"api\": \"https://archaeologydataservice.ac.uk/...\n", + " [\"DOI\"]\n", + " https://archaeologydataservice.ac.uk/advice/te...\n", + " []\n", " unknown\n", " yes\n", - " true\n", - " true\n", - " true\n", + " [\"other\"]\n", + " [{\"metadataStandardName\": \"DataCite Metadata S...\n", + " {\"syndication\": \"https://archaeologydataservic...\n", " ADS is covered by Clarivate Data Citation Inde...\n", " 2012-07-23\n", - " 2021-06-11\n", + " 2021-09-02\n", " \n", " \n", "\n", "" ], "text/plain": [ - " openaire_id re3data_id \\\n", - "0 re3data_____::91780fe96da5ba32f804e43359c154ba r3d100000001 \n", - "1 re3data_____::cc3ea05c863cd49af75f7f54e0e86f09 r3d100000002 \n", - "2 re3data_____::a2f73fbe91311f4356d0d7957c441773 r3d100000004 \n", - "3 re3data_____::0394b97eb11f19785cbca1ec830429da r3d100000005 \n", - "4 re3data_____::a48f09c562b247a9919acfe195549b47 r3d100000006 \n", + " orgIdentifier repositoryName repositoryName.language \\\n", + "0 r3d100000001 Odum Institute Archive Dataverse eng \n", + "1 r3d100000002 Access to Archival Databases eng \n", + "2 r3d100000004 Datenbank Gesprochenes Deutsch deu \n", + "3 r3d100000005 UNC Dataverse eng \n", + "4 r3d100000006 Archaeology Data Service eng \n", "\n", - " repository_name \\\n", - "0 Odum Institute Archive Dataverse \n", - "1 Access to Archival Databases \n", - "2 Datenbank Gesprochenes Deutsch \n", - "3 UNC Dataverse \n", - "4 Archaeology Data Service \n", - "\n", - " additional_name \\\n", + " additionalName \\\n", "0 [] \n", - "1 [AAD] \n", - "2 [DGD, DGD2 (formerly), Database for Spoken Ger... \n", - "3 [University of North Carolina Dataverse] \n", - "4 [ADS] \n", + "1 [{'additionalName': 'AAD', 'additionalNameLang... \n", + "2 [{'additionalName': 'DGD', 'additionalNameLang... \n", + "3 [{'additionalName': 'University of North Carol... \n", + "4 [{'additionalName': 'ADS', 'additionalNameLang... \n", "\n", - " repository_url \\\n", + " repositoryURL \\\n", "0 https://dataverse.unc.edu/dataverse/odum \n", "1 https://aad.archives.gov/aad/ \n", "2 https://dgd.ids-mannheim.de/ \n", "3 https://dataverse.unc.edu/ \n", "4 https://archaeologydataservice.ac.uk/ \n", "\n", - " repository_id \\\n", + " repositoryIdentifier \\\n", "0 [] \n", "1 [RRID:SCR_010479, RRID:nlx_157752] \n", "2 [] \n", "3 [] \n", "4 [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] \n", "\n", - " description type \\\n", - "0 The Odum Institute Archive Dataverse contains ... [disciplinary] \n", - "1 You will find in the Access to Archival Databa... [disciplinary] \n", - "2 The \"Database for Spoken German (DGD)\" is a co... [disciplinary] \n", - "3 UNC Dataverse is an open-source repository sof... [institutional] \n", - "4 The ADS is an accredited digital repository fo... [disciplinary] \n", + " repositoryContact \\\n", + "0 [\"https://dataverse.unc.edu/dataverse/odum#\", ... \n", + "1 [\"https://www.archives.gov/contact\"] \n", + "2 [\"dgd@ids-mannheim.de\"] \n", + "3 [\"https://dataverse.unc.edu/\", \"odumarchive@un... \n", + "4 [\"help@archaeologydataservice.ac.uk\", \"https:/... \n", "\n", - " size update_date start_date \\\n", - "0 13 dataverses; 3.050 datasets 2020-12-04 NaN \n", - "1 NaN NaN 1985 \n", - "2 34 corpora 2020-02-03 2012 \n", - "3 186 dataverses; 25.272 studies; 229.442 files 2020-11-30 2011 \n", - "4 1837 results 2020-05-20 1996-10-01 \n", + " description description.language \\\n", + "0 The Odum Institute Archive Dataverse contains ... eng \n", + "1 You will find in the Access to Archival Databa... eng \n", + "2 The \"Database for Spoken German (DGD)\" is a co... eng \n", + "3 UNC Dataverse is an open-source repository sof... eng \n", + "4 The ADS is an accredited digital repository fo... eng \n", "\n", - " end_date subject \\\n", - "0 NaN [1 Humanities and Social Sciences, 111 Social ... \n", - "1 NaN [1 Humanities and Social Sciences, 102 History... \n", - "2 NaN [1 Humanities and Social Sciences, 104 Linguis... \n", - "3 NaN [1 Humanities and Social Sciences, 111 Social ... \n", - "4 NaN [1 Humanities and Social Sciences, 101 Ancient... \n", + " type size \\\n", + "0 [disciplinary] {\"size\": \"13 dataverses; 3.050 datasets\", \"upd... \n", + "1 [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n", + "2 [disciplinary] {\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"} \n", + "3 [institutional] {\"size\": \"186 dataverses; 25.272 studies; 229.... \n", + "4 [disciplinary] {\"size\": \"1837 results\", \"updatedp\": \"2020-05-... \n", "\n", - " mission_statement content_type \\\n", - "0 false [Databases, Plain text, Scientific and statist... \n", - "1 true [Images, Standard office documents, Structured... \n", - "2 true [Audiovisual data, Standard office documents, ... \n", - "3 true [Archived data, Plain text, Raw data, Scientif... \n", - "4 true [Archived data, Audiovisual data, Databases, I... \n", + " startDate endDate repositoryLanguage \\\n", + "0 NaN NaN [\"eng\"] \n", + "1 1985 NaN [\"eng\", \"spa\"] \n", + "2 2012 NaN [\"deu\"] \n", + "3 2011 NaN [\"eng\"] \n", + "4 1996-10-01 NaN [\"eng\"] \n", "\n", - " provider_type \\\n", + " subject \\\n", + "0 [{'name': '1 Humanities and Social Sciences', ... \n", + "1 [{'name': '1 Humanities and Social Sciences', ... \n", + "2 [{'name': '1 Humanities and Social Sciences', ... \n", + "3 [{'name': '1 Humanities and Social Sciences', ... \n", + "4 [{'name': '1 Humanities and Social Sciences', ... \n", + "\n", + " missionStatementURL \\\n", + "0 NaN \n", + "1 https://www.archives.gov/publications/general-... \n", + "2 https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... \n", + "3 https://odum.unc.edu/about/mission-vision/ \n", + "4 https://archaeologydataservice.ac.uk/about/our... \n", + "\n", + " contentType \\\n", + "0 [{'name': 'Databases', 'scheme': 'parse'}, {'n... \n", + "1 [{'name': 'Images', 'scheme': 'parse'}, {'name... \n", + "2 [{'name': 'Audiovisual data', 'scheme': 'parse... \n", + "3 [{'name': 'Archived data', 'scheme': 'parse'},... \n", + "4 [{'name': 'Archived data', 'scheme': 'parse'},... \n", + "\n", + " providerType \\\n", "0 [dataProvider] \n", "1 [dataProvider] \n", "2 [dataProvider, serviceProvider] \n", @@ -403,47 +414,103 @@ "3 [FAIR, census, demographic survey, demography,... \n", "4 [FAIR, archaeology, cultural heritage, prehist... \n", "\n", - " institution policy database_access \\\n", - "0 [[Odum Institute for Research in Social Scienc... true true \n", - "1 [[The U.S. National Archives and Records Admin... true true \n", - "2 [[Institut für Deutsche Sprache, Archiv für Ge... true true \n", - "3 [[Odum Institute for Research in Social Scienc... true true \n", - "4 [[Arts and Humanities Research Council, [AHRC]... true true \n", + " institution \\\n", + "0 [{'institutionName': 'Odum Institute for Resea... \n", + "1 [{'institutionName': 'The U.S. National Archiv... \n", + "2 [{'institutionName': 'Institut für Deutsche Sp... \n", + "3 [{'institutionName': 'Odum Institute for Resea... \n", + "4 [{'institutionName': 'Arts and Humanities Rese... \n", "\n", - " database_license data_access data_license data_upload data_upload_license \\\n", - "0 true true true true false \n", - "1 false true true true false \n", - "2 false true true true false \n", - "3 false true true true true \n", - "4 true true true true true \n", + " policy \\\n", + "0 [{\"policyName\": \"Collection Development Policy... \n", + "1 [{\"policyName\": \"Contribution Policy\", \"policy... \n", + "2 [{\"policyName\": \"Erfurter Aufruf zur Sicherung... \n", + "3 [{\"policyName\": \"Collection Development Policy... \n", + "4 [{\"policyName\": \"ADS Guides to good practice\",... \n", "\n", - " software versioning api pid_system citation_guideline_url aid_system \\\n", - "0 true NaN false true true true \n", - "1 true no true true true true \n", - "2 true yes false true true true \n", - "3 true yes true true true true \n", - "4 true yes true true true true \n", + " databaseAccess \\\n", + "0 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", + "1 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", + "2 {\"databaseAccessType\": \"restricted\", \"databas... \n", + "3 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", + "4 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", "\n", - " enhanced_publication quality_management certificate metadata_standard \\\n", - "0 unknown yes true true \n", - "1 unknown unknown false false \n", - "2 unknown unknown true false \n", - "3 unknown yes false true \n", - "4 unknown yes true true \n", + " databaseLicense \\\n", + "0 [{\"databaseLicenseName\": \"CC0\", \"databaseLicen... \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [{\"databaseLicenseName\": \"CC\", \"databaseLicens... \n", "\n", - " syndication remarks entry_date \\\n", - "0 false Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 \n", - "1 true NaN 2012-07-04 \n", - "2 false NaN 2012-07-20 \n", - "3 false The Odum Institute houses one of the oldest an... 2012-07-23 \n", - "4 true ADS is covered by Clarivate Data Citation Inde... 2012-07-23 \n", + " dataAccess \\\n", + "0 [{\"dataAccessType\": \"embargoed\", \"dataAccessRe... \n", + "1 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", + "2 [{\"dataAccessType\": \"restricted\", \"dataAccessR... \n", + "3 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", + "4 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", "\n", - " last_update \n", - "0 2021-07-06 \n", - "1 2021-05-25 \n", - "2 2020-08-27 \n", - "3 2020-11-30 \n", - "4 2021-06-11 " + " dataLicense dataUploadType \\\n", + "0 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", + "1 [{\"dataLicenseName\": \"Copyrights\", \"dataLicens... restricted \n", + "2 [{\"dataLicenseName\": \"other\", \"dataLicenseURL\"... restricted \n", + "3 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", + "4 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", + "\n", + " dataUploadLicense software \\\n", + "0 [] [\"DataVerse\"] \n", + "1 [] [\"unknown\"] \n", + "2 [] [\"other\"] \n", + "3 [{\"dataUploadLicenseName\": \"Data Deposit Form\"... [\"DataVerse\"] \n", + "4 [{\"dataUploadLicenseName\": \"Guidelines for Dep... [\"other\"] \n", + "\n", + " versioning api \\\n", + "0 NaN {} \n", + "1 no {\"api\": \"https://www.archives.gov/developer#to... \n", + "2 yes {} \n", + "3 yes {\"api\": \"https://guides.dataverse.org/en/lates... \n", + "4 yes {\"api\": \"https://archaeologydataservice.ac.uk/... \n", + "\n", + " pidSystem \\\n", + "0 [\"DOI\"] \n", + "1 [\"none\"] \n", + "2 [\"none\"] \n", + "3 [\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"] \n", + "4 [\"DOI\"] \n", + "\n", + " citationGuidelineURL aidSystem \\\n", + "0 NaN [] \n", + "1 https://aad.archives.gov/aad/help/getting-star... [] \n", + "2 http://agd.ids-mannheim.de/konditionen.shtml [] \n", + "3 https://dataverse.org/best-practices/data-cita... [] \n", + "4 https://archaeologydataservice.ac.uk/advice/te... [] \n", + "\n", + " enhancedPublication qualityManagement certificate \\\n", + "0 unknown yes [\"other\"] \n", + "1 unknown unknown [] \n", + "2 unknown unknown [\"RatSWD\"] \n", + "3 unknown yes [] \n", + "4 unknown yes [\"other\"] \n", + "\n", + " metadataStandard \\\n", + "0 [{\"metadataStandardName\": \"DDI - Data Document... \n", + "1 [] \n", + "2 [] \n", + "3 [{\"metadataStandardName\": \"DDI - Data Document... \n", + "4 [{\"metadataStandardName\": \"DataCite Metadata S... \n", + "\n", + " syndication \\\n", + "0 {} \n", + "1 {\"syndication\": \"http://www.archives.gov/socia... \n", + "2 {} \n", + "3 {} \n", + "4 {\"syndication\": \"https://archaeologydataservic... \n", + "\n", + " remarks entryDate lastUpdate \n", + "0 Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06 \n", + "1 NaN 2012-07-04 2021-05-25 \n", + "2 NaN 2012-07-20 2020-08-27 \n", + "3 UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-08-11 \n", + "4 ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02 " ] }, "execution_count": 2, @@ -452,16 +519,17 @@ } ], "source": [ - "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", + "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t',\n", " converters={'subject': ast.literal_eval,\n", " 'keyword': ast.literal_eval,\n", - " 'additional_name': ast.literal_eval,\n", - " 'repository_id': ast.literal_eval,\n", + " 'additionalName': ast.literal_eval,\n", + " 'repositoryIdentifier': ast.literal_eval,\n", " 'type': ast.literal_eval,\n", - " 'content_type': ast.literal_eval,\n", - " 'provider_type': ast.literal_eval,\n", + " 'contentType': ast.literal_eval,\n", + " 'providerType': ast.literal_eval,\n", " 'institution': ast.literal_eval\n", " })\n", + "\n", "re3data_df.head()" ] }, @@ -473,16 +541,17 @@ { "data": { "text/plain": [ - "Index(['openaire_id', 're3data_id', 'repository_name', 'additional_name',\n", - " 'repository_url', 'repository_id', 'description', 'type', 'size',\n", - " 'update_date', 'start_date', 'end_date', 'subject', 'mission_statement',\n", - " 'content_type', 'provider_type', 'keyword', 'institution', 'policy',\n", - " 'database_access', 'database_license', 'data_access', 'data_license',\n", - " 'data_upload', 'data_upload_license', 'software', 'versioning', 'api',\n", - " 'pid_system', 'citation_guideline_url', 'aid_system',\n", - " 'enhanced_publication', 'quality_management', 'certificate',\n", - " 'metadata_standard', 'syndication', 'remarks', 'entry_date',\n", - " 'last_update'],\n", + "Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',\n", + " 'additionalName', 'repositoryURL', 'repositoryIdentifier',\n", + " 'repositoryContact', 'description', 'description.language', 'type',\n", + " 'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',\n", + " 'missionStatementURL', 'contentType', 'providerType', 'keyword',\n", + " 'institution', 'policy', 'databaseAccess', 'databaseLicense',\n", + " 'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',\n", + " 'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',\n", + " 'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',\n", + " 'metadataStandard', 'syndication', 'remarks', 'entryDate',\n", + " 'lastUpdate'],\n", " dtype='object')" ] }, @@ -536,298 +605,338 @@ " \n", " \n", " \n", - " openaire_id\n", - " re3data_id\n", - " repository_name\n", - " additional_name\n", - " repository_url\n", - " repository_id\n", + " orgIdentifier\n", + " repositoryName\n", + " repositoryName.language\n", + " additionalName\n", + " repositoryURL\n", + " repositoryIdentifier\n", + " repositoryContact\n", " description\n", + " description.language\n", " type\n", " size\n", - " update_date\n", - " start_date\n", - " end_date\n", + " startDate\n", + " endDate\n", + " repositoryLanguage\n", " subject\n", - " mission_statement\n", - " content_type\n", - " provider_type\n", + " missionStatementURL\n", + " contentType\n", + " providerType\n", " keyword\n", " institution\n", " policy\n", - " database_access\n", - " database_license\n", - " data_access\n", - " data_license\n", - " data_upload\n", - " data_upload_license\n", + " databaseAccess\n", + " databaseLicense\n", + " dataAccess\n", + " dataLicense\n", + " dataUploadType\n", + " dataUploadLicense\n", " software\n", " versioning\n", " api\n", - " pid_system\n", - " citation_guideline_url\n", - " aid_system\n", - " enhanced_publication\n", - " quality_management\n", + " pidSystem\n", + " citationGuidelineURL\n", + " aidSystem\n", + " enhancedPublication\n", + " qualityManagement\n", " certificate\n", - " metadata_standard\n", + " metadataStandard\n", " syndication\n", " remarks\n", - " entry_date\n", - " last_update\n", + " entryDate\n", + " lastUpdate\n", " \n", " \n", " \n", " \n", " count\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2137\n", - " 2686\n", - " 829\n", - " 2707\n", - " 2677\n", - " 1260\n", - " 1248\n", - " 1762\n", - " 146\n", - " 2685\n", - " 2707\n", - " 2700\n", - " 2699\n", - " 2699\n", - " 2706\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 1292\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2704\n", - " 2705\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 1637\n", - " 2707\n", - " 2707\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 2170\n", + " 2716\n", + " 863\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 2710\n", + " 2739\n", + " 1776\n", + " 157\n", + " 2739\n", + " 2720\n", + " 2318\n", + " 2732\n", + " 2735\n", + " 2732\n", + " 2738\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 2711\n", + " 2739\n", + " 2739\n", + " 1316\n", + " 2739\n", + " 2739\n", + " 1512\n", + " 2739\n", + " 2737\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 2739\n", + " 1674\n", + " 2739\n", + " 2739\n", " \n", " \n", " unique\n", - " 2707\n", - " 2707\n", - " 2704\n", - " 2128\n", - " 2683\n", - " 828\n", - " 2705\n", + " 2739\n", + " 2736\n", + " 19\n", + " 2161\n", + " 2713\n", + " 863\n", + " 2459\n", + " 2737\n", + " 6\n", " 8\n", - " 1233\n", - " 687\n", - " 351\n", - " 79\n", - " 1367\n", - " 2\n", - " 1323\n", + " 1289\n", + " 352\n", + " 80\n", + " 107\n", + " 1388\n", + " 2249\n", + " 1337\n", " 4\n", - " 2474\n", - " 2685\n", + " 2503\n", + " 2719\n", + " 2319\n", + " 12\n", + " 375\n", + " 145\n", + " 2263\n", + " 3\n", + " 681\n", + " 23\n", " 2\n", - " 1\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 1\n", - " 1\n", + " 1146\n", + " 29\n", + " 1321\n", + " 12\n", " 3\n", " 3\n", - " 2\n", - " 2\n", - " 2\n", - " 1632\n", - " 1259\n", - " 814\n", + " 14\n", + " 172\n", + " 563\n", + " 1656\n", + " 1275\n", + " 740\n", " \n", " \n", " top\n", - " re3data_____::4cea5a5ea78542232a51190879756661\n", - " r3d100011254\n", - " EarthChem Library\n", - " [IRIS]\n", - " http://www.jcvi.org/cms/home/\n", - " [doi:10.17171/1-6]\n", - " The repository is no longer available. >>>!!!<...\n", + " r3d100000001\n", + " Språkbanken\n", + " eng\n", + " [{'additionalName': 'MPC', 'additionalNameLang...\n", + " http://icgem.gfz-potsdam.de/home\n", + " [RRID:SCR_010479, RRID:nlx_157752]\n", + " []\n", + " The National Archives and Records Administrati...\n", + " eng\n", " [disciplinary]\n", - " 2 datasets\n", - " 2019-05-15\n", + " {\"size\": \"\", \"updatedp\": \"\"}\n", " 2008\n", " 2015\n", - " [1 Humanities and Social Sciences, 2 Life Scie...\n", - " true\n", - " [Standard office documents]\n", + " [\"eng\"]\n", + " [{'name': '1 Humanities and Social Sciences', ...\n", + " https://learn.scholarsportal.info/all-guides/d...\n", + " [{'name': 'Standard office documents', 'scheme...\n", " [dataProvider]\n", " [multidisciplinary]\n", - " [[National Center for Biotechnology Informatio...\n", - " true\n", - " true\n", - " false\n", - " true\n", - " true\n", - " true\n", - " false\n", - " true\n", + " [{'institutionName': 'National Center for Biot...\n", + " [][]\n", + " {\"databaseAccessType\": \"open\", \"databaseAcces...\n", + " []\n", + " [{\"dataAccessType\": \"open\", \"dataAccessRestric...\n", + " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...\n", + " restricted\n", + " []\n", + " [\"unknown\"]\n", " yes\n", - " false\n", - " true\n", - " true\n", - " true\n", + " {}\n", + " [\"none\"]\n", + " https://dataverse.org/best-practices/data-cita...\n", + " []\n", " unknown\n", " yes\n", - " false\n", - " false\n", - " false\n", - " The National Institute of Standards and Techno...\n", + " []\n", + " []\n", + " {}\n", + " is covered by Elsevier.\n", " 2016-05-10\n", - " 2021-07-02\n", + " 2021-09-03\n", " \n", " \n", " freq\n", " 1\n", + " 2\n", + " 2554\n", + " 2\n", + " 2\n", " 1\n", + " 202\n", " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", - " 1713\n", - " 6\n", - " 15\n", + " 2723\n", + " 1733\n", + " 1450\n", " 92\n", " 11\n", - " 222\n", - " 2286\n", + " 2063\n", + " 226\n", + " 14\n", " 30\n", - " 1748\n", - " 190\n", + " 1771\n", + " 193\n", " 6\n", - " 2394\n", - " 2707\n", - " 2134\n", - " 2701\n", - " 2693\n", - " 2681\n", - " 1988\n", - " 2227\n", - " 1086\n", - " 1485\n", - " 2448\n", - " 2707\n", - " 2707\n", - " 1592\n", - " 1492\n", - " 2481\n", - " 1655\n", - " 2129\n", - " 3\n", + " 312\n", + " 2571\n", + " 2159\n", + " 1269\n", + " 64\n", + " 1793\n", + " 2013\n", + " 1226\n", + " 1108\n", + " 1498\n", + " 1361\n", + " 72\n", + " 2155\n", + " 1608\n", + " 1515\n", + " 2509\n", + " 1669\n", + " 2162\n", + " 14\n", " 20\n", - " 47\n", + " 137\n", " \n", " \n", "\n", "" ], "text/plain": [ - " openaire_id re3data_id \\\n", - "count 2707 2707 \n", - "unique 2707 2707 \n", - "top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n", - "freq 1 1 \n", + " orgIdentifier repositoryName repositoryName.language \\\n", + "count 2739 2739 2739 \n", + "unique 2739 2736 19 \n", + "top r3d100000001 Språkbanken eng \n", + "freq 1 2 2554 \n", "\n", - " repository_name additional_name repository_url \\\n", - "count 2707 2137 2686 \n", - "unique 2704 2128 2683 \n", - "top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n", - "freq 2 2 2 \n", + " additionalName \\\n", + "count 2170 \n", + "unique 2161 \n", + "top [{'additionalName': 'MPC', 'additionalNameLang... \n", + "freq 2 \n", "\n", - " repository_id description \\\n", - "count 829 2707 \n", - "unique 828 2705 \n", - "top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n", - "freq 2 2 \n", + " repositoryURL repositoryIdentifier \\\n", + "count 2716 863 \n", + "unique 2713 863 \n", + "top http://icgem.gfz-potsdam.de/home [RRID:SCR_010479, RRID:nlx_157752] \n", + "freq 2 1 \n", "\n", - " type size update_date start_date end_date \\\n", - "count 2677 1260 1248 1762 146 \n", - "unique 8 1233 687 351 79 \n", - "top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n", - "freq 1713 6 15 92 11 \n", + " repositoryContact description \\\n", + "count 2739 2739 \n", + "unique 2459 2737 \n", + "top [] The National Archives and Records Administrati... \n", + "freq 202 2 \n", "\n", - " subject mission_statement \\\n", - "count 2685 2707 \n", - "unique 1367 2 \n", - "top [1 Humanities and Social Sciences, 2 Life Scie... true \n", - "freq 222 2286 \n", + " description.language type size \\\n", + "count 2739 2710 2739 \n", + "unique 6 8 1289 \n", + "top eng [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n", + "freq 2723 1733 1450 \n", "\n", - " content_type provider_type keyword \\\n", - "count 2700 2699 2699 \n", - "unique 1323 4 2474 \n", - "top [Standard office documents] [dataProvider] [multidisciplinary] \n", - "freq 30 1748 190 \n", + " startDate endDate repositoryLanguage \\\n", + "count 1776 157 2739 \n", + "unique 352 80 107 \n", + "top 2008 2015 [\"eng\"] \n", + "freq 92 11 2063 \n", + "\n", + " subject \\\n", + "count 2720 \n", + "unique 1388 \n", + "top [{'name': '1 Humanities and Social Sciences', ... \n", + "freq 226 \n", + "\n", + " missionStatementURL \\\n", + "count 2318 \n", + "unique 2249 \n", + "top https://learn.scholarsportal.info/all-guides/d... \n", + "freq 14 \n", + "\n", + " contentType providerType \\\n", + "count 2732 2735 \n", + "unique 1337 4 \n", + "top [{'name': 'Standard office documents', 'scheme... [dataProvider] \n", + "freq 30 1771 \n", + "\n", + " keyword \\\n", + "count 2732 \n", + "unique 2503 \n", + "top [multidisciplinary] \n", + "freq 193 \n", "\n", " institution policy \\\n", - "count 2706 2707 \n", - "unique 2685 2 \n", - "top [[National Center for Biotechnology Informatio... true \n", - "freq 6 2394 \n", + "count 2738 2739 \n", + "unique 2719 2319 \n", + "top [{'institutionName': 'National Center for Biot... [][] \n", + "freq 6 312 \n", "\n", - " database_access database_license data_access data_license data_upload \\\n", - "count 2707 2707 2707 2707 2707 \n", - "unique 1 2 2 2 2 \n", - "top true false true true true \n", - "freq 2707 2134 2701 2693 2681 \n", + " databaseAccess databaseLicense \\\n", + "count 2739 2739 \n", + "unique 12 375 \n", + "top {\"databaseAccessType\": \"open\", \"databaseAcces... [] \n", + "freq 2571 2159 \n", "\n", - " data_upload_license software versioning api pid_system \\\n", - "count 2707 2707 1292 2707 2707 \n", - "unique 2 2 2 2 2 \n", - "top false true yes false true \n", - "freq 1988 2227 1086 1485 2448 \n", + " dataAccess \\\n", + "count 2739 \n", + "unique 145 \n", + "top [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", + "freq 1269 \n", "\n", - " citation_guideline_url aid_system enhanced_publication \\\n", - "count 2707 2707 2704 \n", - "unique 1 1 3 \n", - "top true true unknown \n", - "freq 2707 2707 1592 \n", + " dataLicense dataUploadType \\\n", + "count 2739 2711 \n", + "unique 2263 3 \n", + "top [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", + "freq 64 1793 \n", "\n", - " quality_management certificate metadata_standard syndication \\\n", - "count 2705 2707 2707 2707 \n", - "unique 3 2 2 2 \n", - "top yes false false false \n", - "freq 1492 2481 1655 2129 \n", + " dataUploadLicense software versioning api pidSystem \\\n", + "count 2739 2739 1316 2739 2739 \n", + "unique 681 23 2 1146 29 \n", + "top [] [\"unknown\"] yes {} [\"none\"] \n", + "freq 2013 1226 1108 1498 1361 \n", "\n", - " remarks entry_date \\\n", - "count 1637 2707 \n", - "unique 1632 1259 \n", - "top The National Institute of Standards and Techno... 2016-05-10 \n", - "freq 3 20 \n", + " citationGuidelineURL aidSystem \\\n", + "count 1512 2739 \n", + "unique 1321 12 \n", + "top https://dataverse.org/best-practices/data-cita... [] \n", + "freq 72 2155 \n", "\n", - " last_update \n", - "count 2707 \n", - "unique 814 \n", - "top 2021-07-02 \n", - "freq 47 " + " enhancedPublication qualityManagement certificate metadataStandard \\\n", + "count 2737 2739 2739 2739 \n", + "unique 3 3 14 172 \n", + "top unknown yes [] [] \n", + "freq 1608 1515 2509 1669 \n", + "\n", + " syndication remarks entryDate lastUpdate \n", + "count 2739 1674 2739 2739 \n", + "unique 563 1656 1275 740 \n", + "top {} is covered by Elsevier. 2016-05-10 2021-09-03 \n", + "freq 2162 14 20 137 " ] }, "execution_count": 5, @@ -847,45 +956,47 @@ { "data": { "text/plain": [ - "openaire_id 0\n", - "re3data_id 0\n", - "repository_name 0\n", - "additional_name 570\n", - "repository_url 21\n", - "repository_id 1878\n", - "description 0\n", - "type 30\n", - "size 1447\n", - "update_date 1459\n", - "start_date 945\n", - "end_date 2561\n", - "subject 22\n", - "mission_statement 0\n", - "content_type 7\n", - "provider_type 8\n", - "keyword 8\n", - "institution 1\n", - "policy 0\n", - "database_access 0\n", - "database_license 0\n", - "data_access 0\n", - "data_license 0\n", - "data_upload 0\n", - "data_upload_license 0\n", - "software 0\n", - "versioning 1415\n", - "api 0\n", - "pid_system 0\n", - "citation_guideline_url 0\n", - "aid_system 0\n", - "enhanced_publication 3\n", - "quality_management 2\n", - "certificate 0\n", - "metadata_standard 0\n", - "syndication 0\n", - "remarks 1070\n", - "entry_date 0\n", - "last_update 0\n", + "orgIdentifier 0\n", + "repositoryName 0\n", + "repositoryName.language 0\n", + "additionalName 569\n", + "repositoryURL 23\n", + "repositoryIdentifier 1876\n", + "repositoryContact 0\n", + "description 0\n", + "description.language 0\n", + "type 29\n", + "size 0\n", + "startDate 963\n", + "endDate 2582\n", + "repositoryLanguage 0\n", + "subject 19\n", + "missionStatementURL 421\n", + "contentType 7\n", + "providerType 4\n", + "keyword 7\n", + "institution 1\n", + "policy 0\n", + "databaseAccess 0\n", + "databaseLicense 0\n", + "dataAccess 0\n", + "dataLicense 0\n", + "dataUploadType 28\n", + "dataUploadLicense 0\n", + "software 0\n", + "versioning 1423\n", + "api 0\n", + "pidSystem 0\n", + "citationGuidelineURL 1227\n", + "aidSystem 0\n", + "enhancedPublication 2\n", + "qualityManagement 0\n", + "certificate 0\n", + "metadataStandard 0\n", + "syndication 0\n", + "remarks 1065\n", + "entryDate 0\n", + "lastUpdate 0\n", "dtype: int64" ] }, @@ -906,12 +1017,23 @@ { "data": { "text/plain": [ - "array(['Databases', 'Plain text',\n", - " 'Scientific and statistical data formats',\n", - " 'Standard office documents', 'other', 'Images', 'Structured text',\n", - " 'Audiovisual data', 'Archived data', 'Raw data',\n", - " 'Software applications', 'Source code', 'Structured graphics',\n", - " 'Configuration data', 'Networkbased data', nan], dtype=object)" + "contentType\n", + "Archived data 658\n", + "Audiovisual data 542\n", + "Configuration data 79\n", + "Databases 586\n", + "Images 1378\n", + "Networkbased data 153\n", + "Plain text 1158\n", + "Raw data 1197\n", + "Scientific and statistical data formats 1685\n", + "Software applications 456\n", + "Source code 209\n", + "Standard office documents 1684\n", + "Structured graphics 917\n", + "Structured text 848\n", + "other 962\n", + "dtype: int64" ] }, "execution_count": 7, @@ -920,7 +1042,8 @@ } ], "source": [ - "re3data_df.content_type.explode().unique()" + "types = re3data_df.contentType.explode().apply(lambda x: x['name'] if x is not np.nan else np.nan)\n", + "pd.DataFrame(types).groupby('contentType').size()" ] }, { @@ -931,7 +1054,10 @@ { "data": { "text/plain": [ - "array(['dataProvider', 'serviceProvider', nan], dtype=object)" + "providerType\n", + "dataProvider 2491\n", + "serviceProvider 963\n", + "dtype: int64" ] }, "execution_count": 8, @@ -940,8 +1066,15 @@ } ], "source": [ - "re3data_df.provider_type.explode().unique()" + "pd.DataFrame(re3data_df.providerType.explode()).groupby('providerType').size()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb index d7eb22a..394152e 100644 --- a/notebooks/01.2-exploration-opendoar.ipynb +++ b/notebooks/01.2-exploration-opendoar.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -54,241 +54,283 @@ " \n", " \n", " \n", - " openaire_id\n", - " opendoar_id\n", - " repository_name\n", - " additional_name\n", - " repository_url\n", - " description\n", - " type\n", - " update_date\n", - " start_date\n", - " subject\n", - " content_type\n", - " institution\n", - " metadata_policy\n", - " data_policy\n", - " submission_policy\n", - " content_policy\n", - " software\n", - " api\n", + " system_metadata.id\n", + " repository_metadata.name\n", + " repository_metadata.alternativename\n", + " repository_metadata.url\n", + " repository_metadata.description\n", + " repository_metadata.type\n", + " repository_metadata.content_languages\n", + " system_metadata.date_modified\n", + " system_metadata.date_created\n", + " repository_metadata.content_subjects\n", + " repository_metadata.content_types\n", + " organization\n", + " policy_urls\n", + " repository_metadata.software\n", + " repository_metadata.oai_url\n", + " system_metadata.publicly_visible\n", + " repository_metadata.repository_status\n", + " repository_metadata.fulltext_record_count\n", + " repository_metadata.metadata_record_count\n", " \n", " \n", " \n", " \n", " 0\n", - " opendoar____::38b3eff8baf56627478ec76a704e9b52\n", + " 175\n", + " {\"name\": \"hku theses online\", \"language\": \"en\"}\n", + " []\n", + " http://hub.hku.hk/handle/10722/1057\n", + " this is an institutional repository providing ...\n", + " institutional\n", + " [\"zh\", \"en\"]\n", + " 2021-03-25 10:16:18\n", + " 2005-12-21 12:44:08\n", + " [\"multidisciplinary\"]\n", + " [bibliographic_references, theses_and_disserta...\n", + " [{'name': 'university of hong kong', 'alternat...\n", + " []\n", + " {\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap...\n", + " NaN\n", + " yes\n", + " fully_functional\n", + " NaN\n", + " 11850.0\n", + " \n", + " \n", + " 1\n", + " 64\n", + " {\"name\": \"research support scheme - central eu...\n", + " []\n", + " http://rss.archives.ceu.hu/\n", + " this is an institutional repository collecting...\n", + " institutional\n", + " [\"cs\", \"en\", \"hu\", \"ru\"]\n", + " 2021-03-25 09:48:31\n", + " 2006-01-04 14:59:30\n", + " [\"multidisciplinary\"]\n", + " [unpub_reports_and_working_papers]\n", + " [{'name': 'central european university', 'alte...\n", + " []\n", + " {\"name\": \"eprints\", \"version\": \"2.2.1\"}\n", + " http://rss.archives.ceu.hu/perl/oai2\n", + " yes\n", + " fully_functional\n", + " NaN\n", + " 164.0\n", + " \n", + " \n", + " 2\n", + " 151\n", + " {\"name\": \"cadmus, eui research repository\", \"l...\n", + " []\n", + " http://cadmus.eui.eu/\n", + " cadmus is the name of the eui research reposit...\n", + " institutional\n", + " [\"nl\", \"en\", \"fr\", \"de\", \"it\"]\n", + " 2021-09-13 13:35:36\n", + " 2006-01-04 12:07:07\n", + " [\"history and archaeology\", \"multidisciplinary...\n", + " [journal_articles, theses_and_dissertations, u...\n", + " [{'name': 'european university institute', 'al...\n", + " [{\"policy_url\": \"https://www.eui.eu/research/e...\n", + " {\"name\": \"dspace\", \"version\": \"5.2\"}\n", + " http://cadmus.eui.eu/oai/request\n", + " yes\n", + " fully_functional\n", + " 3867.0\n", + " 24869.0\n", + " \n", + " \n", + " 3\n", + " 105\n", + " {\"name\": \"document server@uhasselt\", \"language...\n", + " []\n", + " https://doclib.uhasselt.be/dspace/\n", + " this site is a university repository providing...\n", + " institutional\n", + " [\"nl\", \"en\", \"fr\", \"de\"]\n", + " 2021-04-16 15:23:52\n", + " 2006-01-24 15:46:44\n", + " [\"multidisciplinary\"]\n", + " [journal_articles, conference_and_workshop_pap...\n", + " [{'name': 'uhasselt', 'alternativeName': 'hass...\n", + " []\n", + " {\"name\": \"dspace\", \"version\": \"1.7.2\"}\n", + " http://doclib.uhasselt.be/dspace-oai/request\n", + " yes\n", + " fully_functional\n", + " 0.0\n", + " 27376.0\n", + " \n", + " \n", + " 4\n", " 101\n", - " utrecht university repository\n", + " {\"name\": \"utrecht university repository\", \"lan...\n", " []\n", " http://dspace.library.uu.nl\n", " this site is a university repository providing...\n", " institutional\n", + " [\"nl\", \"en\"]\n", " 2021-04-16 15:22:03\n", " 2006-01-13 12:55:13\n", - " [multidisciplinary]\n", + " [\"multidisciplinary\"]\n", " [journal_articles, conference_and_workshop_pap...\n", - " [[university of utrecht, [universiteit utrecht...\n", - " True\n", - " True\n", - " False\n", - " True\n", - " dspace\n", - " true\n", - " \n", - " \n", - " 1\n", - " opendoar____::2b44928ae11fb9384c4cf38708677c48\n", - " 115\n", - " dspace at indian institute of management kozhi...\n", - " [dspace@iimk]\n", - " http://dspace.iimk.ac.in/\n", - " this site is a subject based university reposi...\n", - " institutional\n", - " 2021-02-18 17:36:43\n", - " 2006-01-04 11:54:34\n", - " [ecology and environment, social sciences gene...\n", - " [journal_articles, conference_and_workshop_pap...\n", - " [[indian institute of management kozhikode, [i...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " dspace 4.1\n", - " true\n", - " \n", - " \n", - " 2\n", - " opendoar____::3416a75f4cea9109507cacd8e2f2aefc\n", - " 41\n", - " caltech engineering and science online\n", + " [{'name': 'university of utrecht', 'alternativ...\n", " []\n", - " http://calteches.library.caltech.edu/\n", - " the caltech archives holds approximately 220 c...\n", - " institutional\n", - " 2021-02-18 17:36:28\n", - " 2006-01-04 14:47:04\n", - " [biology and biochemistry, chemistry and chemi...\n", - " [journal_articles, conference_and_workshop_pap...\n", - " [[california institute of technology, [caltech...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " eprints 3.1.3\n", - " true\n", - " \n", - " \n", - " 3\n", - " opendoar____::07e1cd7dca89a1678042477183b7ac3f\n", - " 119\n", - " dcu online research access service\n", - " [doras]\n", - " http://doras.dcu.ie/\n", - " this site is an institutional repository provi...\n", - " institutional\n", - " 2021-02-18 17:36:44\n", - " 2006-01-04 11:15:19\n", - " [multidisciplinary]\n", - " [journal_articles, conference_and_workshop_pap...\n", - " [[dublin city university, [dcu], ie, [], , htt...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " eprints 3.0.5\n", - " true\n", - " \n", - " \n", - " 4\n", - " opendoar____::d1f491a404d6854880943e5c3cd9ca25\n", - " 129\n", - " earth-prints repository\n", - " []\n", - " http://www.earth-prints.org/\n", - " a subject based repository providing open acce...\n", - " disciplinary\n", - " 2021-04-19 08:28:38\n", - " 2006-01-30 16:43:11\n", - " [earth and planetary sciences]\n", - " [journal_articles, conference_and_workshop_pap...\n", - " [[istituto nazionale di geofisica e vulcanolog...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " dspace 5.8.1-snapshot\n", - " true\n", + " {\"name\": \"dspace\", \"version\": \"\"}\n", + " https://dspace.library.uu.nl/oai/request\n", + " yes\n", + " fully_functional\n", + " 1686.0\n", + " 185637.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " openaire_id opendoar_id \\\n", - "0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 \n", - "1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 \n", - "2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 \n", - "3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 \n", - "4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 \n", + " system_metadata.id repository_metadata.name \\\n", + "0 175 {\"name\": \"hku theses online\", \"language\": \"en\"} \n", + "1 64 {\"name\": \"research support scheme - central eu... \n", + "2 151 {\"name\": \"cadmus, eui research repository\", \"l... \n", + "3 105 {\"name\": \"document server@uhasselt\", \"language... \n", + "4 101 {\"name\": \"utrecht university repository\", \"lan... \n", "\n", - " repository_name additional_name \\\n", - "0 utrecht university repository [] \n", - "1 dspace at indian institute of management kozhi... [dspace@iimk] \n", - "2 caltech engineering and science online [] \n", - "3 dcu online research access service [doras] \n", - "4 earth-prints repository [] \n", + " repository_metadata.alternativename repository_metadata.url \\\n", + "0 [] http://hub.hku.hk/handle/10722/1057 \n", + "1 [] http://rss.archives.ceu.hu/ \n", + "2 [] http://cadmus.eui.eu/ \n", + "3 [] https://doclib.uhasselt.be/dspace/ \n", + "4 [] http://dspace.library.uu.nl \n", "\n", - " repository_url \\\n", - "0 http://dspace.library.uu.nl \n", - "1 http://dspace.iimk.ac.in/ \n", - "2 http://calteches.library.caltech.edu/ \n", - "3 http://doras.dcu.ie/ \n", - "4 http://www.earth-prints.org/ \n", + " repository_metadata.description repository_metadata.type \\\n", + "0 this is an institutional repository providing ... institutional \n", + "1 this is an institutional repository collecting... institutional \n", + "2 cadmus is the name of the eui research reposit... institutional \n", + "3 this site is a university repository providing... institutional \n", + "4 this site is a university repository providing... institutional \n", "\n", - " description type \\\n", - "0 this site is a university repository providing... institutional \n", - "1 this site is a subject based university reposi... institutional \n", - "2 the caltech archives holds approximately 220 c... institutional \n", - "3 this site is an institutional repository provi... institutional \n", - "4 a subject based repository providing open acce... disciplinary \n", + " repository_metadata.content_languages system_metadata.date_modified \\\n", + "0 [\"zh\", \"en\"] 2021-03-25 10:16:18 \n", + "1 [\"cs\", \"en\", \"hu\", \"ru\"] 2021-03-25 09:48:31 \n", + "2 [\"nl\", \"en\", \"fr\", \"de\", \"it\"] 2021-09-13 13:35:36 \n", + "3 [\"nl\", \"en\", \"fr\", \"de\"] 2021-04-16 15:23:52 \n", + "4 [\"nl\", \"en\"] 2021-04-16 15:22:03 \n", "\n", - " update_date start_date \\\n", - "0 2021-04-16 15:22:03 2006-01-13 12:55:13 \n", - "1 2021-02-18 17:36:43 2006-01-04 11:54:34 \n", - "2 2021-02-18 17:36:28 2006-01-04 14:47:04 \n", - "3 2021-02-18 17:36:44 2006-01-04 11:15:19 \n", - "4 2021-04-19 08:28:38 2006-01-30 16:43:11 \n", + " system_metadata.date_created \\\n", + "0 2005-12-21 12:44:08 \n", + "1 2006-01-04 14:59:30 \n", + "2 2006-01-04 12:07:07 \n", + "3 2006-01-24 15:46:44 \n", + "4 2006-01-13 12:55:13 \n", "\n", - " subject \\\n", - "0 [multidisciplinary] \n", - "1 [ecology and environment, social sciences gene... \n", - "2 [biology and biochemistry, chemistry and chemi... \n", - "3 [multidisciplinary] \n", - "4 [earth and planetary sciences] \n", + " repository_metadata.content_subjects \\\n", + "0 [\"multidisciplinary\"] \n", + "1 [\"multidisciplinary\"] \n", + "2 [\"history and archaeology\", \"multidisciplinary... \n", + "3 [\"multidisciplinary\"] \n", + "4 [\"multidisciplinary\"] \n", "\n", - " content_type \\\n", - "0 [journal_articles, conference_and_workshop_pap... \n", - "1 [journal_articles, conference_and_workshop_pap... \n", - "2 [journal_articles, conference_and_workshop_pap... \n", + " repository_metadata.content_types \\\n", + "0 [bibliographic_references, theses_and_disserta... \n", + "1 [unpub_reports_and_working_papers] \n", + "2 [journal_articles, theses_and_dissertations, u... \n", "3 [journal_articles, conference_and_workshop_pap... \n", "4 [journal_articles, conference_and_workshop_pap... \n", "\n", - " institution metadata_policy \\\n", - "0 [[university of utrecht, [universiteit utrecht... True \n", - "1 [[indian institute of management kozhikode, [i... True \n", - "2 [[california institute of technology, [caltech... True \n", - "3 [[dublin city university, [dcu], ie, [], , htt... True \n", - "4 [[istituto nazionale di geofisica e vulcanolog... True \n", + " organization \\\n", + "0 [{'name': 'university of hong kong', 'alternat... \n", + "1 [{'name': 'central european university', 'alte... \n", + "2 [{'name': 'european university institute', 'al... \n", + "3 [{'name': 'uhasselt', 'alternativeName': 'hass... \n", + "4 [{'name': 'university of utrecht', 'alternativ... \n", "\n", - " data_policy submission_policy content_policy software \\\n", - "0 True False True dspace \n", - "1 True True True dspace 4.1 \n", - "2 True True True eprints 3.1.3 \n", - "3 True True True eprints 3.0.5 \n", - "4 True True True dspace 5.8.1-snapshot \n", + " policy_urls \\\n", + "0 [] \n", + "1 [] \n", + "2 [{\"policy_url\": \"https://www.eui.eu/research/e... \n", + "3 [] \n", + "4 [] \n", "\n", - " api \n", - "0 true \n", - "1 true \n", - "2 true \n", - "3 true \n", - "4 true " + " repository_metadata.software \\\n", + "0 {\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap... \n", + "1 {\"name\": \"eprints\", \"version\": \"2.2.1\"} \n", + "2 {\"name\": \"dspace\", \"version\": \"5.2\"} \n", + "3 {\"name\": \"dspace\", \"version\": \"1.7.2\"} \n", + "4 {\"name\": \"dspace\", \"version\": \"\"} \n", + "\n", + " repository_metadata.oai_url \\\n", + "0 NaN \n", + "1 http://rss.archives.ceu.hu/perl/oai2 \n", + "2 http://cadmus.eui.eu/oai/request \n", + "3 http://doclib.uhasselt.be/dspace-oai/request \n", + "4 https://dspace.library.uu.nl/oai/request \n", + "\n", + " system_metadata.publicly_visible repository_metadata.repository_status \\\n", + "0 yes fully_functional \n", + "1 yes fully_functional \n", + "2 yes fully_functional \n", + "3 yes fully_functional \n", + "4 yes fully_functional \n", + "\n", + " repository_metadata.fulltext_record_count \\\n", + "0 NaN \n", + "1 NaN \n", + "2 3867.0 \n", + "3 0.0 \n", + "4 1686.0 \n", + "\n", + " repository_metadata.metadata_record_count \n", + "0 11850.0 \n", + "1 164.0 \n", + "2 24869.0 \n", + "3 27376.0 \n", + "4 185637.0 " ] }, - "execution_count": 24, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", - " converters={'subject': ast.literal_eval,\n", - " 'additional_name': ast.literal_eval,\n", - " 'opendoar_id': ast.literal_eval,\n", - " 'content_type': ast.literal_eval,\n", - " 'institution': ast.literal_eval\n", - " })\n", + " converters={'repository_metadata.content_subjects_phrases': ast.literal_eval,\n", + " 'repository_metadata.alternativename': ast.literal_eval,\n", + " 'repository_metadata.content_types': ast.literal_eval,\n", + " 'organization': ast.literal_eval\n", + " },\n", + " dtype={'system_metadata.id': str})\n", + "\n", "opendoar_df.head()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n", - " 'repository_url', 'description', 'type', 'update_date', 'start_date',\n", - " 'subject', 'content_type', 'institution', 'metadata_policy',\n", - " 'data_policy', 'submission_policy', 'content_policy', 'software',\n", - " 'api'],\n", + "Index(['system_metadata.id', 'repository_metadata.name',\n", + " 'repository_metadata.alternativename', 'repository_metadata.url',\n", + " 'repository_metadata.description', 'repository_metadata.type',\n", + " 'repository_metadata.content_languages',\n", + " 'system_metadata.date_modified', 'system_metadata.date_created',\n", + " 'repository_metadata.content_subjects',\n", + " 'repository_metadata.content_types', 'organization', 'policy_urls',\n", + " 'repository_metadata.software', 'repository_metadata.oai_url',\n", + " 'system_metadata.publicly_visible',\n", + " 'repository_metadata.repository_status',\n", + " 'repository_metadata.fulltext_record_count',\n", + " 'repository_metadata.metadata_record_count'],\n", " dtype='object')" ] }, - "execution_count": 25, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -299,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -314,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -338,115 +380,119 @@ " \n", " \n", " \n", - " openaire_id\n", - " opendoar_id\n", - " repository_name\n", - " additional_name\n", - " repository_url\n", - " description\n", - " type\n", - " update_date\n", - " start_date\n", - " subject\n", - " content_type\n", - " institution\n", - " metadata_policy\n", - " data_policy\n", - " submission_policy\n", - " content_policy\n", - " software\n", - " api\n", + " system_metadata.id\n", + " repository_metadata.name\n", + " repository_metadata.alternativename\n", + " repository_metadata.url\n", + " repository_metadata.description\n", + " repository_metadata.type\n", + " repository_metadata.content_languages\n", + " system_metadata.date_modified\n", + " system_metadata.date_created\n", + " repository_metadata.content_subjects\n", + " repository_metadata.content_types\n", + " organization\n", + " policy_urls\n", + " repository_metadata.software\n", + " repository_metadata.oai_url\n", + " system_metadata.publicly_visible\n", + " repository_metadata.repository_status\n", + " repository_metadata.fulltext_record_count\n", + " repository_metadata.metadata_record_count\n", " \n", " \n", " \n", " \n", " count\n", - " 5707\n", - " 5707.000000\n", - " 5707\n", - " 2138\n", - " 5707\n", - " 5425\n", - " 5707\n", - " 5707\n", - " 5707\n", - " 5542\n", - " 5563\n", - " 5707\n", - " 5707\n", - " 5707\n", - " 5707\n", - " 5707\n", - " 5707\n", - " 5707\n", + " 5742\n", + " 5742\n", + " 2147\n", + " 5742\n", + " 5421\n", + " 5742\n", + " 5742\n", + " 5742\n", + " 5742\n", + " 5742\n", + " 5598\n", + " 5742\n", + " 5742\n", + " 5742\n", + " 4402\n", + " 5742\n", + " 5595\n", + " 2.299000e+03\n", + " 4.197000e+03\n", " \n", " \n", " unique\n", - " 5707\n", - " NaN\n", - " 5670\n", - " 2096\n", - " 5670\n", - " 4622\n", + " 5742\n", + " 5713\n", + " 2107\n", + " 5705\n", + " 4619\n", " 4\n", - " 2501\n", - " 5538\n", - " 819\n", - " 476\n", - " 5098\n", - " 2\n", - " 2\n", - " 2\n", - " 2\n", + " 330\n", + " 2372\n", + " 5573\n", + " 821\n", + " 477\n", + " 5201\n", + " 642\n", " 321\n", - " 2\n", + " 4370\n", + " 1\n", + " 7\n", + " NaN\n", + " NaN\n", " \n", " \n", " top\n", - " opendoar____::3cf166c6b73f030b4f67eeaeba301103\n", - " NaN\n", - " hiroshima associated repository portal\n", - " []\n", + " 175\n", + " {\"name\": \"hiroshima associated repository port...\n", + " [{'acronym': 'aura'}]\n", " http://harp.lib.hiroshima-u.ac.jp/\n", " this site provides access to the research outp...\n", " institutional\n", + " [\"en\"]\n", " 2020-09-18 12:53:48\n", " 2020-09-18 12:53:48\n", - " [multidisciplinary]\n", + " [\"multidisciplinary\"]\n", " [theses_and_dissertations]\n", - " [[rijksuniversiteit groningen, [rug], nl, [], ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " dspace\n", - " true\n", + " [{'name': 'rijksuniversiteit groningen', 'alte...\n", + " []\n", + " {\"name\": \"dspace\", \"version\": \"\"}\n", + " https://kidoks.bsz-bw.de/oai\n", + " yes\n", + " fully_functional\n", + " NaN\n", + " NaN\n", " \n", " \n", " freq\n", " 1\n", - " NaN\n", " 3\n", " 4\n", " 3\n", " 95\n", - " 5067\n", + " 5096\n", + " 1917\n", " 82\n", " 82\n", - " 3212\n", - " 460\n", + " 3227\n", + " 465\n", " 26\n", - " 4116\n", - " 4101\n", - " 5016\n", - " 4075\n", - " 800\n", - " 4374\n", + " 5098\n", + " 822\n", + " 3\n", + " 5742\n", + " 5276\n", + " NaN\n", + " NaN\n", " \n", " \n", " mean\n", " NaN\n", - " 4008.118801\n", " NaN\n", " NaN\n", " NaN\n", @@ -463,11 +509,12 @@ " NaN\n", " NaN\n", " NaN\n", + " 5.010186e+03\n", + " 1.760546e+05\n", " \n", " \n", " std\n", " NaN\n", - " 2869.948770\n", " NaN\n", " NaN\n", " NaN\n", @@ -484,11 +531,12 @@ " NaN\n", " NaN\n", " NaN\n", + " 4.206295e+04\n", + " 6.600825e+06\n", " \n", " \n", " min\n", " NaN\n", - " 2.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -505,11 +553,12 @@ " NaN\n", " NaN\n", " NaN\n", + " 0.000000e+00\n", + " 0.000000e+00\n", " \n", " \n", " 25%\n", " NaN\n", - " 1823.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -526,11 +575,12 @@ " NaN\n", " NaN\n", " NaN\n", + " 0.000000e+00\n", + " 8.950000e+02\n", " \n", " \n", " 50%\n", " NaN\n", - " 3361.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -547,11 +597,12 @@ " NaN\n", " NaN\n", " NaN\n", + " 4.220000e+02\n", + " 4.026000e+03\n", " \n", " \n", " 75%\n", " NaN\n", - " 5095.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -568,11 +619,12 @@ " NaN\n", " NaN\n", " NaN\n", + " 2.930500e+03\n", + " 1.630400e+04\n", " \n", " \n", " max\n", " NaN\n", - " 10175.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -589,41 +641,43 @@ " NaN\n", " NaN\n", " NaN\n", + " 1.817531e+06\n", + " 4.200000e+08\n", " \n", " \n", "\n", "" ], "text/plain": [ - " openaire_id opendoar_id \\\n", - "count 5707 5707.000000 \n", - "unique 5707 NaN \n", - "top opendoar____::3cf166c6b73f030b4f67eeaeba301103 NaN \n", - "freq 1 NaN \n", - "mean NaN 4008.118801 \n", - "std NaN 2869.948770 \n", - "min NaN 2.000000 \n", - "25% NaN 1823.000000 \n", - "50% NaN 3361.000000 \n", - "75% NaN 5095.000000 \n", - "max NaN 10175.000000 \n", + " system_metadata.id repository_metadata.name \\\n", + "count 5742 5742 \n", + "unique 5742 5713 \n", + "top 175 {\"name\": \"hiroshima associated repository port... \n", + "freq 1 3 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", "\n", - " repository_name additional_name \\\n", - "count 5707 2138 \n", - "unique 5670 2096 \n", - "top hiroshima associated repository portal [] \n", - "freq 3 4 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", + " repository_metadata.alternativename \\\n", + "count 2147 \n", + "unique 2107 \n", + "top [{'acronym': 'aura'}] \n", + "freq 4 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", "\n", - " repository_url \\\n", - "count 5707 \n", - "unique 5670 \n", + " repository_metadata.url \\\n", + "count 5742 \n", + "unique 5705 \n", "top http://harp.lib.hiroshima-u.ac.jp/ \n", "freq 3 \n", "mean NaN \n", @@ -634,73 +688,125 @@ "75% NaN \n", "max NaN \n", "\n", - " description type \\\n", - "count 5425 5707 \n", - "unique 4622 4 \n", - "top this site provides access to the research outp... institutional \n", - "freq 95 5067 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", + " repository_metadata.description \\\n", + "count 5421 \n", + "unique 4619 \n", + "top this site provides access to the research outp... \n", + "freq 95 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", "\n", - " update_date start_date subject \\\n", - "count 5707 5707 5542 \n", - "unique 2501 5538 819 \n", - "top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n", - "freq 82 82 3212 \n", - "mean NaN NaN NaN \n", - "std NaN NaN NaN \n", - "min NaN NaN NaN \n", - "25% NaN NaN NaN \n", - "50% NaN NaN NaN \n", - "75% NaN NaN NaN \n", - "max NaN NaN NaN \n", + " repository_metadata.type repository_metadata.content_languages \\\n", + "count 5742 5742 \n", + "unique 4 330 \n", + "top institutional [\"en\"] \n", + "freq 5096 1917 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", "\n", - " content_type \\\n", - "count 5563 \n", - "unique 476 \n", - "top [theses_and_dissertations] \n", - "freq 460 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN \n", + " system_metadata.date_modified system_metadata.date_created \\\n", + "count 5742 5742 \n", + "unique 2372 5573 \n", + "top 2020-09-18 12:53:48 2020-09-18 12:53:48 \n", + "freq 82 82 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", "\n", - " institution metadata_policy \\\n", - "count 5707 5707 \n", - "unique 5098 2 \n", - "top [[rijksuniversiteit groningen, [rug], nl, [], ... False \n", - "freq 26 4116 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", + " repository_metadata.content_subjects repository_metadata.content_types \\\n", + "count 5742 5598 \n", + "unique 821 477 \n", + "top [\"multidisciplinary\"] [theses_and_dissertations] \n", + "freq 3227 465 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", "\n", - " data_policy submission_policy content_policy software api \n", - "count 5707 5707 5707 5707 5707 \n", - "unique 2 2 2 321 2 \n", - "top False False False dspace true \n", - "freq 4101 5016 4075 800 4374 \n", - "mean NaN NaN NaN NaN NaN \n", - "std NaN NaN NaN NaN NaN \n", - "min NaN NaN NaN NaN NaN \n", - "25% NaN NaN NaN NaN NaN \n", - "50% NaN NaN NaN NaN NaN \n", - "75% NaN NaN NaN NaN NaN \n", - "max NaN NaN NaN NaN NaN " + " organization policy_urls \\\n", + "count 5742 5742 \n", + "unique 5201 642 \n", + "top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n", + "freq 26 5098 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " repository_metadata.software repository_metadata.oai_url \\\n", + "count 5742 4402 \n", + "unique 321 4370 \n", + "top {\"name\": \"dspace\", \"version\": \"\"} https://kidoks.bsz-bw.de/oai \n", + "freq 822 3 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " system_metadata.publicly_visible repository_metadata.repository_status \\\n", + "count 5742 5595 \n", + "unique 1 7 \n", + "top yes fully_functional \n", + "freq 5742 5276 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " repository_metadata.fulltext_record_count \\\n", + "count 2.299000e+03 \n", + "unique NaN \n", + "top NaN \n", + "freq NaN \n", + "mean 5.010186e+03 \n", + "std 4.206295e+04 \n", + "min 0.000000e+00 \n", + "25% 0.000000e+00 \n", + "50% 4.220000e+02 \n", + "75% 2.930500e+03 \n", + "max 1.817531e+06 \n", + "\n", + " repository_metadata.metadata_record_count \n", + "count 4.197000e+03 \n", + "unique NaN \n", + "top NaN \n", + "freq NaN \n", + "mean 1.760546e+05 \n", + "std 6.600825e+06 \n", + "min 0.000000e+00 \n", + "25% 8.950000e+02 \n", + "50% 4.026000e+03 \n", + "75% 1.630400e+04 \n", + "max 4.200000e+08 " ] }, - "execution_count": 29, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -711,34 +817,35 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "openaire_id 0\n", - "opendoar_id 0\n", - "repository_name 0\n", - "additional_name 3569\n", - "repository_url 0\n", - "description 282\n", - "type 0\n", - "update_date 0\n", - "start_date 0\n", - "subject 165\n", - "content_type 144\n", - "institution 0\n", - "metadata_policy 0\n", - "data_policy 0\n", - "submission_policy 0\n", - "content_policy 0\n", - "software 0\n", - "api 0\n", + "system_metadata.id 0\n", + "repository_metadata.name 0\n", + "repository_metadata.alternativename 3595\n", + "repository_metadata.url 0\n", + "repository_metadata.description 321\n", + "repository_metadata.type 0\n", + "repository_metadata.content_languages 0\n", + "system_metadata.date_modified 0\n", + "system_metadata.date_created 0\n", + "repository_metadata.content_subjects 0\n", + "repository_metadata.content_types 144\n", + "organization 0\n", + "policy_urls 0\n", + "repository_metadata.software 0\n", + "repository_metadata.oai_url 1340\n", + "system_metadata.publicly_visible 0\n", + "repository_metadata.repository_status 147\n", + "repository_metadata.fulltext_record_count 3443\n", + "repository_metadata.metadata_record_count 1545\n", "dtype: int64" ] }, - "execution_count": 30, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -749,10 +856,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "repository_metadata.content_types\n", + "bibliographic_references 865\n", + "books_chapters_and_sections 2194\n", + "conference_and_workshop_papers 1981\n", + "datasets 401\n", + "journal_articles 4030\n", + "learning_objects 789\n", + "other_special_item_types 1759\n", + "patents 182\n", + "software 92\n", + "theses_and_dissertations 3319\n", + "unpub_reports_and_working_papers 1904\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(opendoar_df['repository_metadata.content_types'].explode()).groupby('repository_metadata.content_types').size()" + ] }, { "cell_type": "code", diff --git a/notebooks/01.3-exploration-roar.ipynb b/notebooks/01.3-exploration-roar.ipynb index 05a0433..521afe4 100644 --- a/notebooks/01.3-exploration-roar.ipynb +++ b/notebooks/01.3-exploration-roar.ipynb @@ -993,12 +993,12 @@ " 1\n", " NaN\n", " NaN\n", - " disk0/00/00/14/07\n", + " disk0/00/00/09/21\n", " 2010-01-06 13:43:48\n", - " 2011-07-06 08:24:53\n", + " 2016-04-17 21:55:19\n", " 2010-01-06 13:43:48\n", " institutional\n", - " 12637\n", + " 3164\n", " NaN\n", " show\n", " NaN\n", @@ -1020,9 +1020,9 @@ " http://eprints.upnjatim.ac.id/\n", " Repositorio Institucional\n", " http://virtuelcampus.univ-msila.dz/fll\n", - " http://npl.csircentral.net/\n", + " http://repositorio.itesm.mx/ortec/\n", " http://eprints.upnjatim.ac.id/cgi/latest_tool?...\n", - " https://twitter.com/rpsicomdp?lang=es\n", + " http://twitter.com/bu_ufsc\n", " info:other:archives.eprints.org:import\n", " TRUE\n", " TRUE\n", @@ -1054,9 +1054,9 @@ " celestial\n", " opendoar\n", " 2021-01-25\n", - " 367\n", - " 738\n", - " 362\n", + " 1\n", + " 6\n", + " 1\n", " 824\n", " 806\n", " 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...\n", @@ -1155,13 +1155,13 @@ " dir datestamp lastmod \\\n", "count 5375 5375 5375 \n", "unique 5375 4127 3966 \n", - "top disk0/00/00/14/07 2010-01-06 13:43:48 2011-07-06 08:24:53 \n", + "top disk0/00/00/09/21 2010-01-06 13:43:48 2016-04-17 21:55:19 \n", "freq 1 16 8 \n", "\n", " status_changed type succeeds commentary \\\n", "count 5375 5375 107 0 \n", "unique 4158 12 107 0 \n", - "top 2010-01-06 13:43:48 institutional 12637 NaN \n", + "top 2010-01-06 13:43:48 institutional 3164 NaN \n", "freq 16 3795 1 NaN \n", "\n", " metadata_visibility latitude longitude relation_type relation_uri \\\n", @@ -1206,11 +1206,11 @@ "top Repositorio Institucional http://virtuelcampus.univ-msila.dz/fll \n", "freq 7 5 \n", "\n", - " sword_endpoint \\\n", - "count 176 \n", - "unique 170 \n", - "top http://npl.csircentral.net/ \n", - "freq 2 \n", + " sword_endpoint \\\n", + "count 176 \n", + "unique 170 \n", + "top http://repositorio.itesm.mx/ortec/ \n", + "freq 2 \n", "\n", " rss_feed \\\n", "count 1521 \n", @@ -1218,35 +1218,35 @@ "top http://eprints.upnjatim.ac.id/cgi/latest_tool?... \n", "freq 5 \n", "\n", - " twitter_feed \\\n", - "count 115 \n", - "unique 111 \n", - "top https://twitter.com/rpsicomdp?lang=es \n", - "freq 2 \n", + " twitter_feed description \\\n", + "count 115 3782 \n", + "unique 111 3304 \n", + "top http://twitter.com/bu_ufsc info:other:archives.eprints.org:import \n", + "freq 2 112 \n", "\n", - " description fulltext open_access mandate \\\n", - "count 3782 4127 4127 3676 \n", - "unique 3304 2 2 2 \n", - "top info:other:archives.eprints.org:import TRUE TRUE FALSE \n", - "freq 112 2758 2652 2699 \n", + " fulltext open_access mandate organisation_title \\\n", + "count 4127 4127 3676 5182 \n", + "unique 2 2 2 4437 \n", + "top TRUE TRUE FALSE Chinese Academy of Science (中国科学院) \n", + "freq 2758 2652 2699 9 \n", "\n", - " organisation_title organisation_home_page \\\n", - "count 5182 4898 \n", - "unique 4437 4328 \n", - "top Chinese Academy of Science (中国科学院) http://www.cas.cn/ \n", - "freq 9 9 \n", + " organisation_home_page location_country location_city \\\n", + "count 4898 5205 3774 \n", + "unique 4328 136 1875 \n", + "top http://www.cas.cn/ us Lima \n", + "freq 9 902 70 \n", "\n", - " location_country location_city location_latitude location_longitude \\\n", - "count 5205 3774 3752 3734 \n", - "unique 136 1875 2927 2965 \n", - "top us Lima 34.1607 -118.139 \n", - "freq 902 70 25 25 \n", + " location_latitude location_longitude software geoname version \\\n", + "count 3752 3734 4637 4671 5375 \n", + "unique 2927 2965 31 126 53 \n", + "top 34.1607 -118.139 dspace geoname_2_US other \n", + "freq 25 25 2307 840 4771 \n", "\n", - " software geoname version subjects date \\\n", - "count 4637 4671 5375 10524 5360 \n", - "unique 31 126 53 237 4830 \n", - "top dspace geoname_2_US other L1 2006-05-04 10:48:14 \n", - "freq 2307 840 4771 348 99 \n", + " subjects date \\\n", + "count 10524 5360 \n", + "unique 237 4830 \n", + "top L1 2006-05-04 10:48:14 \n", + "freq 348 99 \n", "\n", " note \\\n", "count 215 \n", @@ -1287,7 +1287,7 @@ " webometrics_rank webometrics_size webometrics_visibility \\\n", "count 148 148 148 \n", "unique 148 148 148 \n", - "top 367 738 362 \n", + "top 1 6 1 \n", "freq 1 1 1 \n", "\n", " webometrics_rich_files webometrics_scholar \\\n", @@ -1770,454 +1770,6 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidrev_numbereprint_statususeridimportidsourcedirdatestamplastmodstatus_changedtypesucceedscommentarymetadata_visibilitylatitudelongituderelation_typerelation_uriitem_issues_iditem_issues_typeitem_issues_descriptionitem_issues_timestampitem_issues_statusitem_issues_reported_byitem_issues_resolved_byitem_issues_commentitem_issues_countsword_depositorsword_slugexemplarhome_pagetitleoai_pmhsword_endpointrss_feedtwitter_feeddescriptionfulltextopen_accessmandateorganisation_titleorganisation_home_pagelocation_countrylocation_citylocation_latitudelocation_longitudesoftwaregeonameversionsubjectsdatenotesuggestionsactivity_lowactivity_mediumactivity_highrecordcountrecordhistoryfulltexts_totalfulltexts_docsfulltexts_rtotalfulltexts_rdocsregistry_nameregistry_idsubmit_tosubmitted_to_namesubmitted_to_donewebometrics_rankwebometrics_sizewebometrics_visibilitywebometrics_rich_fileswebometrics_scholarmonthly_depositstotal_depositsassociation
2606657archive1NaNNaNdisk0/00/00/06/062010-01-06 13:44:312016-04-17 21:53:142010-01-06 13:44:31subjectNaNNaNshowNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0NaNNaNNaNhttp://hal.archives-ouvertes.fr/HAL: Hyper Article en Lignehttp://hal.archives-ouvertes.fr/oai/oai.phpNaNNaNNaNNaNTRUETRUENaNNaNNaNfrNaNNaNNaNhalgeoname_2_FRotherNaN1998-11-02 11:53:57NaNNaN0026758164,12,17,26,43,57,81,185,431,861,1184,1517,2442...NaNNaNNaNNaNopendoar166NaNNaNNaN161110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...3063NaN
3606NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNcelestial1106NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4606NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNroarmap69NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " eprintid rev_number eprint_status userid importid source dir \\\n", - "2 606 657 archive 1 NaN NaN disk0/00/00/06/06 \n", - "3 606 NaN NaN NaN NaN NaN NaN \n", - "4 606 NaN NaN NaN NaN NaN NaN \n", - "\n", - " datestamp lastmod status_changed type \\\n", - "2 2010-01-06 13:44:31 2016-04-17 21:53:14 2010-01-06 13:44:31 subject \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " succeeds commentary metadata_visibility latitude longitude relation_type \\\n", - "2 NaN NaN show NaN NaN NaN \n", - "3 NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN NaN \n", - "\n", - " relation_uri item_issues_id item_issues_type item_issues_description \\\n", - "2 NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " item_issues_timestamp item_issues_status item_issues_reported_by \\\n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "\n", - " item_issues_resolved_by item_issues_comment item_issues_count \\\n", - "2 NaN NaN 0 \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "\n", - " sword_depositor sword_slug exemplar home_page \\\n", - "2 NaN NaN NaN http://hal.archives-ouvertes.fr/ \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " title oai_pmh \\\n", - "2 HAL: Hyper Article en Ligne http://hal.archives-ouvertes.fr/oai/oai.php \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " sword_endpoint rss_feed twitter_feed description fulltext open_access \\\n", - "2 NaN NaN NaN NaN TRUE TRUE \n", - "3 NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN NaN \n", - "\n", - " mandate organisation_title organisation_home_page location_country \\\n", - "2 NaN NaN NaN fr \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " location_city location_latitude location_longitude software geoname \\\n", - "2 NaN NaN NaN hal geoname_2_FR \n", - "3 NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN \n", - "\n", - " version subjects date note suggestions activity_low \\\n", - "2 other NaN 1998-11-02 11:53:57 NaN NaN 0 \n", - "3 NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN NaN \n", - "\n", - " activity_medium activity_high recordcount \\\n", - "2 0 2 675816 \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "\n", - " recordhistory fulltexts_total \\\n", - "2 4,12,17,26,43,57,81,185,431,861,1184,1517,2442... NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id \\\n", - "2 NaN NaN NaN opendoar 166 \n", - "3 NaN NaN NaN celestial 1106 \n", - "4 NaN NaN NaN roarmap 69 \n", - "\n", - " submit_to submitted_to_name submitted_to_done webometrics_rank \\\n", - "2 NaN NaN NaN 1 \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " webometrics_size webometrics_visibility webometrics_rich_files \\\n", - "2 6 1 1 \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "\n", - " webometrics_scholar monthly_deposits \\\n", - "2 1 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " total_deposits association \n", - "2 3063 NaN \n", - "3 NaN NaN \n", - "4 NaN NaN " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df[roar_df.eprintid == '606']" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [ { "data": { @@ -2398,7 +1950,7 @@ " 1\n", " {nan, 633}\n", " {nan, archive}\n", - " {nan, 1}\n", + " {1, nan}\n", " {nan}\n", " {nan}\n", " {nan, disk0/00/00/00/01}\n", @@ -2408,7 +1960,7 @@ " {nan, subject}\n", " {nan}\n", " {nan}\n", - " {nan, show}\n", + " {show, nan}\n", " {nan}\n", " {nan}\n", " {nan}\n", @@ -2426,7 +1978,7 @@ " {nan}\n", " {nan}\n", " {nan, http://archivesic.ccsd.cnrs.fr/}\n", - " {@RCHIVESIC , nan}\n", + " {nan, @RCHIVESIC }\n", " {nan, http://archivesic.ccsd.cnrs.fr/oai/oai.php}\n", " {nan}\n", " {nan}\n", @@ -2442,7 +1994,7 @@ " {nan}\n", " {nan}\n", " {nan, hal}\n", - " {geoname_2_FR, nan}\n", + " {nan, geoname_2_FR}\n", " {nan, other}\n", " {nan}\n", " {nan, 2002-05-17 19:24:41}\n", @@ -2451,7 +2003,7 @@ " {nan, 0}\n", " {nan, 0}\n", " {nan, 0}\n", - " {nan, 25}\n", + " {25, nan}\n", " {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...\n", " {nan}\n", " {nan}\n", @@ -2475,17 +2027,17 @@ " 10\n", " {nan, 511}\n", " {nan, archive}\n", - " {nan, 1}\n", + " {1, nan}\n", " {nan}\n", " {nan}\n", - " {nan, disk0/00/00/00/10}\n", + " {disk0/00/00/00/10, nan}\n", " {nan, 2010-01-06 13:43:48}\n", " {nan, 2011-07-18 05:40:13}\n", " {nan, 2010-01-06 13:43:48}\n", " {nan, institutional}\n", " {nan}\n", " {nan}\n", - " {nan, show}\n", + " {show, nan}\n", " {nan}\n", " {nan}\n", " {nan}\n", @@ -2502,15 +2054,15 @@ " {nan}\n", " {nan}\n", " {nan}\n", - " {nan, http://www.diva-portal.org/mdh/}\n", + " {http://www.diva-portal.org/mdh/, nan}\n", " {nan, Academic Archive On-line (Mälardalen Uni...\n", " {nan, http://www.diva-portal.org/oai/mdh/OAI}\n", " {nan}\n", " {nan}\n", " {nan}\n", " {nan}\n", - " {nan, TRUE}\n", - " {nan, TRUE}\n", + " {TRUE, nan}\n", + " {TRUE, nan}\n", " {nan}\n", " {nan}\n", " {nan}\n", @@ -2519,7 +2071,7 @@ " {nan, 59.8667}\n", " {nan, 17.6333}\n", " {nan, diva}\n", - " {nan, geoname_2_SE}\n", + " {geoname_2_SE, nan}\n", " {nan, other}\n", " {nan}\n", " {nan, 2005-12-08 13:15:22}\n", @@ -2529,7 +2081,7 @@ " {nan, 0}\n", " {nan, 0}\n", " {nan, 100}\n", - " {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,1...\n", + " {0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,10...\n", " {nan}\n", " {nan}\n", " {nan}\n", @@ -2639,7 +2191,7 @@ " {nan, subject}\n", " {nan}\n", " {nan}\n", - " {nan, show}\n", + " {show, nan}\n", " {nan}\n", " {nan}\n", " {nan}\n", @@ -2656,18 +2208,18 @@ " {nan}\n", " {nan}\n", " {nan}\n", - " {http://edoc.sub.uni-hamburg.de/klimawandel/, ...\n", - " {nan, Klimawandel Dokumentenserver}\n", + " {nan, http://edoc.sub.uni-hamburg.de/klimawand...\n", + " {Klimawandel Dokumentenserver, nan}\n", " {nan, http://edoc.sub.uni-hamburg.de/klimawand...\n", " {nan}\n", " {nan}\n", " {nan}\n", " {nan, The \"Documentenserver Klimawandel\" (Repo...\n", - " {nan, TRUE}\n", - " {nan, TRUE}\n", - " {nan, TRUE}\n", - " {nan, KLIMZUG projects, Helmholtz-Zentrum Gees...\n", - " {http://www.climateservicecenter.de/, nan, htt...\n", + " {TRUE, nan}\n", + " {TRUE, nan}\n", + " {TRUE, nan}\n", + " {nan, Helmholtz-Zentrum Geesthacht, KLIMZUG pr...\n", + " {http://www.hzg.de/, nan, http://www.climatese...\n", " {nan, de}\n", " {nan, Hamburg}\n", " {nan, 53.5511}\n", @@ -2675,7 +2227,7 @@ " {nan, opus}\n", " {nan, geoname_2_DE}\n", " {nan, other}\n", - " {S1, HD, GF, GE, G1}\n", + " {HD, S1, GF, GE, G1}\n", " {nan, 2015-07-02 08:08:31}\n", " {nan}\n", " {nan}\n", @@ -2689,7 +2241,7 @@ " {nan}\n", " {nan}\n", " {nan, celestial, opendoar}\n", - " {3408, 5881, nan}\n", + " {nan, 5881, 3408}\n", " {nan}\n", " {nan}\n", " {nan}\n", @@ -2786,8 +2338,8 @@ "text/plain": [ " rev_number eprint_status userid importid source \\\n", "eprintid \n", - "1 {nan, 633} {nan, archive} {nan, 1} {nan} {nan} \n", - "10 {nan, 511} {nan, archive} {nan, 1} {nan} {nan} \n", + "1 {nan, 633} {nan, archive} {1, nan} {nan} {nan} \n", + "10 {nan, 511} {nan, archive} {1, nan} {nan} {nan} \n", "1000 {274} {archive} {1} {nan} {nan} \n", "10001 {nan, 20} {nan, archive} {nan, 91} {nan} {nan} \n", "10008 {11} {archive} {404} {nan} {nan} \n", @@ -2795,7 +2347,7 @@ " dir datestamp \\\n", "eprintid \n", "1 {nan, disk0/00/00/00/01} {nan, 2010-01-06 13:43:48} \n", - "10 {nan, disk0/00/00/00/10} {nan, 2010-01-06 13:43:48} \n", + "10 {disk0/00/00/00/10, nan} {nan, 2010-01-06 13:43:48} \n", "1000 {disk0/00/00/10/00} {2010-01-06 13:45:01} \n", "10001 {nan, disk0/00/01/00/01} {nan, 2015-08-08 14:52:11} \n", "10008 {disk0/00/01/00/08} {2015-08-08 14:52:26} \n", @@ -2810,10 +2362,10 @@ "\n", " type succeeds commentary metadata_visibility \\\n", "eprintid \n", - "1 {nan, subject} {nan} {nan} {nan, show} \n", - "10 {nan, institutional} {nan} {nan} {nan, show} \n", + "1 {nan, subject} {nan} {nan} {show, nan} \n", + "10 {nan, institutional} {nan} {nan} {show, nan} \n", "1000 {subject} {nan} {nan} {show} \n", - "10001 {nan, subject} {nan} {nan} {nan, show} \n", + "10001 {nan, subject} {nan} {nan} {show, nan} \n", "10008 {institutional} {nan} {nan} {show} \n", "\n", " latitude longitude relation_type relation_uri item_issues_id \\\n", @@ -2851,17 +2403,17 @@ " exemplar home_page \\\n", "eprintid \n", "1 {nan} {nan, http://archivesic.ccsd.cnrs.fr/} \n", - "10 {nan} {nan, http://www.diva-portal.org/mdh/} \n", + "10 {nan} {http://www.diva-portal.org/mdh/, nan} \n", "1000 {nan} {http://pam.pisharp.org/} \n", - "10001 {nan} {http://edoc.sub.uni-hamburg.de/klimawandel/, ... \n", + "10001 {nan} {nan, http://edoc.sub.uni-hamburg.de/klimawand... \n", "10008 {nan} {http://creativematter.skidmore.edu/} \n", "\n", " title \\\n", "eprintid \n", - "1 {@RCHIVESIC , nan} \n", + "1 {nan, @RCHIVESIC } \n", "10 {nan, Academic Archive On-line (Mälardalen Uni... \n", "1000 {PAM - Portuguese Archive of Mathematics} \n", - "10001 {nan, Klimawandel Dokumentenserver} \n", + "10001 {Klimawandel Dokumentenserver, nan} \n", "10008 {Creative Matter | Skidmore College Research} \n", "\n", " oai_pmh sword_endpoint \\\n", @@ -2883,17 +2435,17 @@ " description fulltext \\\n", "eprintid \n", "1 {nan} {nan} \n", - "10 {nan} {nan, TRUE} \n", + "10 {nan} {TRUE, nan} \n", "1000 {nan} {TRUE} \n", - "10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {nan, TRUE} \n", + "10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {TRUE, nan} \n", "10008 {Welcome to Creative Matter, a repository for ... {TRUE} \n", "\n", " open_access mandate \\\n", "eprintid \n", "1 {nan} {nan} \n", - "10 {nan, TRUE} {nan} \n", + "10 {TRUE, nan} {nan} \n", "1000 {TRUE} {nan} \n", - "10001 {nan, TRUE} {nan, TRUE} \n", + "10001 {TRUE, nan} {TRUE, nan} \n", "10008 {FALSE} {FALSE} \n", "\n", " organisation_title \\\n", @@ -2901,7 +2453,7 @@ "1 {nan} \n", "10 {nan} \n", "1000 {nan} \n", - "10001 {nan, KLIMZUG projects, Helmholtz-Zentrum Gees... \n", + "10001 {nan, Helmholtz-Zentrum Geesthacht, KLIMZUG pr... \n", "10008 {Skidmore College} \n", "\n", " organisation_home_page location_country \\\n", @@ -2909,7 +2461,7 @@ "1 {nan} {nan, fr} \n", "10 {nan} {nan, se} \n", "1000 {nan} {pt} \n", - "10001 {http://www.climateservicecenter.de/, nan, htt... {nan, de} \n", + "10001 {http://www.hzg.de/, nan, http://www.climatese... {nan, de} \n", "10008 {http://www.skidmore.edu/} {us} \n", "\n", " location_city location_latitude location_longitude \\\n", @@ -2922,8 +2474,8 @@ "\n", " software geoname version \\\n", "eprintid \n", - "1 {nan, hal} {geoname_2_FR, nan} {nan, other} \n", - "10 {nan, diva} {nan, geoname_2_SE} {nan, other} \n", + "1 {nan, hal} {nan, geoname_2_FR} {nan, other} \n", + "10 {nan, diva} {geoname_2_SE, nan} {nan, other} \n", "1000 {dspace} {geoname_2_PT} {other} \n", "10001 {nan, opus} {nan, geoname_2_DE} {nan, other} \n", "10008 {bepress} {geoname_2_US} {other} \n", @@ -2933,12 +2485,12 @@ "1 {nan} {nan, 2002-05-17 19:24:41} {nan} {nan} \n", "10 {nan} {nan, 2005-12-08 13:15:22} {nan} {nan} \n", "1000 {nan} {2006-05-04 10:48:14} {nan} {nan} \n", - "10001 {S1, HD, GF, GE, G1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n", + "10001 {HD, S1, GF, GE, G1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n", "10008 {nan} {2015-07-06 17:35:50} {nan} {nan} \n", "\n", " activity_low activity_medium activity_high recordcount \\\n", "eprintid \n", - "1 {nan, 0} {nan, 0} {nan, 0} {nan, 25} \n", + "1 {nan, 0} {nan, 0} {nan, 0} {25, nan} \n", "10 {nan, 0} {nan, 0} {nan, 0} {nan, 100} \n", "1000 {nan} {nan} {nan} {nan} \n", "10001 {nan} {nan} {nan} {nan} \n", @@ -2947,7 +2499,7 @@ " recordhistory fulltexts_total \\\n", "eprintid \n", "1 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... {nan} \n", - "10 {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,1... {nan} \n", + "10 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,10... {nan} \n", "1000 {nan} {nan} \n", "10001 {nan} {nan} \n", "10008 {nan} {nan} \n", @@ -2965,7 +2517,7 @@ "1 {celestial, opendoar} {58, 669} {nan} \n", "10 {celestial, opendoar} {258, 526} {nan} \n", "1000 {nan} {nan} {nan} \n", - "10001 {nan, celestial, opendoar} {3408, 5881, nan} {nan} \n", + "10001 {nan, celestial, opendoar} {nan, 5881, 3408} {nan} \n", "10008 {celestial} {5882} {nan} \n", "\n", " submitted_to_name submitted_to_done webometrics_rank \\\n", @@ -2993,7 +2545,7 @@ "10008 {nan} {nan} {nan} {nan} " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -3005,7 +2557,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -3455,8 +3007,8 @@ " TRUE\n", " TRUE\n", " TRUE\n", - " [KLIMZUG projects, Helmholtz-Zentrum Geesthach...\n", - " [http://www.climateservicecenter.de/, http://w...\n", + " [Helmholtz-Zentrum Geesthacht, KLIMZUG project...\n", + " [http://www.hzg.de/, http://www.climateservice...\n", " de\n", " Hamburg\n", " 53.5511\n", @@ -3464,7 +3016,7 @@ " opus\n", " geoname_2_DE\n", " other\n", - " [S1, GE, HD, GF, G1]\n", + " [GF, HD, GE, S1, G1]\n", " 2015-07-02 08:08:31\n", " NaN\n", " NaN\n", @@ -3478,7 +3030,7 @@ " NaN\n", " NaN\n", " [celestial, opendoar]\n", - " [3408, 5881]\n", + " [5881, 3408]\n", " NaN\n", " NaN\n", " NaN\n", @@ -3682,7 +3234,7 @@ "1 NaN \n", "10 NaN \n", "1000 NaN \n", - "10001 [KLIMZUG projects, Helmholtz-Zentrum Geesthach... \n", + "10001 [Helmholtz-Zentrum Geesthacht, KLIMZUG project... \n", "10008 Skidmore College \n", "\n", " organisation_home_page location_country \\\n", @@ -3690,7 +3242,7 @@ "1 NaN fr \n", "10 NaN se \n", "1000 NaN pt \n", - "10001 [http://www.climateservicecenter.de/, http://w... de \n", + "10001 [http://www.hzg.de/, http://www.climateservice... de \n", "10008 http://www.skidmore.edu/ us \n", "\n", " location_city location_latitude location_longitude software \\\n", @@ -3706,7 +3258,7 @@ "1 geoname_2_FR other NaN 2002-05-17 19:24:41 \n", "10 geoname_2_SE other NaN 2005-12-08 13:15:22 \n", "1000 geoname_2_PT other NaN 2006-05-04 10:48:14 \n", - "10001 geoname_2_DE other [S1, GE, HD, GF, G1] 2015-07-02 08:08:31 \n", + "10001 geoname_2_DE other [GF, HD, GE, S1, G1] 2015-07-02 08:08:31 \n", "10008 geoname_2_US other NaN 2015-07-06 17:35:50 \n", "\n", " note suggestions activity_low activity_medium activity_high \\\n", @@ -3738,7 +3290,7 @@ "1 [celestial, opendoar] [58, 669] NaN NaN \n", "10 [celestial, opendoar] [258, 526] NaN NaN \n", "1000 NaN NaN NaN NaN \n", - "10001 [celestial, opendoar] [3408, 5881] NaN NaN \n", + "10001 [celestial, opendoar] [5881, 3408] NaN NaN \n", "10008 celestial 5882 NaN NaN \n", "\n", " submitted_to_done webometrics_rank webometrics_size \\\n", @@ -3766,7 +3318,7 @@ "10008 NaN NaN NaN " ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -3787,7 +3339,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -3953,8 +3505,8 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, roarmap, opendoar]\n", - " [69, 166, 1106]\n", + " [celestial, opendoar, roarmap]\n", + " [1106, 69, 166]\n", " NaN\n", " NaN\n", " NaN\n", @@ -4021,7 +3573,7 @@ "4188 NaN NaN NaN \n", "\n", " registry_name registry_id submit_to \\\n", - "4188 [celestial, roarmap, opendoar] [69, 166, 1106] NaN \n", + "4188 [celestial, opendoar, roarmap] [1106, 69, 166] NaN \n", "\n", " submitted_to_name submitted_to_done webometrics_rank webometrics_size \\\n", "4188 NaN NaN 1 6 \n", @@ -4036,7 +3588,7 @@ "4188 NaN " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -4048,7 +3600,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -4271,7 +3823,7 @@ " 2\n", " 2\n", " 3802\n", - " 3771\n", + " 3772\n", " 143\n", " 1861\n", " 2887\n", @@ -4292,8 +3844,8 @@ " 118\n", " 134\n", " 117\n", - " 7\n", - " 4257\n", + " 8\n", + " 4256\n", " 7\n", " 1\n", " 1\n", @@ -4308,18 +3860,18 @@ " \n", " \n", " top\n", - " 1259\n", + " 1\n", " 11\n", " archive\n", " 1\n", " NaN\n", " NaN\n", - " disk0/00/00/14/07\n", + " disk0/00/00/00/01\n", " 2010-01-06 13:43:48\n", - " 2016-05-02 05:43:04\n", + " 2011-07-06 08:24:53\n", " 2010-01-06 13:43:48\n", " institutional\n", - " 12637\n", + " 10164\n", " NaN\n", " show\n", " NaN\n", @@ -4341,9 +3893,9 @@ " http://eprints.upnjatim.ac.id/\n", " Repositorio Institucional\n", " http://kce.docressources.info/ws/PMBWs_2\n", - " http://npl.csircentral.net/\n", + " http://producao.usp.br/sword/servicedocument\n", " http://eprints.upnjatim.ac.id/cgi/latest_tool?...\n", - " http://twitter.com/bu_ufsc\n", + " http://my.indexcopernicus.com/fredemoreno\n", " info:other:archives.eprints.org:import\n", " TRUE\n", " TRUE\n", @@ -4371,13 +3923,13 @@ " 0\n", " 0\n", " [celestial, opendoar]\n", - " [2246, 1879]\n", - " [celestial, roarmap, opendoar]\n", + " 2479\n", + " [celestial, opendoar, roarmap]\n", " opendoar\n", " 2021-01-25\n", - " 367\n", - " 738\n", - " 668\n", + " 24\n", + " 46\n", + " 20\n", " 824\n", " 806\n", " 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...\n", @@ -5016,7 +4568,7 @@ " eprintid rev_number eprint_status userid importid source \\\n", "count 5375 5375 5375 5375 0.0 0.0 \n", "unique 5375 658 1 2135 NaN NaN \n", - "top 1259 11 archive 1 NaN NaN \n", + "top 1 11 archive 1 NaN NaN \n", "freq 1 332 5375 1333 NaN NaN \n", "mean NaN NaN NaN NaN NaN NaN \n", "std NaN NaN NaN NaN NaN NaN \n", @@ -5029,7 +4581,7 @@ " dir datestamp lastmod \\\n", "count 5375 5375 5375 \n", "unique 5375 4127 3966 \n", - "top disk0/00/00/14/07 2010-01-06 13:43:48 2016-05-02 05:43:04 \n", + "top disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-06 08:24:53 \n", "freq 1 16 8 \n", "mean NaN NaN NaN \n", "std NaN NaN NaN \n", @@ -5042,7 +4594,7 @@ " status_changed type succeeds commentary \\\n", "count 5375 5375 107 0.0 \n", "unique 4158 12 107 NaN \n", - "top 2010-01-06 13:43:48 institutional 12637 NaN \n", + "top 2010-01-06 13:43:48 institutional 10164 NaN \n", "freq 16 3795 1 NaN \n", "mean NaN NaN NaN NaN \n", "std NaN NaN NaN NaN \n", @@ -5143,18 +4695,18 @@ "75% NaN NaN \n", "max NaN NaN \n", "\n", - " sword_endpoint \\\n", - "count 176 \n", - "unique 170 \n", - "top http://npl.csircentral.net/ \n", - "freq 2 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN \n", + " sword_endpoint \\\n", + "count 176 \n", + "unique 170 \n", + "top http://producao.usp.br/sword/servicedocument \n", + "freq 2 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", "\n", " rss_feed \\\n", "count 1521 \n", @@ -5169,70 +4721,70 @@ "75% NaN \n", "max NaN \n", "\n", - " twitter_feed description \\\n", - "count 115 3782 \n", - "unique 111 3304 \n", - "top http://twitter.com/bu_ufsc info:other:archives.eprints.org:import \n", - "freq 2 112 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", + " twitter_feed \\\n", + "count 115 \n", + "unique 111 \n", + "top http://my.indexcopernicus.com/fredemoreno \n", + "freq 2 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", "\n", - " fulltext open_access mandate organisation_title \\\n", - "count 4127 4127 3676 4396 \n", - "unique 2 2 2 3802 \n", - "top TRUE TRUE FALSE Chinese Academy of Science (中国科学院) \n", - "freq 2758 2652 2699 9 \n", - "mean NaN NaN NaN NaN \n", - "std NaN NaN NaN NaN \n", - "min NaN NaN NaN NaN \n", - "25% NaN NaN NaN NaN \n", - "50% NaN NaN NaN NaN \n", - "75% NaN NaN NaN NaN \n", - "max NaN NaN NaN NaN \n", + " description fulltext open_access mandate \\\n", + "count 3782 4127 4127 3676 \n", + "unique 3304 2 2 2 \n", + "top info:other:archives.eprints.org:import TRUE TRUE FALSE \n", + "freq 112 2758 2652 2699 \n", + "mean NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN \n", "\n", - " organisation_home_page location_country location_city \\\n", - "count 4226 5080 3655 \n", - "unique 3771 143 1861 \n", - "top http://www.cas.cn/ us Lima \n", - "freq 9 886 69 \n", - "mean NaN NaN NaN \n", - "std NaN NaN NaN \n", - "min NaN NaN NaN \n", - "25% NaN NaN NaN \n", - "50% NaN NaN NaN \n", - "75% NaN NaN NaN \n", - "max NaN NaN NaN \n", + " organisation_title organisation_home_page \\\n", + "count 4396 4226 \n", + "unique 3802 3772 \n", + "top Chinese Academy of Science (中国科学院) http://www.cas.cn/ \n", + "freq 9 9 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", "\n", - " location_latitude location_longitude software geoname version \\\n", - "count 3681 3664 4637 4671 5375 \n", - "unique 2887 2917 31 126 53 \n", - "top 34.1607 -118.139 dspace geoname_2_US other \n", - "freq 25 25 2307 840 4771 \n", - "mean NaN NaN NaN NaN NaN \n", - "std NaN NaN NaN NaN NaN \n", - "min NaN NaN NaN NaN NaN \n", - "25% NaN NaN NaN NaN NaN \n", - "50% NaN NaN NaN NaN NaN \n", - "75% NaN NaN NaN NaN NaN \n", - "max NaN NaN NaN NaN NaN \n", + " location_country location_city location_latitude location_longitude \\\n", + "count 5080 3655 3681 3664 \n", + "unique 143 1861 2887 2917 \n", + "top us Lima 34.1607 -118.139 \n", + "freq 886 69 25 25 \n", + "mean NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN \n", "\n", - " subjects date \\\n", - "count 1250 5360 \n", - "unique 906 4830 \n", - "top K1 2006-05-04 10:48:14 \n", - "freq 53 99 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", + " software geoname version subjects date \\\n", + "count 4637 4671 5375 1250 5360 \n", + "unique 31 126 53 906 4830 \n", + "top dspace geoname_2_US other K1 2006-05-04 10:48:14 \n", + "freq 2307 840 4771 53 99 \n", + "mean NaN NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN NaN \n", "\n", " note \\\n", "count 215 \n", @@ -5288,7 +4840,7 @@ "\n", " fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name \\\n", "count 258 270 258 4603 \n", - "unique 118 134 117 7 \n", + "unique 118 134 117 8 \n", "top 0 0 0 [celestial, opendoar] \n", "freq 114 113 114 2114 \n", "mean NaN NaN NaN NaN \n", @@ -5299,23 +4851,23 @@ "75% NaN NaN NaN NaN \n", "max NaN NaN NaN NaN \n", "\n", - " registry_id submit_to submitted_to_name \\\n", - "count 4578 293 205 \n", - "unique 4257 7 1 \n", - "top [2246, 1879] [celestial, roarmap, opendoar] opendoar \n", - "freq 4 92 205 \n", - "mean NaN NaN NaN \n", - "std NaN NaN NaN \n", - "min NaN NaN NaN \n", - "25% NaN NaN NaN \n", - "50% NaN NaN NaN \n", - "75% NaN NaN NaN \n", - "max NaN NaN NaN \n", + " registry_id submit_to submitted_to_name \\\n", + "count 4578 293 205 \n", + "unique 4256 7 1 \n", + "top 2479 [celestial, opendoar, roarmap] opendoar \n", + "freq 4 92 205 \n", + "mean NaN NaN NaN \n", + "std NaN NaN NaN \n", + "min NaN NaN NaN \n", + "25% NaN NaN NaN \n", + "50% NaN NaN NaN \n", + "75% NaN NaN NaN \n", + "max NaN NaN NaN \n", "\n", " submitted_to_done webometrics_rank webometrics_size \\\n", "count 205 148 148 \n", "unique 1 148 148 \n", - "top 2021-01-25 367 738 \n", + "top 2021-01-25 24 46 \n", "freq 205 1 1 \n", "mean NaN NaN NaN \n", "std NaN NaN NaN \n", @@ -5328,7 +4880,7 @@ " webometrics_visibility webometrics_rich_files webometrics_scholar \\\n", "count 148 148 148 \n", "unique 148 146 143 \n", - "top 668 824 806 \n", + "top 20 824 806 \n", "freq 1 3 5 \n", "mean NaN NaN NaN \n", "std NaN NaN NaN \n", @@ -5365,7 +4917,7 @@ "max NaN " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -5376,7 +4928,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -5425,7 +4977,7 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -5436,7 +4988,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -5480,7 +5032,7 @@ "dtype: int64" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -5489,6 +5041,39 @@ "roar_df.isna().sum()[40:]" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "type\n", + "database 74\n", + "demonstration 20\n", + "institutional 3795\n", + "journal 121\n", + "learning 77\n", + "multi 141\n", + "opendata 41\n", + "other 409\n", + "researchdata 54\n", + "subject 294\n", + "theses 347\n", + "webobservatory 2\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(roar_df.type).groupby('type').size()" + ] + }, { "cell_type": "code", "execution_count": 13, @@ -5497,7 +5082,10 @@ { "data": { "text/plain": [ - "array([nan, 'TRUE', 'FALSE'], dtype=object)" + "open_access\n", + "FALSE 1475\n", + "TRUE 2652\n", + "dtype: int64" ] }, "execution_count": 13, @@ -5506,7 +5094,7 @@ } ], "source": [ - "roar_df.open_access.unique()" + "pd.DataFrame(roar_df.open_access).groupby('open_access').size()" ] }, { @@ -5517,9 +5105,10 @@ { "data": { "text/plain": [ - "array(['subject', 'institutional', 'researchdata', 'theses', 'database',\n", - " 'other', 'journal', 'opendata', 'demonstration', 'multi',\n", - " 'learning', 'webobservatory'], dtype=object)" + "mandate\n", + "FALSE 2699\n", + "TRUE 977\n", + "dtype: int64" ] }, "execution_count": 14, @@ -5528,27 +5117,7 @@ } ], "source": [ - "roar_df.type.unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([nan, 'TRUE', 'FALSE'], dtype=object)" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df.mandate.unique()" + "pd.DataFrame(roar_df.mandate).groupby('mandate').size()" ] }, { diff --git a/notebooks/01.4-exploration-fairsharing.ipynb b/notebooks/01.4-exploration-fairsharing.ipynb index 8c12ae4..5182e5f 100644 --- a/notebooks/01.4-exploration-fairsharing.ipynb +++ b/notebooks/01.4-exploration-fairsharing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -54,116 +54,468 @@ " \n", " \n", " \n", - " full_name\n", - " short_name\n", - " fs_url\n", - " url\n", - " countries\n", - " subjects\n", + " id\n", + " type\n", + " attributes.created-at\n", + " attributes.updated-at\n", + " attributes.metadata.doi\n", + " attributes.metadata.name\n", + " attributes.metadata.status\n", + " attributes.metadata.contacts\n", + " attributes.metadata.homepage\n", + " attributes.metadata.identifier\n", + " attributes.metadata.description\n", + " attributes.metadata.support-links\n", + " attributes.metadata.year-creation\n", + " attributes.metadata.data-processes\n", + " attributes.legacy-ids\n", + " attributes.fairsharing-registry\n", + " attributes.record-type\n", + " attributes.subjects\n", + " attributes.domains\n", + " attributes.taxonomies\n", + " attributes.user-defined-tags\n", + " attributes.countries\n", + " attributes.name\n", + " attributes.abbreviation\n", + " attributes.url\n", + " attributes.doi\n", + " attributes.fairsharing-licence\n", + " attributes.description\n", + " attributes.publications\n", + " attributes.licence-links\n", + " attributes.metadata.citations\n", + " attributes.metadata.abbreviation\n", + " attributes.metadata.access-points\n", + " attributes.metadata.associated-tools\n", + " attributes.metadata.deprecation-date\n", + " attributes.metadata.deprecation-reason\n", + " attributes.metadata.tombstone\n", " \n", " \n", " \n", " \n", " 0\n", - " GenBank\n", - " GenBank\n", - " https://fairsharing.org/10.25504/FAIRsharing.9...\n", - " https://www.ncbi.nlm.nih.gov/genbank/\n", - " European Union,Japan,United States\n", - " Bioinformatics,Data Management,Data Submission...\n", + " 1723\n", + " fairsharing-records\n", + " 2014-11-04T15:23:40.000Z\n", + " 2021-09-30T11:39:06.829Z\n", + " 10.25504/FAIRsharing.8t18te\n", + " Cell Image Library\n", + " ready\n", + " [{'contact-name': 'David Orloff', 'contact-ema...\n", + " http://www.cellimagelibrary.org\n", + " 1723\n", + " This library is a public and easily accessible...\n", + " [{'url': 'http://www.cellimagelibrary.org/page...\n", + " 2010.0\n", + " [{'name': 'live update', 'type': 'data release...\n", + " [biodbcore-000180, bsg-d000180]\n", + " Database\n", + " repository\n", + " [Cell Biology, Life Science]\n", + " [Cell, Microscopy, Light microscopy, Electron ...\n", + " [All]\n", + " []\n", + " [United States]\n", + " FAIRsharing record for: Cell Image Library\n", + " None\n", + " https://fairsharing.org/10.25504/FAIRsharing.8...\n", + " 10.25504/FAIRsharing.8t18te\n", + " https://creativecommons.org/licenses/by-sa/4.0...\n", + " This FAIRsharing record describes: This librar...\n", + " [{'id': 232, 'pubmed_id': 23203874, 'title': '...\n", + " [{'licence-name': 'Cell Image Library Data Pol...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 1\n", - " GlycoNAVI\n", - " GlycoNAVI\n", - " https://fairsharing.org/10.25504/FAIRsharing.w...\n", - " https://glyconavi.org/\n", - " Japan\n", - " Chemistry,Glycomics,Life Science,Organic Chemi...\n", + " 3101\n", + " fairsharing-records\n", + " 2020-09-16T08:49:13.000Z\n", + " 2021-09-30T11:36:45.452Z\n", + " NaN\n", + " WHOI Ship Data-Grabber System\n", + " ready\n", + " NaN\n", + " http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html\n", + " 3101\n", + " The WHOI Ship DataGrabber system provides the ...\n", + " [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o...\n", + " 2004.0\n", + " [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai...\n", + " [biodbcore-001609, bsg-d001609]\n", + " Database\n", + " repository\n", + " [Earth Science, Water Research, Oceanography]\n", + " []\n", + " [Not applicable]\n", + " [subseafloor environments]\n", + " [United States]\n", + " FAIRsharing record for: WHOI Ship Data-Grabber...\n", + " None\n", + " https://fairsharing.org/fairsharing_records/3101\n", + " None\n", + " https://creativecommons.org/licenses/by-sa/4.0...\n", + " This FAIRsharing record describes: The WHOI Sh...\n", + " []\n", + " [{'licence-name': 'NDSF Data Archive Policy', ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 2\n", - " ADHDgene\n", - " ADHDgene\n", - " https://fairsharing.org/10.25504/FAIRsharing.m...\n", - " http://adhd.psych.ac.cn/\n", - " China\n", - " Biomedical Science,Genetics\n", + " 2649\n", + " fairsharing-records\n", + " 2018-08-07T20:23:32.000Z\n", + " 2021-09-30T11:39:07.898Z\n", + " NaN\n", + " Electron Microscope Public Image Archive\n", + " ready\n", + " [{'contact-name': 'General contact', 'contact-...\n", + " https://www.ebi.ac.uk/pdbe/emdb/empiar/\n", + " 2649\n", + " EMPIAR, the Electron Microscopy Public Image A...\n", + " [{'url': 'https://www.ebi.ac.uk/support/EMPIAR...\n", + " 2015.0\n", + " [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi...\n", + " [biodbcore-001140, bsg-d001140]\n", + " Database\n", + " repository\n", + " [Bioinformatics, Biology]\n", + " [Protein image, Microscopy, Electron microscop...\n", + " [All]\n", + " []\n", + " [Greece, Czech Republic, United Kingdom, Icela...\n", + " FAIRsharing record for: Electron Microscope Pu...\n", + " EMPIAR\n", + " https://fairsharing.org/fairsharing_records/2649\n", + " None\n", + " https://creativecommons.org/licenses/by-sa/4.0...\n", + " This FAIRsharing record describes: EMPIAR, the...\n", + " [{'id': 2232, 'pubmed_id': 27067018, 'title': ...\n", + " [{'licence-name': 'EMBL-EBI Terms of Use', 'li...\n", + " [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27...\n", + " EMPIAR\n", + " [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi...\n", + " [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi...\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 3\n", - " Allele frequency resource for research and tea...\n", - " ALFRED\n", - " https://fairsharing.org/10.25504/FAIRsharing.y...\n", - " http://alfred.med.yale.edu\n", - " United States\n", - " Life Science\n", + " 2657\n", + " fairsharing-records\n", + " 2018-08-13T15:12:11.000Z\n", + " 2021-09-30T11:37:28.736Z\n", + " 10.25504/FAIRsharing.tnByoG\n", + " ClinicalStudyDataRequest.com\n", + " ready\n", + " [{'contact-email': 'support@clinicalstudydatar...\n", + " https://clinicalstudydatarequest.com/\n", + " 2657\n", + " ClinicalStudyDataRequest.com (CSDR) is a conso...\n", + " [{'url': 'https://clinicalstudydatarequest.com...\n", + " 2014.0\n", + " [{'url': 'https://clinicalstudydatarequest.com...\n", + " [biodbcore-001149, bsg-d001149]\n", + " Database\n", + " repository\n", + " [Preclinical Studies, Biomedical Science]\n", + " []\n", + " [Homo sapiens]\n", + " []\n", + " [Worldwide]\n", + " FAIRsharing record for: ClinicalStudyDataReque...\n", + " CSDR\n", + " https://fairsharing.org/10.25504/FAIRsharing.t...\n", + " 10.25504/FAIRsharing.tnByoG\n", + " https://creativecommons.org/licenses/by-sa/4.0...\n", + " This FAIRsharing record describes: ClinicalStu...\n", + " []\n", + " [{'licence-name': 'CSDR Data Sharing Agreement...\n", + " NaN\n", + " CSDR\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 4\n", - " Animal Transcription Factor Database\n", - " AnimalTFDB\n", - " https://fairsharing.org/10.25504/FAIRsharing.e...\n", - " http://bioinfo.life.hust.edu.cn/AnimalTFDB/\n", - " China\n", - " Life Science\n", + " 2078\n", + " fairsharing-records\n", + " 2014-11-04T15:23:40.000Z\n", + " 2021-09-30T11:34:43.129Z\n", + " 10.25504/FAIRsharing.3axym7\n", + " Germplasm Resources Information Network\n", + " ready\n", + " [{'contact-email': 'dbmu@ars-grin.gov'}]\n", + " https://www.ars-grin.gov/\n", + " 2078\n", + " GRIN provides National Genetic Resources Progr...\n", + " [{'url': 'https://www.ars-grin.gov/Pages/Colle...\n", + " 2010.0\n", + " [{'url': 'https://www.ars-grin.gov/', 'name': ...\n", + " [biodbcore-000546, bsg-d000546]\n", + " Database\n", + " repository\n", + " [Life Science]\n", + " [Cell, Cell culture, Germplasm]\n", + " [Bacteria, Metazoa, Viridiplantae]\n", + " []\n", + " [United States]\n", + " FAIRsharing record for: Germplasm Resources In...\n", + " GRIN\n", + " https://fairsharing.org/10.25504/FAIRsharing.3...\n", + " 10.25504/FAIRsharing.3axym7\n", + " https://creativecommons.org/licenses/by-sa/4.0...\n", + " This FAIRsharing record describes: GRIN provid...\n", + " []\n", + " []\n", + " NaN\n", + " GRIN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " full_name short_name \\\n", - "0 GenBank GenBank \n", - "1 GlycoNAVI GlycoNAVI \n", - "2 ADHDgene ADHDgene \n", - "3 Allele frequency resource for research and tea... ALFRED \n", - "4 Animal Transcription Factor Database AnimalTFDB \n", + " id type attributes.created-at \\\n", + "0 1723 fairsharing-records 2014-11-04T15:23:40.000Z \n", + "1 3101 fairsharing-records 2020-09-16T08:49:13.000Z \n", + "2 2649 fairsharing-records 2018-08-07T20:23:32.000Z \n", + "3 2657 fairsharing-records 2018-08-13T15:12:11.000Z \n", + "4 2078 fairsharing-records 2014-11-04T15:23:40.000Z \n", "\n", - " fs_url \\\n", - "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", - "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", - "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", - "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", - "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", + " attributes.updated-at attributes.metadata.doi \\\n", + "0 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te \n", + "1 2021-09-30T11:36:45.452Z NaN \n", + "2 2021-09-30T11:39:07.898Z NaN \n", + "3 2021-09-30T11:37:28.736Z 10.25504/FAIRsharing.tnByoG \n", + "4 2021-09-30T11:34:43.129Z 10.25504/FAIRsharing.3axym7 \n", "\n", - " url \\\n", - "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", - "1 https://glyconavi.org/ \n", - "2 http://adhd.psych.ac.cn/ \n", - "3 http://alfred.med.yale.edu \n", - "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", + " attributes.metadata.name attributes.metadata.status \\\n", + "0 Cell Image Library ready \n", + "1 WHOI Ship Data-Grabber System ready \n", + "2 Electron Microscope Public Image Archive ready \n", + "3 ClinicalStudyDataRequest.com ready \n", + "4 Germplasm Resources Information Network ready \n", "\n", - " countries \\\n", - "0 European Union,Japan,United States \n", - "1 Japan \n", - "2 China \n", - "3 United States \n", - "4 China \n", + " attributes.metadata.contacts \\\n", + "0 [{'contact-name': 'David Orloff', 'contact-ema... \n", + "1 NaN \n", + "2 [{'contact-name': 'General contact', 'contact-... \n", + "3 [{'contact-email': 'support@clinicalstudydatar... \n", + "4 [{'contact-email': 'dbmu@ars-grin.gov'}] \n", "\n", - " subjects \n", - "0 Bioinformatics,Data Management,Data Submission... \n", - "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", - "2 Biomedical Science,Genetics \n", - "3 Life Science \n", - "4 Life Science " + " attributes.metadata.homepage \\\n", + "0 http://www.cellimagelibrary.org \n", + "1 http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html \n", + "2 https://www.ebi.ac.uk/pdbe/emdb/empiar/ \n", + "3 https://clinicalstudydatarequest.com/ \n", + "4 https://www.ars-grin.gov/ \n", + "\n", + " attributes.metadata.identifier \\\n", + "0 1723 \n", + "1 3101 \n", + "2 2649 \n", + "3 2657 \n", + "4 2078 \n", + "\n", + " attributes.metadata.description \\\n", + "0 This library is a public and easily accessible... \n", + "1 The WHOI Ship DataGrabber system provides the ... \n", + "2 EMPIAR, the Electron Microscopy Public Image A... \n", + "3 ClinicalStudyDataRequest.com (CSDR) is a conso... \n", + "4 GRIN provides National Genetic Resources Progr... \n", + "\n", + " attributes.metadata.support-links \\\n", + "0 [{'url': 'http://www.cellimagelibrary.org/page... \n", + "1 [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... \n", + "2 [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... \n", + "3 [{'url': 'https://clinicalstudydatarequest.com... \n", + "4 [{'url': 'https://www.ars-grin.gov/Pages/Colle... \n", + "\n", + " attributes.metadata.year-creation \\\n", + "0 2010.0 \n", + "1 2004.0 \n", + "2 2015.0 \n", + "3 2014.0 \n", + "4 2010.0 \n", + "\n", + " attributes.metadata.data-processes \\\n", + "0 [{'name': 'live update', 'type': 'data release... \n", + "1 [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... \n", + "2 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... \n", + "3 [{'url': 'https://clinicalstudydatarequest.com... \n", + "4 [{'url': 'https://www.ars-grin.gov/', 'name': ... \n", + "\n", + " attributes.legacy-ids attributes.fairsharing-registry \\\n", + "0 [biodbcore-000180, bsg-d000180] Database \n", + "1 [biodbcore-001609, bsg-d001609] Database \n", + "2 [biodbcore-001140, bsg-d001140] Database \n", + "3 [biodbcore-001149, bsg-d001149] Database \n", + "4 [biodbcore-000546, bsg-d000546] Database \n", + "\n", + " attributes.record-type attributes.subjects \\\n", + "0 repository [Cell Biology, Life Science] \n", + "1 repository [Earth Science, Water Research, Oceanography] \n", + "2 repository [Bioinformatics, Biology] \n", + "3 repository [Preclinical Studies, Biomedical Science] \n", + "4 repository [Life Science] \n", + "\n", + " attributes.domains \\\n", + "0 [Cell, Microscopy, Light microscopy, Electron ... \n", + "1 [] \n", + "2 [Protein image, Microscopy, Electron microscop... \n", + "3 [] \n", + "4 [Cell, Cell culture, Germplasm] \n", + "\n", + " attributes.taxonomies attributes.user-defined-tags \\\n", + "0 [All] [] \n", + "1 [Not applicable] [subseafloor environments] \n", + "2 [All] [] \n", + "3 [Homo sapiens] [] \n", + "4 [Bacteria, Metazoa, Viridiplantae] [] \n", + "\n", + " attributes.countries \\\n", + "0 [United States] \n", + "1 [United States] \n", + "2 [Greece, Czech Republic, United Kingdom, Icela... \n", + "3 [Worldwide] \n", + "4 [United States] \n", + "\n", + " attributes.name attributes.abbreviation \\\n", + "0 FAIRsharing record for: Cell Image Library None \n", + "1 FAIRsharing record for: WHOI Ship Data-Grabber... None \n", + "2 FAIRsharing record for: Electron Microscope Pu... EMPIAR \n", + "3 FAIRsharing record for: ClinicalStudyDataReque... CSDR \n", + "4 FAIRsharing record for: Germplasm Resources In... GRIN \n", + "\n", + " attributes.url \\\n", + "0 https://fairsharing.org/10.25504/FAIRsharing.8... \n", + "1 https://fairsharing.org/fairsharing_records/3101 \n", + "2 https://fairsharing.org/fairsharing_records/2649 \n", + "3 https://fairsharing.org/10.25504/FAIRsharing.t... \n", + "4 https://fairsharing.org/10.25504/FAIRsharing.3... \n", + "\n", + " attributes.doi \\\n", + "0 10.25504/FAIRsharing.8t18te \n", + "1 None \n", + "2 None \n", + "3 10.25504/FAIRsharing.tnByoG \n", + "4 10.25504/FAIRsharing.3axym7 \n", + "\n", + " attributes.fairsharing-licence \\\n", + "0 https://creativecommons.org/licenses/by-sa/4.0... \n", + "1 https://creativecommons.org/licenses/by-sa/4.0... \n", + "2 https://creativecommons.org/licenses/by-sa/4.0... \n", + "3 https://creativecommons.org/licenses/by-sa/4.0... \n", + "4 https://creativecommons.org/licenses/by-sa/4.0... \n", + "\n", + " attributes.description \\\n", + "0 This FAIRsharing record describes: This librar... \n", + "1 This FAIRsharing record describes: The WHOI Sh... \n", + "2 This FAIRsharing record describes: EMPIAR, the... \n", + "3 This FAIRsharing record describes: ClinicalStu... \n", + "4 This FAIRsharing record describes: GRIN provid... \n", + "\n", + " attributes.publications \\\n", + "0 [{'id': 232, 'pubmed_id': 23203874, 'title': '... \n", + "1 [] \n", + "2 [{'id': 2232, 'pubmed_id': 27067018, 'title': ... \n", + "3 [] \n", + "4 [] \n", + "\n", + " attributes.licence-links \\\n", + "0 [{'licence-name': 'Cell Image Library Data Pol... \n", + "1 [{'licence-name': 'NDSF Data Archive Policy', ... \n", + "2 [{'licence-name': 'EMBL-EBI Terms of Use', 'li... \n", + "3 [{'licence-name': 'CSDR Data Sharing Agreement... \n", + "4 [] \n", + "\n", + " attributes.metadata.citations \\\n", + "0 NaN \n", + "1 NaN \n", + "2 [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " attributes.metadata.abbreviation \\\n", + "0 NaN \n", + "1 NaN \n", + "2 EMPIAR \n", + "3 CSDR \n", + "4 GRIN \n", + "\n", + " attributes.metadata.access-points \\\n", + "0 NaN \n", + "1 NaN \n", + "2 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " attributes.metadata.associated-tools \\\n", + "0 NaN \n", + "1 NaN \n", + "2 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " attributes.metadata.deprecation-date attributes.metadata.deprecation-reason \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " attributes.metadata.tombstone \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " ] }, - "execution_count": 2, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", - " delimiter='|', header=0,\n", - " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", + "with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:\n", + " lines = f.read().splitlines()\n", + " \n", + "fairsharing_df = pd.DataFrame(lines)\n", + "fairsharing_df.columns = ['json_element']\n", + "fairsharing_df['json_element'].apply(json.loads)\n", + "fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))\n", + "\n", "fairsharing_df.head()" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -187,76 +539,831 @@ " \n", " \n", " \n", - " full_name\n", - " short_name\n", - " fs_url\n", - " url\n", - " countries\n", - " subjects\n", + " id\n", + " type\n", + " attributes.created-at\n", + " attributes.updated-at\n", + " attributes.metadata.doi\n", + " attributes.metadata.name\n", + " attributes.metadata.status\n", + " attributes.metadata.contacts\n", + " attributes.metadata.homepage\n", + " attributes.metadata.identifier\n", + " attributes.metadata.description\n", + " attributes.metadata.support-links\n", + " attributes.metadata.year-creation\n", + " attributes.metadata.data-processes\n", + " attributes.legacy-ids\n", + " attributes.fairsharing-registry\n", + " attributes.record-type\n", + " attributes.subjects\n", + " attributes.domains\n", + " attributes.taxonomies\n", + " attributes.user-defined-tags\n", + " attributes.countries\n", + " attributes.name\n", + " attributes.abbreviation\n", + " attributes.url\n", + " attributes.doi\n", + " attributes.fairsharing-licence\n", + " attributes.description\n", + " attributes.publications\n", + " attributes.licence-links\n", + " attributes.metadata.citations\n", + " attributes.metadata.abbreviation\n", + " attributes.metadata.access-points\n", + " attributes.metadata.associated-tools\n", + " attributes.metadata.deprecation-date\n", + " attributes.metadata.deprecation-reason\n", + " attributes.metadata.tombstone\n", " \n", " \n", " \n", " \n", " count\n", - " 1752\n", - " 1752\n", - " 1752\n", - " 1752\n", - " 1749\n", - " 1690\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1354\n", + " 1797\n", + " 1797\n", + " 1678\n", + " 1797\n", + " 1797.000000\n", + " 1797\n", + " 1608\n", + " 1492.000000\n", + " 1565\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1638\n", + " 1797\n", + " 1354\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 1797\n", + " 326\n", + " 1638\n", + " 449\n", + " 618\n", + " 217\n", + " 217\n", + " 1\n", " \n", " \n", " unique\n", - " 1752\n", - " 1741\n", - " 1752\n", - " 1752\n", - " 178\n", - " 834\n", + " 1797\n", + " 1\n", + " 1162\n", + " 1797\n", + " 1354\n", + " 1796\n", + " 4\n", + " 1576\n", + " 1797\n", + " NaN\n", + " 1797\n", + " 1594\n", + " NaN\n", + " 1563\n", + " 1797\n", + " 1\n", + " 3\n", + " 888\n", + " 1163\n", + " 378\n", + " 384\n", + " 185\n", + " 1796\n", + " 1626\n", + " 1797\n", + " 1354\n", + " 1\n", + " 1797\n", + " 1109\n", + " 1082\n", + " 320\n", + " 1626\n", + " 444\n", + " 615\n", + " 55\n", + " 86\n", + " 1\n", " \n", " \n", " top\n", - " Brassica Information Portal\n", + " 1723\n", + " fairsharing-records\n", + " 2014-11-04T15:23:40.000Z\n", + " 2021-09-30T11:39:06.829Z\n", + " 10.25504/FAIRsharing.8t18te\n", + " OmicsDB\n", + " ready\n", + " [{'contact-name': 'Sam Hokin', 'contact-email'...\n", + " http://www.cellimagelibrary.org\n", + " NaN\n", + " This library is a public and easily accessible...\n", + " [{'url': 'https://github.com/gbif/ipt/wiki/IPT...\n", + " NaN\n", + " [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea...\n", + " [biodbcore-000180, bsg-d000180]\n", + " Database\n", + " repository\n", + " [Life Science]\n", + " []\n", + " [All]\n", + " []\n", + " [United States]\n", + " FAIRsharing record for: OmicsDB\n", " CGD\n", - " https://fairsharing.org/10.25504/FAIRsharing.e...\n", - " http://web.iodp.tamu.edu/LORE/\n", - " United States\n", - " Life Science\n", + " https://fairsharing.org/10.25504/FAIRsharing.8...\n", + " 10.25504/FAIRsharing.8t18te\n", + " https://creativecommons.org/licenses/by-sa/4.0...\n", + " This FAIRsharing record describes: This librar...\n", + " []\n", + " []\n", + " [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31...\n", + " CGD\n", + " [{'url': 'https://github.com/Ensembl', 'name':...\n", + " [{'url': 'http://www.h-invitational.jp/hinv/bl...\n", + " 2021-9-17\n", + " This resource is no longer available at the st...\n", + " True\n", " \n", " \n", " freq\n", " 1\n", + " 1797\n", + " 636\n", + " 1\n", + " 1\n", + " 2\n", + " 1540\n", + " 6\n", + " 1\n", + " NaN\n", + " 1\n", + " 6\n", + " NaN\n", + " 2\n", + " 1\n", + " 1797\n", + " 926\n", + " 350\n", + " 265\n", + " 502\n", + " 1193\n", + " 594\n", + " 2\n", " 3\n", " 1\n", " 1\n", - " 588\n", - " 367\n", + " 1797\n", + " 1\n", + " 661\n", + " 716\n", + " 6\n", + " 3\n", + " 3\n", + " 2\n", + " 84\n", + " 113\n", + " 1\n", + " \n", + " \n", + " mean\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2446.100167\n", + " NaN\n", + " NaN\n", + " 2007.636059\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " std\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 520.058757\n", + " NaN\n", + " NaN\n", + " 10.953269\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " min\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1547.000000\n", + " NaN\n", + " NaN\n", + " 1894.000000\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 25%\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1996.000000\n", + " NaN\n", + " NaN\n", + " 2004.000000\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 50%\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2445.000000\n", + " NaN\n", + " NaN\n", + " 2010.000000\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 75%\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2897.000000\n", + " NaN\n", + " NaN\n", + " 2014.000000\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " max\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 3346.000000\n", + " NaN\n", + " NaN\n", + " 2021.000000\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " full_name short_name \\\n", - "count 1752 1752 \n", - "unique 1752 1741 \n", - "top Brassica Information Portal CGD \n", - "freq 1 3 \n", + " id type attributes.created-at \\\n", + "count 1797 1797 1797 \n", + "unique 1797 1 1162 \n", + "top 1723 fairsharing-records 2014-11-04T15:23:40.000Z \n", + "freq 1 1797 636 \n", + "mean NaN NaN NaN \n", + "std NaN NaN NaN \n", + "min NaN NaN NaN \n", + "25% NaN NaN NaN \n", + "50% NaN NaN NaN \n", + "75% NaN NaN NaN \n", + "max NaN NaN NaN \n", "\n", - " fs_url \\\n", - "count 1752 \n", - "unique 1752 \n", - "top https://fairsharing.org/10.25504/FAIRsharing.e... \n", + " attributes.updated-at attributes.metadata.doi \\\n", + "count 1797 1354 \n", + "unique 1797 1354 \n", + "top 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te \n", + "freq 1 1 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " attributes.metadata.name attributes.metadata.status \\\n", + "count 1797 1797 \n", + "unique 1796 4 \n", + "top OmicsDB ready \n", + "freq 2 1540 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " attributes.metadata.contacts \\\n", + "count 1678 \n", + "unique 1576 \n", + "top [{'contact-name': 'Sam Hokin', 'contact-email'... \n", + "freq 6 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.homepage attributes.metadata.identifier \\\n", + "count 1797 1797.000000 \n", + "unique 1797 NaN \n", + "top http://www.cellimagelibrary.org NaN \n", + "freq 1 NaN \n", + "mean NaN 2446.100167 \n", + "std NaN 520.058757 \n", + "min NaN 1547.000000 \n", + "25% NaN 1996.000000 \n", + "50% NaN 2445.000000 \n", + "75% NaN 2897.000000 \n", + "max NaN 3346.000000 \n", + "\n", + " attributes.metadata.description \\\n", + "count 1797 \n", + "unique 1797 \n", + "top This library is a public and easily accessible... \n", "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", "\n", - " url countries subjects \n", - "count 1752 1749 1690 \n", - "unique 1752 178 834 \n", - "top http://web.iodp.tamu.edu/LORE/ United States Life Science \n", - "freq 1 588 367 " + " attributes.metadata.support-links \\\n", + "count 1608 \n", + "unique 1594 \n", + "top [{'url': 'https://github.com/gbif/ipt/wiki/IPT... \n", + "freq 6 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.year-creation \\\n", + "count 1492.000000 \n", + "unique NaN \n", + "top NaN \n", + "freq NaN \n", + "mean 2007.636059 \n", + "std 10.953269 \n", + "min 1894.000000 \n", + "25% 2004.000000 \n", + "50% 2010.000000 \n", + "75% 2014.000000 \n", + "max 2021.000000 \n", + "\n", + " attributes.metadata.data-processes \\\n", + "count 1565 \n", + "unique 1563 \n", + "top [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... \n", + "freq 2 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.legacy-ids attributes.fairsharing-registry \\\n", + "count 1797 1797 \n", + "unique 1797 1 \n", + "top [biodbcore-000180, bsg-d000180] Database \n", + "freq 1 1797 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " attributes.record-type attributes.subjects attributes.domains \\\n", + "count 1797 1797 1797 \n", + "unique 3 888 1163 \n", + "top repository [Life Science] [] \n", + "freq 926 350 265 \n", + "mean NaN NaN NaN \n", + "std NaN NaN NaN \n", + "min NaN NaN NaN \n", + "25% NaN NaN NaN \n", + "50% NaN NaN NaN \n", + "75% NaN NaN NaN \n", + "max NaN NaN NaN \n", + "\n", + " attributes.taxonomies attributes.user-defined-tags \\\n", + "count 1797 1797 \n", + "unique 378 384 \n", + "top [All] [] \n", + "freq 502 1193 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " attributes.countries attributes.name \\\n", + "count 1797 1797 \n", + "unique 185 1796 \n", + "top [United States] FAIRsharing record for: OmicsDB \n", + "freq 594 2 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " attributes.abbreviation \\\n", + "count 1638 \n", + "unique 1626 \n", + "top CGD \n", + "freq 3 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.url \\\n", + "count 1797 \n", + "unique 1797 \n", + "top https://fairsharing.org/10.25504/FAIRsharing.8... \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.doi \\\n", + "count 1354 \n", + "unique 1354 \n", + "top 10.25504/FAIRsharing.8t18te \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.fairsharing-licence \\\n", + "count 1797 \n", + "unique 1 \n", + "top https://creativecommons.org/licenses/by-sa/4.0... \n", + "freq 1797 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.description \\\n", + "count 1797 \n", + "unique 1797 \n", + "top This FAIRsharing record describes: This librar... \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.publications attributes.licence-links \\\n", + "count 1797 1797 \n", + "unique 1109 1082 \n", + "top [] [] \n", + "freq 661 716 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " attributes.metadata.citations \\\n", + "count 326 \n", + "unique 320 \n", + "top [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... \n", + "freq 6 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.abbreviation \\\n", + "count 1638 \n", + "unique 1626 \n", + "top CGD \n", + "freq 3 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.access-points \\\n", + "count 449 \n", + "unique 444 \n", + "top [{'url': 'https://github.com/Ensembl', 'name':... \n", + "freq 3 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.associated-tools \\\n", + "count 618 \n", + "unique 615 \n", + "top [{'url': 'http://www.h-invitational.jp/hinv/bl... \n", + "freq 2 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.deprecation-date \\\n", + "count 217 \n", + "unique 55 \n", + "top 2021-9-17 \n", + "freq 84 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.deprecation-reason \\\n", + "count 217 \n", + "unique 86 \n", + "top This resource is no longer available at the st... \n", + "freq 113 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " attributes.metadata.tombstone \n", + "count 1 \n", + "unique 1 \n", + "top True \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -267,22 +1374,53 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "full_name 0\n", - "short_name 0\n", - "fs_url 0\n", - "url 0\n", - "countries 3\n", - "subjects 62\n", + "id 0\n", + "type 0\n", + "attributes.created-at 0\n", + "attributes.updated-at 0\n", + "attributes.metadata.doi 443\n", + "attributes.metadata.name 0\n", + "attributes.metadata.status 0\n", + "attributes.metadata.contacts 119\n", + "attributes.metadata.homepage 0\n", + "attributes.metadata.identifier 0\n", + "attributes.metadata.description 0\n", + "attributes.metadata.support-links 189\n", + "attributes.metadata.year-creation 305\n", + "attributes.metadata.data-processes 232\n", + "attributes.legacy-ids 0\n", + "attributes.fairsharing-registry 0\n", + "attributes.record-type 0\n", + "attributes.subjects 0\n", + "attributes.domains 0\n", + "attributes.taxonomies 0\n", + "attributes.user-defined-tags 0\n", + "attributes.countries 0\n", + "attributes.name 0\n", + "attributes.abbreviation 159\n", + "attributes.url 0\n", + "attributes.doi 443\n", + "attributes.fairsharing-licence 0\n", + "attributes.description 0\n", + "attributes.publications 0\n", + "attributes.licence-links 0\n", + "attributes.metadata.citations 1471\n", + "attributes.metadata.abbreviation 159\n", + "attributes.metadata.access-points 1348\n", + "attributes.metadata.associated-tools 1179\n", + "attributes.metadata.deprecation-date 1580\n", + "attributes.metadata.deprecation-reason 1580\n", + "attributes.metadata.tombstone 1796\n", "dtype: int64" ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -291,6 +1429,30 @@ "fairsharing_df.isna().sum()" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "attributes.record-type\n", + "knowledgebase 774\n", + "knowledgebase_and_repository 97\n", + "repository 926\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(fairsharing_df['attributes.record-type']).groupby('attributes.record-type').size()" + ] + }, { "cell_type": "code", "execution_count": null,