diff --git a/notebooks/01.1-exploration-re3data.ipynb b/notebooks/01.1-exploration-re3data.ipynb
index 6c14da2..820ed23 100644
--- a/notebooks/01.1-exploration-re3data.ipynb
+++ b/notebooks/01.1-exploration-re3data.ipynb
@@ -1,20 +1,5 @@
{
"cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Information to check\n",
- "- names\n",
- "- description\n",
- "- url\n",
- "- subjects & keywords\n",
- "- content type\n",
- "- repo type\n",
- "- policies\n",
- "\n"
- ]
- },
{
"cell_type": "code",
"execution_count": 1,
@@ -69,327 +54,353 @@
" \n",
" \n",
" | \n",
- " openaire_id | \n",
- " re3data_id | \n",
- " repository_name | \n",
- " additional_name | \n",
- " repository_url | \n",
- " repository_id | \n",
+ " orgIdentifier | \n",
+ " repositoryName | \n",
+ " repositoryName.language | \n",
+ " additionalName | \n",
+ " repositoryURL | \n",
+ " repositoryIdentifier | \n",
+ " repositoryContact | \n",
" description | \n",
+ " description.language | \n",
" type | \n",
" size | \n",
- " update_date | \n",
- " start_date | \n",
- " end_date | \n",
+ " startDate | \n",
+ " endDate | \n",
+ " repositoryLanguage | \n",
" subject | \n",
- " mission_statement | \n",
- " content_type | \n",
- " provider_type | \n",
+ " missionStatementURL | \n",
+ " contentType | \n",
+ " providerType | \n",
" keyword | \n",
" institution | \n",
" policy | \n",
- " database_access | \n",
- " database_license | \n",
- " data_access | \n",
- " data_license | \n",
- " data_upload | \n",
- " data_upload_license | \n",
+ " databaseAccess | \n",
+ " databaseLicense | \n",
+ " dataAccess | \n",
+ " dataLicense | \n",
+ " dataUploadType | \n",
+ " dataUploadLicense | \n",
" software | \n",
" versioning | \n",
" api | \n",
- " pid_system | \n",
- " citation_guideline_url | \n",
- " aid_system | \n",
- " enhanced_publication | \n",
- " quality_management | \n",
+ " pidSystem | \n",
+ " citationGuidelineURL | \n",
+ " aidSystem | \n",
+ " enhancedPublication | \n",
+ " qualityManagement | \n",
" certificate | \n",
- " metadata_standard | \n",
+ " metadataStandard | \n",
" syndication | \n",
" remarks | \n",
- " entry_date | \n",
- " last_update | \n",
+ " entryDate | \n",
+ " lastUpdate | \n",
"
\n",
" \n",
"
\n",
" \n",
" 0 | \n",
- " re3data_____::91780fe96da5ba32f804e43359c154ba | \n",
" r3d100000001 | \n",
" Odum Institute Archive Dataverse | \n",
+ " eng | \n",
" [] | \n",
" https://dataverse.unc.edu/dataverse/odum | \n",
" [] | \n",
+ " [\"https://dataverse.unc.edu/dataverse/odum#\", ... | \n",
" The Odum Institute Archive Dataverse contains ... | \n",
+ " eng | \n",
" [disciplinary] | \n",
- " 13 dataverses; 3.050 datasets | \n",
- " 2020-12-04 | \n",
+ " {\"size\": \"13 dataverses; 3.050 datasets\", \"upd... | \n",
" NaN | \n",
" NaN | \n",
- " [1 Humanities and Social Sciences, 111 Social ... | \n",
- " false | \n",
- " [Databases, Plain text, Scientific and statist... | \n",
+ " [\"eng\"] | \n",
+ " [{'name': '1 Humanities and Social Sciences', ... | \n",
+ " NaN | \n",
+ " [{'name': 'Databases', 'scheme': 'parse'}, {'n... | \n",
" [dataProvider] | \n",
" [FAIR, Middle East, crime, demography, economy... | \n",
- " [[Odum Institute for Research in Social Scienc... | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
+ " [{'institutionName': 'Odum Institute for Resea... | \n",
+ " [{\"policyName\": \"Collection Development Policy... | \n",
+ " {\"databaseAccessType\": \"open\", \"databaseAcces... | \n",
+ " [{\"databaseLicenseName\": \"CC0\", \"databaseLicen... | \n",
+ " [{\"dataAccessType\": \"embargoed\", \"dataAccessRe... | \n",
+ " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... | \n",
+ " restricted | \n",
+ " [] | \n",
+ " [\"DataVerse\"] | \n",
" NaN | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " {} | \n",
+ " [\"DOI\"] | \n",
+ " NaN | \n",
+ " [] | \n",
" unknown | \n",
" yes | \n",
- " true | \n",
- " true | \n",
- " false | \n",
+ " [\"other\"] | \n",
+ " [{\"metadataStandardName\": \"DDI - Data Document... | \n",
+ " {} | \n",
" Odum Dataverse is covered by Thomson Reuters D... | \n",
" 2013-06-10 | \n",
" 2021-07-06 | \n",
"
\n",
" \n",
" 1 | \n",
- " re3data_____::cc3ea05c863cd49af75f7f54e0e86f09 | \n",
" r3d100000002 | \n",
" Access to Archival Databases | \n",
- " [AAD] | \n",
+ " eng | \n",
+ " [{'additionalName': 'AAD', 'additionalNameLang... | \n",
" https://aad.archives.gov/aad/ | \n",
" [RRID:SCR_010479, RRID:nlx_157752] | \n",
+ " [\"https://www.archives.gov/contact\"] | \n",
" You will find in the Access to Archival Databa... | \n",
+ " eng | \n",
" [disciplinary] | \n",
- " NaN | \n",
- " NaN | \n",
+ " {\"size\": \"\", \"updatedp\": \"\"} | \n",
" 1985 | \n",
" NaN | \n",
- " [1 Humanities and Social Sciences, 102 History... | \n",
- " true | \n",
- " [Images, Standard office documents, Structured... | \n",
+ " [\"eng\", \"spa\"] | \n",
+ " [{'name': '1 Humanities and Social Sciences', ... | \n",
+ " https://www.archives.gov/publications/general-... | \n",
+ " [{'name': 'Images', 'scheme': 'parse'}, {'name... | \n",
" [dataProvider] | \n",
" [US History] | \n",
- " [[The U.S. National Archives and Records Admin... | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
+ " [{'institutionName': 'The U.S. National Archiv... | \n",
+ " [{\"policyName\": \"Contribution Policy\", \"policy... | \n",
+ " {\"databaseAccessType\": \"open\", \"databaseAcces... | \n",
+ " [] | \n",
+ " [{\"dataAccessType\": \"open\", \"dataAccessRestric... | \n",
+ " [{\"dataLicenseName\": \"Copyrights\", \"dataLicens... | \n",
+ " restricted | \n",
+ " [] | \n",
+ " [\"unknown\"] | \n",
" no | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " {\"api\": \"https://www.archives.gov/developer#to... | \n",
+ " [\"none\"] | \n",
+ " https://aad.archives.gov/aad/help/getting-star... | \n",
+ " [] | \n",
" unknown | \n",
" unknown | \n",
- " false | \n",
- " false | \n",
- " true | \n",
+ " [] | \n",
+ " [] | \n",
+ " {\"syndication\": \"http://www.archives.gov/socia... | \n",
" NaN | \n",
" 2012-07-04 | \n",
" 2021-05-25 | \n",
"
\n",
" \n",
" 2 | \n",
- " re3data_____::a2f73fbe91311f4356d0d7957c441773 | \n",
" r3d100000004 | \n",
" Datenbank Gesprochenes Deutsch | \n",
- " [DGD, DGD2 (formerly), Database for Spoken Ger... | \n",
+ " deu | \n",
+ " [{'additionalName': 'DGD', 'additionalNameLang... | \n",
" https://dgd.ids-mannheim.de/ | \n",
" [] | \n",
+ " [\"dgd@ids-mannheim.de\"] | \n",
" The \"Database for Spoken German (DGD)\" is a co... | \n",
+ " eng | \n",
" [disciplinary] | \n",
- " 34 corpora | \n",
- " 2020-02-03 | \n",
+ " {\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"} | \n",
" 2012 | \n",
" NaN | \n",
- " [1 Humanities and Social Sciences, 104 Linguis... | \n",
- " true | \n",
- " [Audiovisual data, Standard office documents, ... | \n",
+ " [\"deu\"] | \n",
+ " [{'name': '1 Humanities and Social Sciences', ... | \n",
+ " https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... | \n",
+ " [{'name': 'Audiovisual data', 'scheme': 'parse... | \n",
" [dataProvider, serviceProvider] | \n",
" [Australian German, FOLK, German dialects, Pfe... | \n",
- " [[Institut für Deutsche Sprache, Archiv für Ge... | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
+ " [{'institutionName': 'Institut für Deutsche Sp... | \n",
+ " [{\"policyName\": \"Erfurter Aufruf zur Sicherung... | \n",
+ " {\"databaseAccessType\": \"restricted\", \"databas... | \n",
+ " [] | \n",
+ " [{\"dataAccessType\": \"restricted\", \"dataAccessR... | \n",
+ " [{\"dataLicenseName\": \"other\", \"dataLicenseURL\"... | \n",
+ " restricted | \n",
+ " [] | \n",
+ " [\"other\"] | \n",
" yes | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " {} | \n",
+ " [\"none\"] | \n",
+ " http://agd.ids-mannheim.de/konditionen.shtml | \n",
+ " [] | \n",
" unknown | \n",
" unknown | \n",
- " true | \n",
- " false | \n",
- " false | \n",
+ " [\"RatSWD\"] | \n",
+ " [] | \n",
+ " {} | \n",
" NaN | \n",
" 2012-07-20 | \n",
" 2020-08-27 | \n",
"
\n",
" \n",
" 3 | \n",
- " re3data_____::0394b97eb11f19785cbca1ec830429da | \n",
" r3d100000005 | \n",
" UNC Dataverse | \n",
- " [University of North Carolina Dataverse] | \n",
+ " eng | \n",
+ " [{'additionalName': 'University of North Carol... | \n",
" https://dataverse.unc.edu/ | \n",
" [] | \n",
+ " [\"https://dataverse.unc.edu/\", \"odumarchive@un... | \n",
" UNC Dataverse is an open-source repository sof... | \n",
+ " eng | \n",
" [institutional] | \n",
- " 186 dataverses; 25.272 studies; 229.442 files | \n",
- " 2020-11-30 | \n",
+ " {\"size\": \"186 dataverses; 25.272 studies; 229.... | \n",
" 2011 | \n",
" NaN | \n",
- " [1 Humanities and Social Sciences, 111 Social ... | \n",
- " true | \n",
- " [Archived data, Plain text, Raw data, Scientif... | \n",
+ " [\"eng\"] | \n",
+ " [{'name': '1 Humanities and Social Sciences', ... | \n",
+ " https://odum.unc.edu/about/mission-vision/ | \n",
+ " [{'name': 'Archived data', 'scheme': 'parse'},... | \n",
" [dataProvider, serviceProvider] | \n",
" [FAIR, census, demographic survey, demography,... | \n",
- " [[Odum Institute for Research in Social Scienc... | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " [{'institutionName': 'Odum Institute for Resea... | \n",
+ " [{\"policyName\": \"Collection Development Policy... | \n",
+ " {\"databaseAccessType\": \"open\", \"databaseAcces... | \n",
+ " [] | \n",
+ " [{\"dataAccessType\": \"open\", \"dataAccessRestric... | \n",
+ " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... | \n",
+ " restricted | \n",
+ " [{\"dataUploadLicenseName\": \"Data Deposit Form\"... | \n",
+ " [\"DataVerse\"] | \n",
" yes | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " {\"api\": \"https://guides.dataverse.org/en/lates... | \n",
+ " [\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"] | \n",
+ " https://dataverse.org/best-practices/data-cita... | \n",
+ " [] | \n",
" unknown | \n",
" yes | \n",
- " false | \n",
- " true | \n",
- " false | \n",
- " The Odum Institute houses one of the oldest an... | \n",
+ " [] | \n",
+ " [{\"metadataStandardName\": \"DDI - Data Document... | \n",
+ " {} | \n",
+ " UNC Dataverse is covered by Clarivate Data Cit... | \n",
" 2012-07-23 | \n",
- " 2020-11-30 | \n",
+ " 2021-08-11 | \n",
"
\n",
" \n",
" 4 | \n",
- " re3data_____::a48f09c562b247a9919acfe195549b47 | \n",
" r3d100000006 | \n",
" Archaeology Data Service | \n",
- " [ADS] | \n",
+ " eng | \n",
+ " [{'additionalName': 'ADS', 'additionalNameLang... | \n",
" https://archaeologydataservice.ac.uk/ | \n",
" [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] | \n",
+ " [\"help@archaeologydataservice.ac.uk\", \"https:/... | \n",
" The ADS is an accredited digital repository fo... | \n",
+ " eng | \n",
" [disciplinary] | \n",
- " 1837 results | \n",
- " 2020-05-20 | \n",
+ " {\"size\": \"1837 results\", \"updatedp\": \"2020-05-... | \n",
" 1996-10-01 | \n",
" NaN | \n",
- " [1 Humanities and Social Sciences, 101 Ancient... | \n",
- " true | \n",
- " [Archived data, Audiovisual data, Databases, I... | \n",
+ " [\"eng\"] | \n",
+ " [{'name': '1 Humanities and Social Sciences', ... | \n",
+ " https://archaeologydataservice.ac.uk/about/our... | \n",
+ " [{'name': 'Archived data', 'scheme': 'parse'},... | \n",
" [dataProvider, serviceProvider] | \n",
" [FAIR, archaeology, cultural heritage, prehist... | \n",
- " [[Arts and Humanities Research Council, [AHRC]... | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " [{'institutionName': 'Arts and Humanities Rese... | \n",
+ " [{\"policyName\": \"ADS Guides to good practice\",... | \n",
+ " {\"databaseAccessType\": \"open\", \"databaseAcces... | \n",
+ " [{\"databaseLicenseName\": \"CC\", \"databaseLicens... | \n",
+ " [{\"dataAccessType\": \"open\", \"dataAccessRestric... | \n",
+ " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... | \n",
+ " restricted | \n",
+ " [{\"dataUploadLicenseName\": \"Guidelines for Dep... | \n",
+ " [\"other\"] | \n",
" yes | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " {\"api\": \"https://archaeologydataservice.ac.uk/... | \n",
+ " [\"DOI\"] | \n",
+ " https://archaeologydataservice.ac.uk/advice/te... | \n",
+ " [] | \n",
" unknown | \n",
" yes | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " [\"other\"] | \n",
+ " [{\"metadataStandardName\": \"DataCite Metadata S... | \n",
+ " {\"syndication\": \"https://archaeologydataservic... | \n",
" ADS is covered by Clarivate Data Citation Inde... | \n",
" 2012-07-23 | \n",
- " 2021-06-11 | \n",
+ " 2021-09-02 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " openaire_id re3data_id \\\n",
- "0 re3data_____::91780fe96da5ba32f804e43359c154ba r3d100000001 \n",
- "1 re3data_____::cc3ea05c863cd49af75f7f54e0e86f09 r3d100000002 \n",
- "2 re3data_____::a2f73fbe91311f4356d0d7957c441773 r3d100000004 \n",
- "3 re3data_____::0394b97eb11f19785cbca1ec830429da r3d100000005 \n",
- "4 re3data_____::a48f09c562b247a9919acfe195549b47 r3d100000006 \n",
+ " orgIdentifier repositoryName repositoryName.language \\\n",
+ "0 r3d100000001 Odum Institute Archive Dataverse eng \n",
+ "1 r3d100000002 Access to Archival Databases eng \n",
+ "2 r3d100000004 Datenbank Gesprochenes Deutsch deu \n",
+ "3 r3d100000005 UNC Dataverse eng \n",
+ "4 r3d100000006 Archaeology Data Service eng \n",
"\n",
- " repository_name \\\n",
- "0 Odum Institute Archive Dataverse \n",
- "1 Access to Archival Databases \n",
- "2 Datenbank Gesprochenes Deutsch \n",
- "3 UNC Dataverse \n",
- "4 Archaeology Data Service \n",
- "\n",
- " additional_name \\\n",
+ " additionalName \\\n",
"0 [] \n",
- "1 [AAD] \n",
- "2 [DGD, DGD2 (formerly), Database for Spoken Ger... \n",
- "3 [University of North Carolina Dataverse] \n",
- "4 [ADS] \n",
+ "1 [{'additionalName': 'AAD', 'additionalNameLang... \n",
+ "2 [{'additionalName': 'DGD', 'additionalNameLang... \n",
+ "3 [{'additionalName': 'University of North Carol... \n",
+ "4 [{'additionalName': 'ADS', 'additionalNameLang... \n",
"\n",
- " repository_url \\\n",
+ " repositoryURL \\\n",
"0 https://dataverse.unc.edu/dataverse/odum \n",
"1 https://aad.archives.gov/aad/ \n",
"2 https://dgd.ids-mannheim.de/ \n",
"3 https://dataverse.unc.edu/ \n",
"4 https://archaeologydataservice.ac.uk/ \n",
"\n",
- " repository_id \\\n",
+ " repositoryIdentifier \\\n",
"0 [] \n",
"1 [RRID:SCR_010479, RRID:nlx_157752] \n",
"2 [] \n",
"3 [] \n",
"4 [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] \n",
"\n",
- " description type \\\n",
- "0 The Odum Institute Archive Dataverse contains ... [disciplinary] \n",
- "1 You will find in the Access to Archival Databa... [disciplinary] \n",
- "2 The \"Database for Spoken German (DGD)\" is a co... [disciplinary] \n",
- "3 UNC Dataverse is an open-source repository sof... [institutional] \n",
- "4 The ADS is an accredited digital repository fo... [disciplinary] \n",
+ " repositoryContact \\\n",
+ "0 [\"https://dataverse.unc.edu/dataverse/odum#\", ... \n",
+ "1 [\"https://www.archives.gov/contact\"] \n",
+ "2 [\"dgd@ids-mannheim.de\"] \n",
+ "3 [\"https://dataverse.unc.edu/\", \"odumarchive@un... \n",
+ "4 [\"help@archaeologydataservice.ac.uk\", \"https:/... \n",
"\n",
- " size update_date start_date \\\n",
- "0 13 dataverses; 3.050 datasets 2020-12-04 NaN \n",
- "1 NaN NaN 1985 \n",
- "2 34 corpora 2020-02-03 2012 \n",
- "3 186 dataverses; 25.272 studies; 229.442 files 2020-11-30 2011 \n",
- "4 1837 results 2020-05-20 1996-10-01 \n",
+ " description description.language \\\n",
+ "0 The Odum Institute Archive Dataverse contains ... eng \n",
+ "1 You will find in the Access to Archival Databa... eng \n",
+ "2 The \"Database for Spoken German (DGD)\" is a co... eng \n",
+ "3 UNC Dataverse is an open-source repository sof... eng \n",
+ "4 The ADS is an accredited digital repository fo... eng \n",
"\n",
- " end_date subject \\\n",
- "0 NaN [1 Humanities and Social Sciences, 111 Social ... \n",
- "1 NaN [1 Humanities and Social Sciences, 102 History... \n",
- "2 NaN [1 Humanities and Social Sciences, 104 Linguis... \n",
- "3 NaN [1 Humanities and Social Sciences, 111 Social ... \n",
- "4 NaN [1 Humanities and Social Sciences, 101 Ancient... \n",
+ " type size \\\n",
+ "0 [disciplinary] {\"size\": \"13 dataverses; 3.050 datasets\", \"upd... \n",
+ "1 [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n",
+ "2 [disciplinary] {\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"} \n",
+ "3 [institutional] {\"size\": \"186 dataverses; 25.272 studies; 229.... \n",
+ "4 [disciplinary] {\"size\": \"1837 results\", \"updatedp\": \"2020-05-... \n",
"\n",
- " mission_statement content_type \\\n",
- "0 false [Databases, Plain text, Scientific and statist... \n",
- "1 true [Images, Standard office documents, Structured... \n",
- "2 true [Audiovisual data, Standard office documents, ... \n",
- "3 true [Archived data, Plain text, Raw data, Scientif... \n",
- "4 true [Archived data, Audiovisual data, Databases, I... \n",
+ " startDate endDate repositoryLanguage \\\n",
+ "0 NaN NaN [\"eng\"] \n",
+ "1 1985 NaN [\"eng\", \"spa\"] \n",
+ "2 2012 NaN [\"deu\"] \n",
+ "3 2011 NaN [\"eng\"] \n",
+ "4 1996-10-01 NaN [\"eng\"] \n",
"\n",
- " provider_type \\\n",
+ " subject \\\n",
+ "0 [{'name': '1 Humanities and Social Sciences', ... \n",
+ "1 [{'name': '1 Humanities and Social Sciences', ... \n",
+ "2 [{'name': '1 Humanities and Social Sciences', ... \n",
+ "3 [{'name': '1 Humanities and Social Sciences', ... \n",
+ "4 [{'name': '1 Humanities and Social Sciences', ... \n",
+ "\n",
+ " missionStatementURL \\\n",
+ "0 NaN \n",
+ "1 https://www.archives.gov/publications/general-... \n",
+ "2 https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... \n",
+ "3 https://odum.unc.edu/about/mission-vision/ \n",
+ "4 https://archaeologydataservice.ac.uk/about/our... \n",
+ "\n",
+ " contentType \\\n",
+ "0 [{'name': 'Databases', 'scheme': 'parse'}, {'n... \n",
+ "1 [{'name': 'Images', 'scheme': 'parse'}, {'name... \n",
+ "2 [{'name': 'Audiovisual data', 'scheme': 'parse... \n",
+ "3 [{'name': 'Archived data', 'scheme': 'parse'},... \n",
+ "4 [{'name': 'Archived data', 'scheme': 'parse'},... \n",
+ "\n",
+ " providerType \\\n",
"0 [dataProvider] \n",
"1 [dataProvider] \n",
"2 [dataProvider, serviceProvider] \n",
@@ -403,47 +414,103 @@
"3 [FAIR, census, demographic survey, demography,... \n",
"4 [FAIR, archaeology, cultural heritage, prehist... \n",
"\n",
- " institution policy database_access \\\n",
- "0 [[Odum Institute for Research in Social Scienc... true true \n",
- "1 [[The U.S. National Archives and Records Admin... true true \n",
- "2 [[Institut für Deutsche Sprache, Archiv für Ge... true true \n",
- "3 [[Odum Institute for Research in Social Scienc... true true \n",
- "4 [[Arts and Humanities Research Council, [AHRC]... true true \n",
+ " institution \\\n",
+ "0 [{'institutionName': 'Odum Institute for Resea... \n",
+ "1 [{'institutionName': 'The U.S. National Archiv... \n",
+ "2 [{'institutionName': 'Institut für Deutsche Sp... \n",
+ "3 [{'institutionName': 'Odum Institute for Resea... \n",
+ "4 [{'institutionName': 'Arts and Humanities Rese... \n",
"\n",
- " database_license data_access data_license data_upload data_upload_license \\\n",
- "0 true true true true false \n",
- "1 false true true true false \n",
- "2 false true true true false \n",
- "3 false true true true true \n",
- "4 true true true true true \n",
+ " policy \\\n",
+ "0 [{\"policyName\": \"Collection Development Policy... \n",
+ "1 [{\"policyName\": \"Contribution Policy\", \"policy... \n",
+ "2 [{\"policyName\": \"Erfurter Aufruf zur Sicherung... \n",
+ "3 [{\"policyName\": \"Collection Development Policy... \n",
+ "4 [{\"policyName\": \"ADS Guides to good practice\",... \n",
"\n",
- " software versioning api pid_system citation_guideline_url aid_system \\\n",
- "0 true NaN false true true true \n",
- "1 true no true true true true \n",
- "2 true yes false true true true \n",
- "3 true yes true true true true \n",
- "4 true yes true true true true \n",
+ " databaseAccess \\\n",
+ "0 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
+ "1 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
+ "2 {\"databaseAccessType\": \"restricted\", \"databas... \n",
+ "3 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
+ "4 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
"\n",
- " enhanced_publication quality_management certificate metadata_standard \\\n",
- "0 unknown yes true true \n",
- "1 unknown unknown false false \n",
- "2 unknown unknown true false \n",
- "3 unknown yes false true \n",
- "4 unknown yes true true \n",
+ " databaseLicense \\\n",
+ "0 [{\"databaseLicenseName\": \"CC0\", \"databaseLicen... \n",
+ "1 [] \n",
+ "2 [] \n",
+ "3 [] \n",
+ "4 [{\"databaseLicenseName\": \"CC\", \"databaseLicens... \n",
"\n",
- " syndication remarks entry_date \\\n",
- "0 false Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 \n",
- "1 true NaN 2012-07-04 \n",
- "2 false NaN 2012-07-20 \n",
- "3 false The Odum Institute houses one of the oldest an... 2012-07-23 \n",
- "4 true ADS is covered by Clarivate Data Citation Inde... 2012-07-23 \n",
+ " dataAccess \\\n",
+ "0 [{\"dataAccessType\": \"embargoed\", \"dataAccessRe... \n",
+ "1 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
+ "2 [{\"dataAccessType\": \"restricted\", \"dataAccessR... \n",
+ "3 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
+ "4 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
"\n",
- " last_update \n",
- "0 2021-07-06 \n",
- "1 2021-05-25 \n",
- "2 2020-08-27 \n",
- "3 2020-11-30 \n",
- "4 2021-06-11 "
+ " dataLicense dataUploadType \\\n",
+ "0 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
+ "1 [{\"dataLicenseName\": \"Copyrights\", \"dataLicens... restricted \n",
+ "2 [{\"dataLicenseName\": \"other\", \"dataLicenseURL\"... restricted \n",
+ "3 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
+ "4 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
+ "\n",
+ " dataUploadLicense software \\\n",
+ "0 [] [\"DataVerse\"] \n",
+ "1 [] [\"unknown\"] \n",
+ "2 [] [\"other\"] \n",
+ "3 [{\"dataUploadLicenseName\": \"Data Deposit Form\"... [\"DataVerse\"] \n",
+ "4 [{\"dataUploadLicenseName\": \"Guidelines for Dep... [\"other\"] \n",
+ "\n",
+ " versioning api \\\n",
+ "0 NaN {} \n",
+ "1 no {\"api\": \"https://www.archives.gov/developer#to... \n",
+ "2 yes {} \n",
+ "3 yes {\"api\": \"https://guides.dataverse.org/en/lates... \n",
+ "4 yes {\"api\": \"https://archaeologydataservice.ac.uk/... \n",
+ "\n",
+ " pidSystem \\\n",
+ "0 [\"DOI\"] \n",
+ "1 [\"none\"] \n",
+ "2 [\"none\"] \n",
+ "3 [\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"] \n",
+ "4 [\"DOI\"] \n",
+ "\n",
+ " citationGuidelineURL aidSystem \\\n",
+ "0 NaN [] \n",
+ "1 https://aad.archives.gov/aad/help/getting-star... [] \n",
+ "2 http://agd.ids-mannheim.de/konditionen.shtml [] \n",
+ "3 https://dataverse.org/best-practices/data-cita... [] \n",
+ "4 https://archaeologydataservice.ac.uk/advice/te... [] \n",
+ "\n",
+ " enhancedPublication qualityManagement certificate \\\n",
+ "0 unknown yes [\"other\"] \n",
+ "1 unknown unknown [] \n",
+ "2 unknown unknown [\"RatSWD\"] \n",
+ "3 unknown yes [] \n",
+ "4 unknown yes [\"other\"] \n",
+ "\n",
+ " metadataStandard \\\n",
+ "0 [{\"metadataStandardName\": \"DDI - Data Document... \n",
+ "1 [] \n",
+ "2 [] \n",
+ "3 [{\"metadataStandardName\": \"DDI - Data Document... \n",
+ "4 [{\"metadataStandardName\": \"DataCite Metadata S... \n",
+ "\n",
+ " syndication \\\n",
+ "0 {} \n",
+ "1 {\"syndication\": \"http://www.archives.gov/socia... \n",
+ "2 {} \n",
+ "3 {} \n",
+ "4 {\"syndication\": \"https://archaeologydataservic... \n",
+ "\n",
+ " remarks entryDate lastUpdate \n",
+ "0 Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06 \n",
+ "1 NaN 2012-07-04 2021-05-25 \n",
+ "2 NaN 2012-07-20 2020-08-27 \n",
+ "3 UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-08-11 \n",
+ "4 ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02 "
]
},
"execution_count": 2,
@@ -452,16 +519,17 @@
}
],
"source": [
- "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n",
+ "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t',\n",
" converters={'subject': ast.literal_eval,\n",
" 'keyword': ast.literal_eval,\n",
- " 'additional_name': ast.literal_eval,\n",
- " 'repository_id': ast.literal_eval,\n",
+ " 'additionalName': ast.literal_eval,\n",
+ " 'repositoryIdentifier': ast.literal_eval,\n",
" 'type': ast.literal_eval,\n",
- " 'content_type': ast.literal_eval,\n",
- " 'provider_type': ast.literal_eval,\n",
+ " 'contentType': ast.literal_eval,\n",
+ " 'providerType': ast.literal_eval,\n",
" 'institution': ast.literal_eval\n",
" })\n",
+ "\n",
"re3data_df.head()"
]
},
@@ -473,16 +541,17 @@
{
"data": {
"text/plain": [
- "Index(['openaire_id', 're3data_id', 'repository_name', 'additional_name',\n",
- " 'repository_url', 'repository_id', 'description', 'type', 'size',\n",
- " 'update_date', 'start_date', 'end_date', 'subject', 'mission_statement',\n",
- " 'content_type', 'provider_type', 'keyword', 'institution', 'policy',\n",
- " 'database_access', 'database_license', 'data_access', 'data_license',\n",
- " 'data_upload', 'data_upload_license', 'software', 'versioning', 'api',\n",
- " 'pid_system', 'citation_guideline_url', 'aid_system',\n",
- " 'enhanced_publication', 'quality_management', 'certificate',\n",
- " 'metadata_standard', 'syndication', 'remarks', 'entry_date',\n",
- " 'last_update'],\n",
+ "Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',\n",
+ " 'additionalName', 'repositoryURL', 'repositoryIdentifier',\n",
+ " 'repositoryContact', 'description', 'description.language', 'type',\n",
+ " 'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',\n",
+ " 'missionStatementURL', 'contentType', 'providerType', 'keyword',\n",
+ " 'institution', 'policy', 'databaseAccess', 'databaseLicense',\n",
+ " 'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',\n",
+ " 'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',\n",
+ " 'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',\n",
+ " 'metadataStandard', 'syndication', 'remarks', 'entryDate',\n",
+ " 'lastUpdate'],\n",
" dtype='object')"
]
},
@@ -536,298 +605,338 @@
" \n",
" \n",
" | \n",
- " openaire_id | \n",
- " re3data_id | \n",
- " repository_name | \n",
- " additional_name | \n",
- " repository_url | \n",
- " repository_id | \n",
+ " orgIdentifier | \n",
+ " repositoryName | \n",
+ " repositoryName.language | \n",
+ " additionalName | \n",
+ " repositoryURL | \n",
+ " repositoryIdentifier | \n",
+ " repositoryContact | \n",
" description | \n",
+ " description.language | \n",
" type | \n",
" size | \n",
- " update_date | \n",
- " start_date | \n",
- " end_date | \n",
+ " startDate | \n",
+ " endDate | \n",
+ " repositoryLanguage | \n",
" subject | \n",
- " mission_statement | \n",
- " content_type | \n",
- " provider_type | \n",
+ " missionStatementURL | \n",
+ " contentType | \n",
+ " providerType | \n",
" keyword | \n",
" institution | \n",
" policy | \n",
- " database_access | \n",
- " database_license | \n",
- " data_access | \n",
- " data_license | \n",
- " data_upload | \n",
- " data_upload_license | \n",
+ " databaseAccess | \n",
+ " databaseLicense | \n",
+ " dataAccess | \n",
+ " dataLicense | \n",
+ " dataUploadType | \n",
+ " dataUploadLicense | \n",
" software | \n",
" versioning | \n",
" api | \n",
- " pid_system | \n",
- " citation_guideline_url | \n",
- " aid_system | \n",
- " enhanced_publication | \n",
- " quality_management | \n",
+ " pidSystem | \n",
+ " citationGuidelineURL | \n",
+ " aidSystem | \n",
+ " enhancedPublication | \n",
+ " qualityManagement | \n",
" certificate | \n",
- " metadata_standard | \n",
+ " metadataStandard | \n",
" syndication | \n",
" remarks | \n",
- " entry_date | \n",
- " last_update | \n",
+ " entryDate | \n",
+ " lastUpdate | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2137 | \n",
- " 2686 | \n",
- " 829 | \n",
- " 2707 | \n",
- " 2677 | \n",
- " 1260 | \n",
- " 1248 | \n",
- " 1762 | \n",
- " 146 | \n",
- " 2685 | \n",
- " 2707 | \n",
- " 2700 | \n",
- " 2699 | \n",
- " 2699 | \n",
- " 2706 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 1292 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2704 | \n",
- " 2705 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 1637 | \n",
- " 2707 | \n",
- " 2707 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2170 | \n",
+ " 2716 | \n",
+ " 863 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2710 | \n",
+ " 2739 | \n",
+ " 1776 | \n",
+ " 157 | \n",
+ " 2739 | \n",
+ " 2720 | \n",
+ " 2318 | \n",
+ " 2732 | \n",
+ " 2735 | \n",
+ " 2732 | \n",
+ " 2738 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2711 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 1316 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 1512 | \n",
+ " 2739 | \n",
+ " 2737 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
+ " 1674 | \n",
+ " 2739 | \n",
+ " 2739 | \n",
"
\n",
" \n",
" unique | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2704 | \n",
- " 2128 | \n",
- " 2683 | \n",
- " 828 | \n",
- " 2705 | \n",
+ " 2739 | \n",
+ " 2736 | \n",
+ " 19 | \n",
+ " 2161 | \n",
+ " 2713 | \n",
+ " 863 | \n",
+ " 2459 | \n",
+ " 2737 | \n",
+ " 6 | \n",
" 8 | \n",
- " 1233 | \n",
- " 687 | \n",
- " 351 | \n",
- " 79 | \n",
- " 1367 | \n",
- " 2 | \n",
- " 1323 | \n",
+ " 1289 | \n",
+ " 352 | \n",
+ " 80 | \n",
+ " 107 | \n",
+ " 1388 | \n",
+ " 2249 | \n",
+ " 1337 | \n",
" 4 | \n",
- " 2474 | \n",
- " 2685 | \n",
+ " 2503 | \n",
+ " 2719 | \n",
+ " 2319 | \n",
+ " 12 | \n",
+ " 375 | \n",
+ " 145 | \n",
+ " 2263 | \n",
+ " 3 | \n",
+ " 681 | \n",
+ " 23 | \n",
" 2 | \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1 | \n",
- " 1 | \n",
+ " 1146 | \n",
+ " 29 | \n",
+ " 1321 | \n",
+ " 12 | \n",
" 3 | \n",
" 3 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1632 | \n",
- " 1259 | \n",
- " 814 | \n",
+ " 14 | \n",
+ " 172 | \n",
+ " 563 | \n",
+ " 1656 | \n",
+ " 1275 | \n",
+ " 740 | \n",
"
\n",
" \n",
" top | \n",
- " re3data_____::4cea5a5ea78542232a51190879756661 | \n",
- " r3d100011254 | \n",
- " EarthChem Library | \n",
- " [IRIS] | \n",
- " http://www.jcvi.org/cms/home/ | \n",
- " [doi:10.17171/1-6] | \n",
- " The repository is no longer available. >>>!!!<... | \n",
+ " r3d100000001 | \n",
+ " Språkbanken | \n",
+ " eng | \n",
+ " [{'additionalName': 'MPC', 'additionalNameLang... | \n",
+ " http://icgem.gfz-potsdam.de/home | \n",
+ " [RRID:SCR_010479, RRID:nlx_157752] | \n",
+ " [] | \n",
+ " The National Archives and Records Administrati... | \n",
+ " eng | \n",
" [disciplinary] | \n",
- " 2 datasets | \n",
- " 2019-05-15 | \n",
+ " {\"size\": \"\", \"updatedp\": \"\"} | \n",
" 2008 | \n",
" 2015 | \n",
- " [1 Humanities and Social Sciences, 2 Life Scie... | \n",
- " true | \n",
- " [Standard office documents] | \n",
+ " [\"eng\"] | \n",
+ " [{'name': '1 Humanities and Social Sciences', ... | \n",
+ " https://learn.scholarsportal.info/all-guides/d... | \n",
+ " [{'name': 'Standard office documents', 'scheme... | \n",
" [dataProvider] | \n",
" [multidisciplinary] | \n",
- " [[National Center for Biotechnology Informatio... | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- " false | \n",
- " true | \n",
+ " [{'institutionName': 'National Center for Biot... | \n",
+ " [][] | \n",
+ " {\"databaseAccessType\": \"open\", \"databaseAcces... | \n",
+ " [] | \n",
+ " [{\"dataAccessType\": \"open\", \"dataAccessRestric... | \n",
+ " [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... | \n",
+ " restricted | \n",
+ " [] | \n",
+ " [\"unknown\"] | \n",
" yes | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- " true | \n",
+ " {} | \n",
+ " [\"none\"] | \n",
+ " https://dataverse.org/best-practices/data-cita... | \n",
+ " [] | \n",
" unknown | \n",
" yes | \n",
- " false | \n",
- " false | \n",
- " false | \n",
- " The National Institute of Standards and Techno... | \n",
+ " [] | \n",
+ " [] | \n",
+ " {} | \n",
+ " is covered by Elsevier. | \n",
" 2016-05-10 | \n",
- " 2021-07-02 | \n",
+ " 2021-09-03 | \n",
"
\n",
" \n",
" freq | \n",
" 1 | \n",
+ " 2 | \n",
+ " 2554 | \n",
+ " 2 | \n",
+ " 2 | \n",
" 1 | \n",
+ " 202 | \n",
" 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1713 | \n",
- " 6 | \n",
- " 15 | \n",
+ " 2723 | \n",
+ " 1733 | \n",
+ " 1450 | \n",
" 92 | \n",
" 11 | \n",
- " 222 | \n",
- " 2286 | \n",
+ " 2063 | \n",
+ " 226 | \n",
+ " 14 | \n",
" 30 | \n",
- " 1748 | \n",
- " 190 | \n",
+ " 1771 | \n",
+ " 193 | \n",
" 6 | \n",
- " 2394 | \n",
- " 2707 | \n",
- " 2134 | \n",
- " 2701 | \n",
- " 2693 | \n",
- " 2681 | \n",
- " 1988 | \n",
- " 2227 | \n",
- " 1086 | \n",
- " 1485 | \n",
- " 2448 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 1592 | \n",
- " 1492 | \n",
- " 2481 | \n",
- " 1655 | \n",
- " 2129 | \n",
- " 3 | \n",
+ " 312 | \n",
+ " 2571 | \n",
+ " 2159 | \n",
+ " 1269 | \n",
+ " 64 | \n",
+ " 1793 | \n",
+ " 2013 | \n",
+ " 1226 | \n",
+ " 1108 | \n",
+ " 1498 | \n",
+ " 1361 | \n",
+ " 72 | \n",
+ " 2155 | \n",
+ " 1608 | \n",
+ " 1515 | \n",
+ " 2509 | \n",
+ " 1669 | \n",
+ " 2162 | \n",
+ " 14 | \n",
" 20 | \n",
- " 47 | \n",
+ " 137 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " openaire_id re3data_id \\\n",
- "count 2707 2707 \n",
- "unique 2707 2707 \n",
- "top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n",
- "freq 1 1 \n",
+ " orgIdentifier repositoryName repositoryName.language \\\n",
+ "count 2739 2739 2739 \n",
+ "unique 2739 2736 19 \n",
+ "top r3d100000001 Språkbanken eng \n",
+ "freq 1 2 2554 \n",
"\n",
- " repository_name additional_name repository_url \\\n",
- "count 2707 2137 2686 \n",
- "unique 2704 2128 2683 \n",
- "top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n",
- "freq 2 2 2 \n",
+ " additionalName \\\n",
+ "count 2170 \n",
+ "unique 2161 \n",
+ "top [{'additionalName': 'MPC', 'additionalNameLang... \n",
+ "freq 2 \n",
"\n",
- " repository_id description \\\n",
- "count 829 2707 \n",
- "unique 828 2705 \n",
- "top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n",
- "freq 2 2 \n",
+ " repositoryURL repositoryIdentifier \\\n",
+ "count 2716 863 \n",
+ "unique 2713 863 \n",
+ "top http://icgem.gfz-potsdam.de/home [RRID:SCR_010479, RRID:nlx_157752] \n",
+ "freq 2 1 \n",
"\n",
- " type size update_date start_date end_date \\\n",
- "count 2677 1260 1248 1762 146 \n",
- "unique 8 1233 687 351 79 \n",
- "top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n",
- "freq 1713 6 15 92 11 \n",
+ " repositoryContact description \\\n",
+ "count 2739 2739 \n",
+ "unique 2459 2737 \n",
+ "top [] The National Archives and Records Administrati... \n",
+ "freq 202 2 \n",
"\n",
- " subject mission_statement \\\n",
- "count 2685 2707 \n",
- "unique 1367 2 \n",
- "top [1 Humanities and Social Sciences, 2 Life Scie... true \n",
- "freq 222 2286 \n",
+ " description.language type size \\\n",
+ "count 2739 2710 2739 \n",
+ "unique 6 8 1289 \n",
+ "top eng [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n",
+ "freq 2723 1733 1450 \n",
"\n",
- " content_type provider_type keyword \\\n",
- "count 2700 2699 2699 \n",
- "unique 1323 4 2474 \n",
- "top [Standard office documents] [dataProvider] [multidisciplinary] \n",
- "freq 30 1748 190 \n",
+ " startDate endDate repositoryLanguage \\\n",
+ "count 1776 157 2739 \n",
+ "unique 352 80 107 \n",
+ "top 2008 2015 [\"eng\"] \n",
+ "freq 92 11 2063 \n",
+ "\n",
+ " subject \\\n",
+ "count 2720 \n",
+ "unique 1388 \n",
+ "top [{'name': '1 Humanities and Social Sciences', ... \n",
+ "freq 226 \n",
+ "\n",
+ " missionStatementURL \\\n",
+ "count 2318 \n",
+ "unique 2249 \n",
+ "top https://learn.scholarsportal.info/all-guides/d... \n",
+ "freq 14 \n",
+ "\n",
+ " contentType providerType \\\n",
+ "count 2732 2735 \n",
+ "unique 1337 4 \n",
+ "top [{'name': 'Standard office documents', 'scheme... [dataProvider] \n",
+ "freq 30 1771 \n",
+ "\n",
+ " keyword \\\n",
+ "count 2732 \n",
+ "unique 2503 \n",
+ "top [multidisciplinary] \n",
+ "freq 193 \n",
"\n",
" institution policy \\\n",
- "count 2706 2707 \n",
- "unique 2685 2 \n",
- "top [[National Center for Biotechnology Informatio... true \n",
- "freq 6 2394 \n",
+ "count 2738 2739 \n",
+ "unique 2719 2319 \n",
+ "top [{'institutionName': 'National Center for Biot... [][] \n",
+ "freq 6 312 \n",
"\n",
- " database_access database_license data_access data_license data_upload \\\n",
- "count 2707 2707 2707 2707 2707 \n",
- "unique 1 2 2 2 2 \n",
- "top true false true true true \n",
- "freq 2707 2134 2701 2693 2681 \n",
+ " databaseAccess databaseLicense \\\n",
+ "count 2739 2739 \n",
+ "unique 12 375 \n",
+ "top {\"databaseAccessType\": \"open\", \"databaseAcces... [] \n",
+ "freq 2571 2159 \n",
"\n",
- " data_upload_license software versioning api pid_system \\\n",
- "count 2707 2707 1292 2707 2707 \n",
- "unique 2 2 2 2 2 \n",
- "top false true yes false true \n",
- "freq 1988 2227 1086 1485 2448 \n",
+ " dataAccess \\\n",
+ "count 2739 \n",
+ "unique 145 \n",
+ "top [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
+ "freq 1269 \n",
"\n",
- " citation_guideline_url aid_system enhanced_publication \\\n",
- "count 2707 2707 2704 \n",
- "unique 1 1 3 \n",
- "top true true unknown \n",
- "freq 2707 2707 1592 \n",
+ " dataLicense dataUploadType \\\n",
+ "count 2739 2711 \n",
+ "unique 2263 3 \n",
+ "top [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
+ "freq 64 1793 \n",
"\n",
- " quality_management certificate metadata_standard syndication \\\n",
- "count 2705 2707 2707 2707 \n",
- "unique 3 2 2 2 \n",
- "top yes false false false \n",
- "freq 1492 2481 1655 2129 \n",
+ " dataUploadLicense software versioning api pidSystem \\\n",
+ "count 2739 2739 1316 2739 2739 \n",
+ "unique 681 23 2 1146 29 \n",
+ "top [] [\"unknown\"] yes {} [\"none\"] \n",
+ "freq 2013 1226 1108 1498 1361 \n",
"\n",
- " remarks entry_date \\\n",
- "count 1637 2707 \n",
- "unique 1632 1259 \n",
- "top The National Institute of Standards and Techno... 2016-05-10 \n",
- "freq 3 20 \n",
+ " citationGuidelineURL aidSystem \\\n",
+ "count 1512 2739 \n",
+ "unique 1321 12 \n",
+ "top https://dataverse.org/best-practices/data-cita... [] \n",
+ "freq 72 2155 \n",
"\n",
- " last_update \n",
- "count 2707 \n",
- "unique 814 \n",
- "top 2021-07-02 \n",
- "freq 47 "
+ " enhancedPublication qualityManagement certificate metadataStandard \\\n",
+ "count 2737 2739 2739 2739 \n",
+ "unique 3 3 14 172 \n",
+ "top unknown yes [] [] \n",
+ "freq 1608 1515 2509 1669 \n",
+ "\n",
+ " syndication remarks entryDate lastUpdate \n",
+ "count 2739 1674 2739 2739 \n",
+ "unique 563 1656 1275 740 \n",
+ "top {} is covered by Elsevier. 2016-05-10 2021-09-03 \n",
+ "freq 2162 14 20 137 "
]
},
"execution_count": 5,
@@ -847,45 +956,47 @@
{
"data": {
"text/plain": [
- "openaire_id 0\n",
- "re3data_id 0\n",
- "repository_name 0\n",
- "additional_name 570\n",
- "repository_url 21\n",
- "repository_id 1878\n",
- "description 0\n",
- "type 30\n",
- "size 1447\n",
- "update_date 1459\n",
- "start_date 945\n",
- "end_date 2561\n",
- "subject 22\n",
- "mission_statement 0\n",
- "content_type 7\n",
- "provider_type 8\n",
- "keyword 8\n",
- "institution 1\n",
- "policy 0\n",
- "database_access 0\n",
- "database_license 0\n",
- "data_access 0\n",
- "data_license 0\n",
- "data_upload 0\n",
- "data_upload_license 0\n",
- "software 0\n",
- "versioning 1415\n",
- "api 0\n",
- "pid_system 0\n",
- "citation_guideline_url 0\n",
- "aid_system 0\n",
- "enhanced_publication 3\n",
- "quality_management 2\n",
- "certificate 0\n",
- "metadata_standard 0\n",
- "syndication 0\n",
- "remarks 1070\n",
- "entry_date 0\n",
- "last_update 0\n",
+ "orgIdentifier 0\n",
+ "repositoryName 0\n",
+ "repositoryName.language 0\n",
+ "additionalName 569\n",
+ "repositoryURL 23\n",
+ "repositoryIdentifier 1876\n",
+ "repositoryContact 0\n",
+ "description 0\n",
+ "description.language 0\n",
+ "type 29\n",
+ "size 0\n",
+ "startDate 963\n",
+ "endDate 2582\n",
+ "repositoryLanguage 0\n",
+ "subject 19\n",
+ "missionStatementURL 421\n",
+ "contentType 7\n",
+ "providerType 4\n",
+ "keyword 7\n",
+ "institution 1\n",
+ "policy 0\n",
+ "databaseAccess 0\n",
+ "databaseLicense 0\n",
+ "dataAccess 0\n",
+ "dataLicense 0\n",
+ "dataUploadType 28\n",
+ "dataUploadLicense 0\n",
+ "software 0\n",
+ "versioning 1423\n",
+ "api 0\n",
+ "pidSystem 0\n",
+ "citationGuidelineURL 1227\n",
+ "aidSystem 0\n",
+ "enhancedPublication 2\n",
+ "qualityManagement 0\n",
+ "certificate 0\n",
+ "metadataStandard 0\n",
+ "syndication 0\n",
+ "remarks 1065\n",
+ "entryDate 0\n",
+ "lastUpdate 0\n",
"dtype: int64"
]
},
@@ -906,12 +1017,23 @@
{
"data": {
"text/plain": [
- "array(['Databases', 'Plain text',\n",
- " 'Scientific and statistical data formats',\n",
- " 'Standard office documents', 'other', 'Images', 'Structured text',\n",
- " 'Audiovisual data', 'Archived data', 'Raw data',\n",
- " 'Software applications', 'Source code', 'Structured graphics',\n",
- " 'Configuration data', 'Networkbased data', nan], dtype=object)"
+ "contentType\n",
+ "Archived data 658\n",
+ "Audiovisual data 542\n",
+ "Configuration data 79\n",
+ "Databases 586\n",
+ "Images 1378\n",
+ "Networkbased data 153\n",
+ "Plain text 1158\n",
+ "Raw data 1197\n",
+ "Scientific and statistical data formats 1685\n",
+ "Software applications 456\n",
+ "Source code 209\n",
+ "Standard office documents 1684\n",
+ "Structured graphics 917\n",
+ "Structured text 848\n",
+ "other 962\n",
+ "dtype: int64"
]
},
"execution_count": 7,
@@ -920,7 +1042,8 @@
}
],
"source": [
- "re3data_df.content_type.explode().unique()"
+ "types = re3data_df.contentType.explode().apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)  # keep only the label of each content-type dict; anything that is not a dict (e.g. a missing value) becomes NaN instead of raising\n",
+ "pd.DataFrame(types).groupby('contentType').size()  # occurrences per content type; groupby leaves out the NaN rows by default"
]
},
{
@@ -931,7 +1054,10 @@
{
"data": {
"text/plain": [
- "array(['dataProvider', 'serviceProvider', nan], dtype=object)"
+ "providerType\n",
+ "dataProvider 2491\n",
+ "serviceProvider 963\n",
+ "dtype: int64"
]
},
"execution_count": 8,
@@ -940,8 +1066,15 @@
}
],
"source": [
- "re3data_df.provider_type.explode().unique()"
+ "re3data_df[['providerType']].explode('providerType').groupby('providerType').size()  # occurrences per provider type; a repository can list more than one"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb
index d7eb22a..394152e 100644
--- a/notebooks/01.2-exploration-opendoar.ipynb
+++ b/notebooks/01.2-exploration-opendoar.ipynb
@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -54,241 +54,283 @@
" \n",
" \n",
" | \n",
- " openaire_id | \n",
- " opendoar_id | \n",
- " repository_name | \n",
- " additional_name | \n",
- " repository_url | \n",
- " description | \n",
- " type | \n",
- " update_date | \n",
- " start_date | \n",
- " subject | \n",
- " content_type | \n",
- " institution | \n",
- " metadata_policy | \n",
- " data_policy | \n",
- " submission_policy | \n",
- " content_policy | \n",
- " software | \n",
- " api | \n",
+ " system_metadata.id | \n",
+ " repository_metadata.name | \n",
+ " repository_metadata.alternativename | \n",
+ " repository_metadata.url | \n",
+ " repository_metadata.description | \n",
+ " repository_metadata.type | \n",
+ " repository_metadata.content_languages | \n",
+ " system_metadata.date_modified | \n",
+ " system_metadata.date_created | \n",
+ " repository_metadata.content_subjects | \n",
+ " repository_metadata.content_types | \n",
+ " organization | \n",
+ " policy_urls | \n",
+ " repository_metadata.software | \n",
+ " repository_metadata.oai_url | \n",
+ " system_metadata.publicly_visible | \n",
+ " repository_metadata.repository_status | \n",
+ " repository_metadata.fulltext_record_count | \n",
+ " repository_metadata.metadata_record_count | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
- " opendoar____::38b3eff8baf56627478ec76a704e9b52 | \n",
+ " 175 | \n",
+ " {\"name\": \"hku theses online\", \"language\": \"en\"} | \n",
+ " [] | \n",
+ " http://hub.hku.hk/handle/10722/1057 | \n",
+ " this is an institutional repository providing ... | \n",
+ " institutional | \n",
+ " [\"zh\", \"en\"] | \n",
+ " 2021-03-25 10:16:18 | \n",
+ " 2005-12-21 12:44:08 | \n",
+ " [\"multidisciplinary\"] | \n",
+ " [bibliographic_references, theses_and_disserta... | \n",
+ " [{'name': 'university of hong kong', 'alternat... | \n",
+ " [] | \n",
+ " {\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap... | \n",
+ " NaN | \n",
+ " yes | \n",
+ " fully_functional | \n",
+ " NaN | \n",
+ " 11850.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 64 | \n",
+ " {\"name\": \"research support scheme - central eu... | \n",
+ " [] | \n",
+ " http://rss.archives.ceu.hu/ | \n",
+ " this is an institutional repository collecting... | \n",
+ " institutional | \n",
+ " [\"cs\", \"en\", \"hu\", \"ru\"] | \n",
+ " 2021-03-25 09:48:31 | \n",
+ " 2006-01-04 14:59:30 | \n",
+ " [\"multidisciplinary\"] | \n",
+ " [unpub_reports_and_working_papers] | \n",
+ " [{'name': 'central european university', 'alte... | \n",
+ " [] | \n",
+ " {\"name\": \"eprints\", \"version\": \"2.2.1\"} | \n",
+ " http://rss.archives.ceu.hu/perl/oai2 | \n",
+ " yes | \n",
+ " fully_functional | \n",
+ " NaN | \n",
+ " 164.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 151 | \n",
+ " {\"name\": \"cadmus, eui research repository\", \"l... | \n",
+ " [] | \n",
+ " http://cadmus.eui.eu/ | \n",
+ " cadmus is the name of the eui research reposit... | \n",
+ " institutional | \n",
+ " [\"nl\", \"en\", \"fr\", \"de\", \"it\"] | \n",
+ " 2021-09-13 13:35:36 | \n",
+ " 2006-01-04 12:07:07 | \n",
+ " [\"history and archaeology\", \"multidisciplinary... | \n",
+ " [journal_articles, theses_and_dissertations, u... | \n",
+ " [{'name': 'european university institute', 'al... | \n",
+ " [{\"policy_url\": \"https://www.eui.eu/research/e... | \n",
+ " {\"name\": \"dspace\", \"version\": \"5.2\"} | \n",
+ " http://cadmus.eui.eu/oai/request | \n",
+ " yes | \n",
+ " fully_functional | \n",
+ " 3867.0 | \n",
+ " 24869.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 105 | \n",
+ " {\"name\": \"document server@uhasselt\", \"language... | \n",
+ " [] | \n",
+ " https://doclib.uhasselt.be/dspace/ | \n",
+ " this site is a university repository providing... | \n",
+ " institutional | \n",
+ " [\"nl\", \"en\", \"fr\", \"de\"] | \n",
+ " 2021-04-16 15:23:52 | \n",
+ " 2006-01-24 15:46:44 | \n",
+ " [\"multidisciplinary\"] | \n",
+ " [journal_articles, conference_and_workshop_pap... | \n",
+ " [{'name': 'uhasselt', 'alternativeName': 'hass... | \n",
+ " [] | \n",
+ " {\"name\": \"dspace\", \"version\": \"1.7.2\"} | \n",
+ " http://doclib.uhasselt.be/dspace-oai/request | \n",
+ " yes | \n",
+ " fully_functional | \n",
+ " 0.0 | \n",
+ " 27376.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
" 101 | \n",
- " utrecht university repository | \n",
+ " {\"name\": \"utrecht university repository\", \"lan... | \n",
" [] | \n",
" http://dspace.library.uu.nl | \n",
" this site is a university repository providing... | \n",
" institutional | \n",
+ " [\"nl\", \"en\"] | \n",
" 2021-04-16 15:22:03 | \n",
" 2006-01-13 12:55:13 | \n",
- " [multidisciplinary] | \n",
+ " [\"multidisciplinary\"] | \n",
" [journal_articles, conference_and_workshop_pap... | \n",
- " [[university of utrecht, [universiteit utrecht... | \n",
- " True | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " dspace | \n",
- " true | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " opendoar____::2b44928ae11fb9384c4cf38708677c48 | \n",
- " 115 | \n",
- " dspace at indian institute of management kozhi... | \n",
- " [dspace@iimk] | \n",
- " http://dspace.iimk.ac.in/ | \n",
- " this site is a subject based university reposi... | \n",
- " institutional | \n",
- " 2021-02-18 17:36:43 | \n",
- " 2006-01-04 11:54:34 | \n",
- " [ecology and environment, social sciences gene... | \n",
- " [journal_articles, conference_and_workshop_pap... | \n",
- " [[indian institute of management kozhikode, [i... | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " dspace 4.1 | \n",
- " true | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " opendoar____::3416a75f4cea9109507cacd8e2f2aefc | \n",
- " 41 | \n",
- " caltech engineering and science online | \n",
+ " [{'name': 'university of utrecht', 'alternativ... | \n",
" [] | \n",
- " http://calteches.library.caltech.edu/ | \n",
- " the caltech archives holds approximately 220 c... | \n",
- " institutional | \n",
- " 2021-02-18 17:36:28 | \n",
- " 2006-01-04 14:47:04 | \n",
- " [biology and biochemistry, chemistry and chemi... | \n",
- " [journal_articles, conference_and_workshop_pap... | \n",
- " [[california institute of technology, [caltech... | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " eprints 3.1.3 | \n",
- " true | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " opendoar____::07e1cd7dca89a1678042477183b7ac3f | \n",
- " 119 | \n",
- " dcu online research access service | \n",
- " [doras] | \n",
- " http://doras.dcu.ie/ | \n",
- " this site is an institutional repository provi... | \n",
- " institutional | \n",
- " 2021-02-18 17:36:44 | \n",
- " 2006-01-04 11:15:19 | \n",
- " [multidisciplinary] | \n",
- " [journal_articles, conference_and_workshop_pap... | \n",
- " [[dublin city university, [dcu], ie, [], , htt... | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " eprints 3.0.5 | \n",
- " true | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " opendoar____::d1f491a404d6854880943e5c3cd9ca25 | \n",
- " 129 | \n",
- " earth-prints repository | \n",
- " [] | \n",
- " http://www.earth-prints.org/ | \n",
- " a subject based repository providing open acce... | \n",
- " disciplinary | \n",
- " 2021-04-19 08:28:38 | \n",
- " 2006-01-30 16:43:11 | \n",
- " [earth and planetary sciences] | \n",
- " [journal_articles, conference_and_workshop_pap... | \n",
- " [[istituto nazionale di geofisica e vulcanolog... | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " dspace 5.8.1-snapshot | \n",
- " true | \n",
+ " {\"name\": \"dspace\", \"version\": \"\"} | \n",
+ " https://dspace.library.uu.nl/oai/request | \n",
+ " yes | \n",
+ " fully_functional | \n",
+ " 1686.0 | \n",
+ " 185637.0 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " openaire_id opendoar_id \\\n",
- "0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 \n",
- "1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 \n",
- "2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 \n",
- "3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 \n",
- "4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 \n",
+ " system_metadata.id repository_metadata.name \\\n",
+ "0 175 {\"name\": \"hku theses online\", \"language\": \"en\"} \n",
+ "1 64 {\"name\": \"research support scheme - central eu... \n",
+ "2 151 {\"name\": \"cadmus, eui research repository\", \"l... \n",
+ "3 105 {\"name\": \"document server@uhasselt\", \"language... \n",
+ "4 101 {\"name\": \"utrecht university repository\", \"lan... \n",
"\n",
- " repository_name additional_name \\\n",
- "0 utrecht university repository [] \n",
- "1 dspace at indian institute of management kozhi... [dspace@iimk] \n",
- "2 caltech engineering and science online [] \n",
- "3 dcu online research access service [doras] \n",
- "4 earth-prints repository [] \n",
+ " repository_metadata.alternativename repository_metadata.url \\\n",
+ "0 [] http://hub.hku.hk/handle/10722/1057 \n",
+ "1 [] http://rss.archives.ceu.hu/ \n",
+ "2 [] http://cadmus.eui.eu/ \n",
+ "3 [] https://doclib.uhasselt.be/dspace/ \n",
+ "4 [] http://dspace.library.uu.nl \n",
"\n",
- " repository_url \\\n",
- "0 http://dspace.library.uu.nl \n",
- "1 http://dspace.iimk.ac.in/ \n",
- "2 http://calteches.library.caltech.edu/ \n",
- "3 http://doras.dcu.ie/ \n",
- "4 http://www.earth-prints.org/ \n",
+ " repository_metadata.description repository_metadata.type \\\n",
+ "0 this is an institutional repository providing ... institutional \n",
+ "1 this is an institutional repository collecting... institutional \n",
+ "2 cadmus is the name of the eui research reposit... institutional \n",
+ "3 this site is a university repository providing... institutional \n",
+ "4 this site is a university repository providing... institutional \n",
"\n",
- " description type \\\n",
- "0 this site is a university repository providing... institutional \n",
- "1 this site is a subject based university reposi... institutional \n",
- "2 the caltech archives holds approximately 220 c... institutional \n",
- "3 this site is an institutional repository provi... institutional \n",
- "4 a subject based repository providing open acce... disciplinary \n",
+ " repository_metadata.content_languages system_metadata.date_modified \\\n",
+ "0 [\"zh\", \"en\"] 2021-03-25 10:16:18 \n",
+ "1 [\"cs\", \"en\", \"hu\", \"ru\"] 2021-03-25 09:48:31 \n",
+ "2 [\"nl\", \"en\", \"fr\", \"de\", \"it\"] 2021-09-13 13:35:36 \n",
+ "3 [\"nl\", \"en\", \"fr\", \"de\"] 2021-04-16 15:23:52 \n",
+ "4 [\"nl\", \"en\"] 2021-04-16 15:22:03 \n",
"\n",
- " update_date start_date \\\n",
- "0 2021-04-16 15:22:03 2006-01-13 12:55:13 \n",
- "1 2021-02-18 17:36:43 2006-01-04 11:54:34 \n",
- "2 2021-02-18 17:36:28 2006-01-04 14:47:04 \n",
- "3 2021-02-18 17:36:44 2006-01-04 11:15:19 \n",
- "4 2021-04-19 08:28:38 2006-01-30 16:43:11 \n",
+ " system_metadata.date_created \\\n",
+ "0 2005-12-21 12:44:08 \n",
+ "1 2006-01-04 14:59:30 \n",
+ "2 2006-01-04 12:07:07 \n",
+ "3 2006-01-24 15:46:44 \n",
+ "4 2006-01-13 12:55:13 \n",
"\n",
- " subject \\\n",
- "0 [multidisciplinary] \n",
- "1 [ecology and environment, social sciences gene... \n",
- "2 [biology and biochemistry, chemistry and chemi... \n",
- "3 [multidisciplinary] \n",
- "4 [earth and planetary sciences] \n",
+ " repository_metadata.content_subjects \\\n",
+ "0 [\"multidisciplinary\"] \n",
+ "1 [\"multidisciplinary\"] \n",
+ "2 [\"history and archaeology\", \"multidisciplinary... \n",
+ "3 [\"multidisciplinary\"] \n",
+ "4 [\"multidisciplinary\"] \n",
"\n",
- " content_type \\\n",
- "0 [journal_articles, conference_and_workshop_pap... \n",
- "1 [journal_articles, conference_and_workshop_pap... \n",
- "2 [journal_articles, conference_and_workshop_pap... \n",
+ " repository_metadata.content_types \\\n",
+ "0 [bibliographic_references, theses_and_disserta... \n",
+ "1 [unpub_reports_and_working_papers] \n",
+ "2 [journal_articles, theses_and_dissertations, u... \n",
"3 [journal_articles, conference_and_workshop_pap... \n",
"4 [journal_articles, conference_and_workshop_pap... \n",
"\n",
- " institution metadata_policy \\\n",
- "0 [[university of utrecht, [universiteit utrecht... True \n",
- "1 [[indian institute of management kozhikode, [i... True \n",
- "2 [[california institute of technology, [caltech... True \n",
- "3 [[dublin city university, [dcu], ie, [], , htt... True \n",
- "4 [[istituto nazionale di geofisica e vulcanolog... True \n",
+ " organization \\\n",
+ "0 [{'name': 'university of hong kong', 'alternat... \n",
+ "1 [{'name': 'central european university', 'alte... \n",
+ "2 [{'name': 'european university institute', 'al... \n",
+ "3 [{'name': 'uhasselt', 'alternativeName': 'hass... \n",
+ "4 [{'name': 'university of utrecht', 'alternativ... \n",
"\n",
- " data_policy submission_policy content_policy software \\\n",
- "0 True False True dspace \n",
- "1 True True True dspace 4.1 \n",
- "2 True True True eprints 3.1.3 \n",
- "3 True True True eprints 3.0.5 \n",
- "4 True True True dspace 5.8.1-snapshot \n",
+ " policy_urls \\\n",
+ "0 [] \n",
+ "1 [] \n",
+ "2 [{\"policy_url\": \"https://www.eui.eu/research/e... \n",
+ "3 [] \n",
+ "4 [] \n",
"\n",
- " api \n",
- "0 true \n",
- "1 true \n",
- "2 true \n",
- "3 true \n",
- "4 true "
+ " repository_metadata.software \\\n",
+ "0 {\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap... \n",
+ "1 {\"name\": \"eprints\", \"version\": \"2.2.1\"} \n",
+ "2 {\"name\": \"dspace\", \"version\": \"5.2\"} \n",
+ "3 {\"name\": \"dspace\", \"version\": \"1.7.2\"} \n",
+ "4 {\"name\": \"dspace\", \"version\": \"\"} \n",
+ "\n",
+ " repository_metadata.oai_url \\\n",
+ "0 NaN \n",
+ "1 http://rss.archives.ceu.hu/perl/oai2 \n",
+ "2 http://cadmus.eui.eu/oai/request \n",
+ "3 http://doclib.uhasselt.be/dspace-oai/request \n",
+ "4 https://dspace.library.uu.nl/oai/request \n",
+ "\n",
+ " system_metadata.publicly_visible repository_metadata.repository_status \\\n",
+ "0 yes fully_functional \n",
+ "1 yes fully_functional \n",
+ "2 yes fully_functional \n",
+ "3 yes fully_functional \n",
+ "4 yes fully_functional \n",
+ "\n",
+ " repository_metadata.fulltext_record_count \\\n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 3867.0 \n",
+ "3 0.0 \n",
+ "4 1686.0 \n",
+ "\n",
+ " repository_metadata.metadata_record_count \n",
+ "0 11850.0 \n",
+ "1 164.0 \n",
+ "2 24869.0 \n",
+ "3 27376.0 \n",
+ "4 185637.0 "
]
},
- "execution_count": 24,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n",
- " converters={'subject': ast.literal_eval,\n",
- " 'additional_name': ast.literal_eval,\n",
- " 'opendoar_id': ast.literal_eval,\n",
- " 'content_type': ast.literal_eval,\n",
- " 'institution': ast.literal_eval\n",
- " })\n",
+ " converters={'repository_metadata.content_subjects': ast.literal_eval,  # parse list-valued columns; each key has to match a column name exactly or the converter is never applied\n",
+ " 'repository_metadata.alternativename': ast.literal_eval,\n",
+ " 'repository_metadata.content_types': ast.literal_eval,\n",
+ " 'organization': ast.literal_eval\n",
+ " },\n",
+ " dtype={'system_metadata.id': str})  # read the OpenDOAR id as text, not as a number\n",
+ "\n",
+ "opendoar_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
- " 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
- " 'subject', 'content_type', 'institution', 'metadata_policy',\n",
- " 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
- " 'api'],\n",
+ "Index(['system_metadata.id', 'repository_metadata.name',\n",
+ " 'repository_metadata.alternativename', 'repository_metadata.url',\n",
+ " 'repository_metadata.description', 'repository_metadata.type',\n",
+ " 'repository_metadata.content_languages',\n",
+ " 'system_metadata.date_modified', 'system_metadata.date_created',\n",
+ " 'repository_metadata.content_subjects',\n",
+ " 'repository_metadata.content_types', 'organization', 'policy_urls',\n",
+ " 'repository_metadata.software', 'repository_metadata.oai_url',\n",
+ " 'system_metadata.publicly_visible',\n",
+ " 'repository_metadata.repository_status',\n",
+ " 'repository_metadata.fulltext_record_count',\n",
+ " 'repository_metadata.metadata_record_count'],\n",
" dtype='object')"
]
},
- "execution_count": 25,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -299,7 +341,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -314,7 +356,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -338,115 +380,119 @@
" \n",
" \n",
" | \n",
- " openaire_id | \n",
- " opendoar_id | \n",
- " repository_name | \n",
- " additional_name | \n",
- " repository_url | \n",
- " description | \n",
- " type | \n",
- " update_date | \n",
- " start_date | \n",
- " subject | \n",
- " content_type | \n",
- " institution | \n",
- " metadata_policy | \n",
- " data_policy | \n",
- " submission_policy | \n",
- " content_policy | \n",
- " software | \n",
- " api | \n",
+ " system_metadata.id | \n",
+ " repository_metadata.name | \n",
+ " repository_metadata.alternativename | \n",
+ " repository_metadata.url | \n",
+ " repository_metadata.description | \n",
+ " repository_metadata.type | \n",
+ " repository_metadata.content_languages | \n",
+ " system_metadata.date_modified | \n",
+ " system_metadata.date_created | \n",
+ " repository_metadata.content_subjects | \n",
+ " repository_metadata.content_types | \n",
+ " organization | \n",
+ " policy_urls | \n",
+ " repository_metadata.software | \n",
+ " repository_metadata.oai_url | \n",
+ " system_metadata.publicly_visible | \n",
+ " repository_metadata.repository_status | \n",
+ " repository_metadata.fulltext_record_count | \n",
+ " repository_metadata.metadata_record_count | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
- " 5707 | \n",
- " 5707.000000 | \n",
- " 5707 | \n",
- " 2138 | \n",
- " 5707 | \n",
- " 5425 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5542 | \n",
- " 5563 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 2147 | \n",
+ " 5742 | \n",
+ " 5421 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 5598 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 5742 | \n",
+ " 4402 | \n",
+ " 5742 | \n",
+ " 5595 | \n",
+ " 2.299000e+03 | \n",
+ " 4.197000e+03 | \n",
"
\n",
" \n",
" unique | \n",
- " 5707 | \n",
- " NaN | \n",
- " 5670 | \n",
- " 2096 | \n",
- " 5670 | \n",
- " 4622 | \n",
+ " 5742 | \n",
+ " 5713 | \n",
+ " 2107 | \n",
+ " 5705 | \n",
+ " 4619 | \n",
" 4 | \n",
- " 2501 | \n",
- " 5538 | \n",
- " 819 | \n",
- " 476 | \n",
- " 5098 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
+ " 330 | \n",
+ " 2372 | \n",
+ " 5573 | \n",
+ " 821 | \n",
+ " 477 | \n",
+ " 5201 | \n",
+ " 642 | \n",
" 321 | \n",
- " 2 | \n",
+ " 4370 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" top | \n",
- " opendoar____::3cf166c6b73f030b4f67eeaeba301103 | \n",
- " NaN | \n",
- " hiroshima associated repository portal | \n",
- " [] | \n",
+ " 175 | \n",
+ " {\"name\": \"hiroshima associated repository port... | \n",
+ " [{'acronym': 'aura'}] | \n",
" http://harp.lib.hiroshima-u.ac.jp/ | \n",
" this site provides access to the research outp... | \n",
" institutional | \n",
+ " [\"en\"] | \n",
" 2020-09-18 12:53:48 | \n",
" 2020-09-18 12:53:48 | \n",
- " [multidisciplinary] | \n",
+ " [\"multidisciplinary\"] | \n",
" [theses_and_dissertations] | \n",
- " [[rijksuniversiteit groningen, [rug], nl, [], ... | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " dspace | \n",
- " true | \n",
+ " [{'name': 'rijksuniversiteit groningen', 'alte... | \n",
+ " [] | \n",
+ " {\"name\": \"dspace\", \"version\": \"\"} | \n",
+ " https://kidoks.bsz-bw.de/oai | \n",
+ " yes | \n",
+ " fully_functional | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" freq | \n",
" 1 | \n",
- " NaN | \n",
" 3 | \n",
" 4 | \n",
" 3 | \n",
" 95 | \n",
- " 5067 | \n",
+ " 5096 | \n",
+ " 1917 | \n",
" 82 | \n",
" 82 | \n",
- " 3212 | \n",
- " 460 | \n",
+ " 3227 | \n",
+ " 465 | \n",
" 26 | \n",
- " 4116 | \n",
- " 4101 | \n",
- " 5016 | \n",
- " 4075 | \n",
- " 800 | \n",
- " 4374 | \n",
+ " 5098 | \n",
+ " 822 | \n",
+ " 3 | \n",
+ " 5742 | \n",
+ " 5276 | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" mean | \n",
" NaN | \n",
- " 4008.118801 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -463,11 +509,12 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 5.010186e+03 | \n",
+ " 1.760546e+05 | \n",
"
\n",
" \n",
" std | \n",
" NaN | \n",
- " 2869.948770 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -484,11 +531,12 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 4.206295e+04 | \n",
+ " 6.600825e+06 | \n",
"
\n",
" \n",
" min | \n",
" NaN | \n",
- " 2.000000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -505,11 +553,12 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
"
\n",
" \n",
" 25% | \n",
" NaN | \n",
- " 1823.000000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -526,11 +575,12 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 0.000000e+00 | \n",
+ " 8.950000e+02 | \n",
"
\n",
" \n",
" 50% | \n",
" NaN | \n",
- " 3361.000000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -547,11 +597,12 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 4.220000e+02 | \n",
+ " 4.026000e+03 | \n",
"
\n",
" \n",
" 75% | \n",
" NaN | \n",
- " 5095.000000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -568,11 +619,12 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 2.930500e+03 | \n",
+ " 1.630400e+04 | \n",
"
\n",
" \n",
" max | \n",
" NaN | \n",
- " 10175.000000 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -589,41 +641,43 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " 1.817531e+06 | \n",
+ " 4.200000e+08 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " openaire_id opendoar_id \\\n",
- "count 5707 5707.000000 \n",
- "unique 5707 NaN \n",
- "top opendoar____::3cf166c6b73f030b4f67eeaeba301103 NaN \n",
- "freq 1 NaN \n",
- "mean NaN 4008.118801 \n",
- "std NaN 2869.948770 \n",
- "min NaN 2.000000 \n",
- "25% NaN 1823.000000 \n",
- "50% NaN 3361.000000 \n",
- "75% NaN 5095.000000 \n",
- "max NaN 10175.000000 \n",
+ " system_metadata.id repository_metadata.name \\\n",
+ "count 5742 5742 \n",
+ "unique 5742 5713 \n",
+ "top 175 {\"name\": \"hiroshima associated repository port... \n",
+ "freq 1 3 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
"\n",
- " repository_name additional_name \\\n",
- "count 5707 2138 \n",
- "unique 5670 2096 \n",
- "top hiroshima associated repository portal [] \n",
- "freq 3 4 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
+ " repository_metadata.alternativename \\\n",
+ "count 2147 \n",
+ "unique 2107 \n",
+ "top [{'acronym': 'aura'}] \n",
+ "freq 4 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
"\n",
- " repository_url \\\n",
- "count 5707 \n",
- "unique 5670 \n",
+ " repository_metadata.url \\\n",
+ "count 5742 \n",
+ "unique 5705 \n",
"top http://harp.lib.hiroshima-u.ac.jp/ \n",
"freq 3 \n",
"mean NaN \n",
@@ -634,73 +688,125 @@
"75% NaN \n",
"max NaN \n",
"\n",
- " description type \\\n",
- "count 5425 5707 \n",
- "unique 4622 4 \n",
- "top this site provides access to the research outp... institutional \n",
- "freq 95 5067 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
+ " repository_metadata.description \\\n",
+ "count 5421 \n",
+ "unique 4619 \n",
+ "top this site provides access to the research outp... \n",
+ "freq 95 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
"\n",
- " update_date start_date subject \\\n",
- "count 5707 5707 5542 \n",
- "unique 2501 5538 819 \n",
- "top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
- "freq 82 82 3212 \n",
- "mean NaN NaN NaN \n",
- "std NaN NaN NaN \n",
- "min NaN NaN NaN \n",
- "25% NaN NaN NaN \n",
- "50% NaN NaN NaN \n",
- "75% NaN NaN NaN \n",
- "max NaN NaN NaN \n",
+ " repository_metadata.type repository_metadata.content_languages \\\n",
+ "count 5742 5742 \n",
+ "unique 4 330 \n",
+ "top institutional [\"en\"] \n",
+ "freq 5096 1917 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
"\n",
- " content_type \\\n",
- "count 5563 \n",
- "unique 476 \n",
- "top [theses_and_dissertations] \n",
- "freq 460 \n",
- "mean NaN \n",
- "std NaN \n",
- "min NaN \n",
- "25% NaN \n",
- "50% NaN \n",
- "75% NaN \n",
- "max NaN \n",
+ " system_metadata.date_modified system_metadata.date_created \\\n",
+ "count 5742 5742 \n",
+ "unique 2372 5573 \n",
+ "top 2020-09-18 12:53:48 2020-09-18 12:53:48 \n",
+ "freq 82 82 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
"\n",
- " institution metadata_policy \\\n",
- "count 5707 5707 \n",
- "unique 5098 2 \n",
- "top [[rijksuniversiteit groningen, [rug], nl, [], ... False \n",
- "freq 26 4116 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
+ " repository_metadata.content_subjects repository_metadata.content_types \\\n",
+ "count 5742 5598 \n",
+ "unique 821 477 \n",
+ "top [\"multidisciplinary\"] [theses_and_dissertations] \n",
+ "freq 3227 465 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
"\n",
- " data_policy submission_policy content_policy software api \n",
- "count 5707 5707 5707 5707 5707 \n",
- "unique 2 2 2 321 2 \n",
- "top False False False dspace true \n",
- "freq 4101 5016 4075 800 4374 \n",
- "mean NaN NaN NaN NaN NaN \n",
- "std NaN NaN NaN NaN NaN \n",
- "min NaN NaN NaN NaN NaN \n",
- "25% NaN NaN NaN NaN NaN \n",
- "50% NaN NaN NaN NaN NaN \n",
- "75% NaN NaN NaN NaN NaN \n",
- "max NaN NaN NaN NaN NaN "
+ " organization policy_urls \\\n",
+ "count 5742 5742 \n",
+ "unique 5201 642 \n",
+ "top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n",
+ "freq 26 5098 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " repository_metadata.software repository_metadata.oai_url \\\n",
+ "count 5742 4402 \n",
+ "unique 321 4370 \n",
+ "top {\"name\": \"dspace\", \"version\": \"\"} https://kidoks.bsz-bw.de/oai \n",
+ "freq 822 3 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " system_metadata.publicly_visible repository_metadata.repository_status \\\n",
+ "count 5742 5595 \n",
+ "unique 1 7 \n",
+ "top yes fully_functional \n",
+ "freq 5742 5276 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " repository_metadata.fulltext_record_count \\\n",
+ "count 2.299000e+03 \n",
+ "unique NaN \n",
+ "top NaN \n",
+ "freq NaN \n",
+ "mean 5.010186e+03 \n",
+ "std 4.206295e+04 \n",
+ "min 0.000000e+00 \n",
+ "25% 0.000000e+00 \n",
+ "50% 4.220000e+02 \n",
+ "75% 2.930500e+03 \n",
+ "max 1.817531e+06 \n",
+ "\n",
+ " repository_metadata.metadata_record_count \n",
+ "count 4.197000e+03 \n",
+ "unique NaN \n",
+ "top NaN \n",
+ "freq NaN \n",
+ "mean 1.760546e+05 \n",
+ "std 6.600825e+06 \n",
+ "min 0.000000e+00 \n",
+ "25% 8.950000e+02 \n",
+ "50% 4.026000e+03 \n",
+ "75% 1.630400e+04 \n",
+ "max 4.200000e+08 "
]
},
- "execution_count": 29,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -711,34 +817,35 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "openaire_id 0\n",
- "opendoar_id 0\n",
- "repository_name 0\n",
- "additional_name 3569\n",
- "repository_url 0\n",
- "description 282\n",
- "type 0\n",
- "update_date 0\n",
- "start_date 0\n",
- "subject 165\n",
- "content_type 144\n",
- "institution 0\n",
- "metadata_policy 0\n",
- "data_policy 0\n",
- "submission_policy 0\n",
- "content_policy 0\n",
- "software 0\n",
- "api 0\n",
+ "system_metadata.id 0\n",
+ "repository_metadata.name 0\n",
+ "repository_metadata.alternativename 3595\n",
+ "repository_metadata.url 0\n",
+ "repository_metadata.description 321\n",
+ "repository_metadata.type 0\n",
+ "repository_metadata.content_languages 0\n",
+ "system_metadata.date_modified 0\n",
+ "system_metadata.date_created 0\n",
+ "repository_metadata.content_subjects 0\n",
+ "repository_metadata.content_types 144\n",
+ "organization 0\n",
+ "policy_urls 0\n",
+ "repository_metadata.software 0\n",
+ "repository_metadata.oai_url 1340\n",
+ "system_metadata.publicly_visible 0\n",
+ "repository_metadata.repository_status 147\n",
+ "repository_metadata.fulltext_record_count 3443\n",
+ "repository_metadata.metadata_record_count 1545\n",
"dtype: int64"
]
},
- "execution_count": 30,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -749,10 +856,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "repository_metadata.content_types\n",
+ "bibliographic_references 865\n",
+ "books_chapters_and_sections 2194\n",
+ "conference_and_workshop_papers 1981\n",
+ "datasets 401\n",
+ "journal_articles 4030\n",
+ "learning_objects 789\n",
+ "other_special_item_types 1759\n",
+ "patents 182\n",
+ "software 92\n",
+ "theses_and_dissertations 3319\n",
+ "unpub_reports_and_working_papers 1904\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(opendoar_df['repository_metadata.content_types'].explode()).groupby('repository_metadata.content_types').size()"
+ ]
},
{
"cell_type": "code",
diff --git a/notebooks/01.3-exploration-roar.ipynb b/notebooks/01.3-exploration-roar.ipynb
index 05a0433..521afe4 100644
--- a/notebooks/01.3-exploration-roar.ipynb
+++ b/notebooks/01.3-exploration-roar.ipynb
@@ -993,12 +993,12 @@
" 1 | \n",
" NaN | \n",
" NaN | \n",
- " disk0/00/00/14/07 | \n",
+ " disk0/00/00/09/21 | \n",
" 2010-01-06 13:43:48 | \n",
- " 2011-07-06 08:24:53 | \n",
+ " 2016-04-17 21:55:19 | \n",
" 2010-01-06 13:43:48 | \n",
" institutional | \n",
- " 12637 | \n",
+ " 3164 | \n",
" NaN | \n",
" show | \n",
" NaN | \n",
@@ -1020,9 +1020,9 @@
" http://eprints.upnjatim.ac.id/ | \n",
" Repositorio Institucional | \n",
" http://virtuelcampus.univ-msila.dz/fll | \n",
- " http://npl.csircentral.net/ | \n",
+ " http://repositorio.itesm.mx/ortec/ | \n",
" http://eprints.upnjatim.ac.id/cgi/latest_tool?... | \n",
- " https://twitter.com/rpsicomdp?lang=es | \n",
+ " http://twitter.com/bu_ufsc | \n",
" info:other:archives.eprints.org:import | \n",
" TRUE | \n",
" TRUE | \n",
@@ -1054,9 +1054,9 @@
" celestial | \n",
" opendoar | \n",
" 2021-01-25 | \n",
- " 367 | \n",
- " 738 | \n",
- " 362 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 1 | \n",
" 824 | \n",
" 806 | \n",
" 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | \n",
@@ -1155,13 +1155,13 @@
" dir datestamp lastmod \\\n",
"count 5375 5375 5375 \n",
"unique 5375 4127 3966 \n",
- "top disk0/00/00/14/07 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
+ "top disk0/00/00/09/21 2010-01-06 13:43:48 2016-04-17 21:55:19 \n",
"freq 1 16 8 \n",
"\n",
" status_changed type succeeds commentary \\\n",
"count 5375 5375 107 0 \n",
"unique 4158 12 107 0 \n",
- "top 2010-01-06 13:43:48 institutional 12637 NaN \n",
+ "top 2010-01-06 13:43:48 institutional 3164 NaN \n",
"freq 16 3795 1 NaN \n",
"\n",
" metadata_visibility latitude longitude relation_type relation_uri \\\n",
@@ -1206,11 +1206,11 @@
"top Repositorio Institucional http://virtuelcampus.univ-msila.dz/fll \n",
"freq 7 5 \n",
"\n",
- " sword_endpoint \\\n",
- "count 176 \n",
- "unique 170 \n",
- "top http://npl.csircentral.net/ \n",
- "freq 2 \n",
+ " sword_endpoint \\\n",
+ "count 176 \n",
+ "unique 170 \n",
+ "top http://repositorio.itesm.mx/ortec/ \n",
+ "freq 2 \n",
"\n",
" rss_feed \\\n",
"count 1521 \n",
@@ -1218,35 +1218,35 @@
"top http://eprints.upnjatim.ac.id/cgi/latest_tool?... \n",
"freq 5 \n",
"\n",
- " twitter_feed \\\n",
- "count 115 \n",
- "unique 111 \n",
- "top https://twitter.com/rpsicomdp?lang=es \n",
- "freq 2 \n",
+ " twitter_feed description \\\n",
+ "count 115 3782 \n",
+ "unique 111 3304 \n",
+ "top http://twitter.com/bu_ufsc info:other:archives.eprints.org:import \n",
+ "freq 2 112 \n",
"\n",
- " description fulltext open_access mandate \\\n",
- "count 3782 4127 4127 3676 \n",
- "unique 3304 2 2 2 \n",
- "top info:other:archives.eprints.org:import TRUE TRUE FALSE \n",
- "freq 112 2758 2652 2699 \n",
+ " fulltext open_access mandate organisation_title \\\n",
+ "count 4127 4127 3676 5182 \n",
+ "unique 2 2 2 4437 \n",
+ "top TRUE TRUE FALSE Chinese Academy of Science (中国科学院) \n",
+ "freq 2758 2652 2699 9 \n",
"\n",
- " organisation_title organisation_home_page \\\n",
- "count 5182 4898 \n",
- "unique 4437 4328 \n",
- "top Chinese Academy of Science (中国科学院) http://www.cas.cn/ \n",
- "freq 9 9 \n",
+ " organisation_home_page location_country location_city \\\n",
+ "count 4898 5205 3774 \n",
+ "unique 4328 136 1875 \n",
+ "top http://www.cas.cn/ us Lima \n",
+ "freq 9 902 70 \n",
"\n",
- " location_country location_city location_latitude location_longitude \\\n",
- "count 5205 3774 3752 3734 \n",
- "unique 136 1875 2927 2965 \n",
- "top us Lima 34.1607 -118.139 \n",
- "freq 902 70 25 25 \n",
+ " location_latitude location_longitude software geoname version \\\n",
+ "count 3752 3734 4637 4671 5375 \n",
+ "unique 2927 2965 31 126 53 \n",
+ "top 34.1607 -118.139 dspace geoname_2_US other \n",
+ "freq 25 25 2307 840 4771 \n",
"\n",
- " software geoname version subjects date \\\n",
- "count 4637 4671 5375 10524 5360 \n",
- "unique 31 126 53 237 4830 \n",
- "top dspace geoname_2_US other L1 2006-05-04 10:48:14 \n",
- "freq 2307 840 4771 348 99 \n",
+ " subjects date \\\n",
+ "count 10524 5360 \n",
+ "unique 237 4830 \n",
+ "top L1 2006-05-04 10:48:14 \n",
+ "freq 348 99 \n",
"\n",
" note \\\n",
"count 215 \n",
@@ -1287,7 +1287,7 @@
" webometrics_rank webometrics_size webometrics_visibility \\\n",
"count 148 148 148 \n",
"unique 148 148 148 \n",
- "top 367 738 362 \n",
+ "top 1 6 1 \n",
"freq 1 1 1 \n",
"\n",
" webometrics_rich_files webometrics_scholar \\\n",
@@ -1770,454 +1770,6 @@
"cell_type": "code",
"execution_count": 6,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " eprintid | \n",
- " rev_number | \n",
- " eprint_status | \n",
- " userid | \n",
- " importid | \n",
- " source | \n",
- " dir | \n",
- " datestamp | \n",
- " lastmod | \n",
- " status_changed | \n",
- " type | \n",
- " succeeds | \n",
- " commentary | \n",
- " metadata_visibility | \n",
- " latitude | \n",
- " longitude | \n",
- " relation_type | \n",
- " relation_uri | \n",
- " item_issues_id | \n",
- " item_issues_type | \n",
- " item_issues_description | \n",
- " item_issues_timestamp | \n",
- " item_issues_status | \n",
- " item_issues_reported_by | \n",
- " item_issues_resolved_by | \n",
- " item_issues_comment | \n",
- " item_issues_count | \n",
- " sword_depositor | \n",
- " sword_slug | \n",
- " exemplar | \n",
- " home_page | \n",
- " title | \n",
- " oai_pmh | \n",
- " sword_endpoint | \n",
- " rss_feed | \n",
- " twitter_feed | \n",
- " description | \n",
- " fulltext | \n",
- " open_access | \n",
- " mandate | \n",
- " organisation_title | \n",
- " organisation_home_page | \n",
- " location_country | \n",
- " location_city | \n",
- " location_latitude | \n",
- " location_longitude | \n",
- " software | \n",
- " geoname | \n",
- " version | \n",
- " subjects | \n",
- " date | \n",
- " note | \n",
- " suggestions | \n",
- " activity_low | \n",
- " activity_medium | \n",
- " activity_high | \n",
- " recordcount | \n",
- " recordhistory | \n",
- " fulltexts_total | \n",
- " fulltexts_docs | \n",
- " fulltexts_rtotal | \n",
- " fulltexts_rdocs | \n",
- " registry_name | \n",
- " registry_id | \n",
- " submit_to | \n",
- " submitted_to_name | \n",
- " submitted_to_done | \n",
- " webometrics_rank | \n",
- " webometrics_size | \n",
- " webometrics_visibility | \n",
- " webometrics_rich_files | \n",
- " webometrics_scholar | \n",
- " monthly_deposits | \n",
- " total_deposits | \n",
- " association | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 2 | \n",
- " 606 | \n",
- " 657 | \n",
- " archive | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " disk0/00/00/06/06 | \n",
- " 2010-01-06 13:44:31 | \n",
- " 2016-04-17 21:53:14 | \n",
- " 2010-01-06 13:44:31 | \n",
- " subject | \n",
- " NaN | \n",
- " NaN | \n",
- " show | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " http://hal.archives-ouvertes.fr/ | \n",
- " HAL: Hyper Article en Ligne | \n",
- " http://hal.archives-ouvertes.fr/oai/oai.php | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " TRUE | \n",
- " TRUE | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " fr | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " hal | \n",
- " geoname_2_FR | \n",
- " other | \n",
- " NaN | \n",
- " 1998-11-02 11:53:57 | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " 0 | \n",
- " 2 | \n",
- " 675816 | \n",
- " 4,12,17,26,43,57,81,185,431,861,1184,1517,2442... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " opendoar | \n",
- " 166 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1 | \n",
- " 6 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | \n",
- " 3063 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 606 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " celestial | \n",
- " 1106 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 606 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " roarmap | \n",
- " 69 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " eprintid rev_number eprint_status userid importid source dir \\\n",
- "2 606 657 archive 1 NaN NaN disk0/00/00/06/06 \n",
- "3 606 NaN NaN NaN NaN NaN NaN \n",
- "4 606 NaN NaN NaN NaN NaN NaN \n",
- "\n",
- " datestamp lastmod status_changed type \\\n",
- "2 2010-01-06 13:44:31 2016-04-17 21:53:14 2010-01-06 13:44:31 subject \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " succeeds commentary metadata_visibility latitude longitude relation_type \\\n",
- "2 NaN NaN show NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN NaN \n",
- "\n",
- " relation_uri item_issues_id item_issues_type item_issues_description \\\n",
- "2 NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " item_issues_timestamp item_issues_status item_issues_reported_by \\\n",
- "2 NaN NaN NaN \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "\n",
- " item_issues_resolved_by item_issues_comment item_issues_count \\\n",
- "2 NaN NaN 0 \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "\n",
- " sword_depositor sword_slug exemplar home_page \\\n",
- "2 NaN NaN NaN http://hal.archives-ouvertes.fr/ \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " title oai_pmh \\\n",
- "2 HAL: Hyper Article en Ligne http://hal.archives-ouvertes.fr/oai/oai.php \n",
- "3 NaN NaN \n",
- "4 NaN NaN \n",
- "\n",
- " sword_endpoint rss_feed twitter_feed description fulltext open_access \\\n",
- "2 NaN NaN NaN NaN TRUE TRUE \n",
- "3 NaN NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN NaN \n",
- "\n",
- " mandate organisation_title organisation_home_page location_country \\\n",
- "2 NaN NaN NaN fr \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " location_city location_latitude location_longitude software geoname \\\n",
- "2 NaN NaN NaN hal geoname_2_FR \n",
- "3 NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN \n",
- "\n",
- " version subjects date note suggestions activity_low \\\n",
- "2 other NaN 1998-11-02 11:53:57 NaN NaN 0 \n",
- "3 NaN NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN NaN \n",
- "\n",
- " activity_medium activity_high recordcount \\\n",
- "2 0 2 675816 \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "\n",
- " recordhistory fulltexts_total \\\n",
- "2 4,12,17,26,43,57,81,185,431,861,1184,1517,2442... NaN \n",
- "3 NaN NaN \n",
- "4 NaN NaN \n",
- "\n",
- " fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id \\\n",
- "2 NaN NaN NaN opendoar 166 \n",
- "3 NaN NaN NaN celestial 1106 \n",
- "4 NaN NaN NaN roarmap 69 \n",
- "\n",
- " submit_to submitted_to_name submitted_to_done webometrics_rank \\\n",
- "2 NaN NaN NaN 1 \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " webometrics_size webometrics_visibility webometrics_rich_files \\\n",
- "2 6 1 1 \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "\n",
- " webometrics_scholar monthly_deposits \\\n",
- "2 1 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... \n",
- "3 NaN NaN \n",
- "4 NaN NaN \n",
- "\n",
- " total_deposits association \n",
- "2 3063 NaN \n",
- "3 NaN NaN \n",
- "4 NaN NaN "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "roar_df[roar_df.eprintid == '606']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
"outputs": [
{
"data": {
@@ -2398,7 +1950,7 @@
" 1 | \n",
" {nan, 633} | \n",
" {nan, archive} | \n",
- " {nan, 1} | \n",
+ " {1, nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan, disk0/00/00/00/01} | \n",
@@ -2408,7 +1960,7 @@
" {nan, subject} | \n",
" {nan} | \n",
" {nan} | \n",
- " {nan, show} | \n",
+ " {show, nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2426,7 +1978,7 @@
" {nan} | \n",
" {nan} | \n",
" {nan, http://archivesic.ccsd.cnrs.fr/} | \n",
- " {@RCHIVESIC , nan} | \n",
+ " {nan, @RCHIVESIC } | \n",
" {nan, http://archivesic.ccsd.cnrs.fr/oai/oai.php} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2442,7 +1994,7 @@
" {nan} | \n",
" {nan} | \n",
" {nan, hal} | \n",
- " {geoname_2_FR, nan} | \n",
+ " {nan, geoname_2_FR} | \n",
" {nan, other} | \n",
" {nan} | \n",
" {nan, 2002-05-17 19:24:41} | \n",
@@ -2451,7 +2003,7 @@
" {nan, 0} | \n",
" {nan, 0} | \n",
" {nan, 0} | \n",
- " {nan, 25} | \n",
+ " {25, nan} | \n",
" {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2475,17 +2027,17 @@
" 10 | \n",
" {nan, 511} | \n",
" {nan, archive} | \n",
- " {nan, 1} | \n",
+ " {1, nan} | \n",
" {nan} | \n",
" {nan} | \n",
- " {nan, disk0/00/00/00/10} | \n",
+ " {disk0/00/00/00/10, nan} | \n",
" {nan, 2010-01-06 13:43:48} | \n",
" {nan, 2011-07-18 05:40:13} | \n",
" {nan, 2010-01-06 13:43:48} | \n",
" {nan, institutional} | \n",
" {nan} | \n",
" {nan} | \n",
- " {nan, show} | \n",
+ " {show, nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2502,15 +2054,15 @@
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
- " {nan, http://www.diva-portal.org/mdh/} | \n",
+ " {http://www.diva-portal.org/mdh/, nan} | \n",
" {nan, Academic Archive On-line (Mälardalen Uni... | \n",
" {nan, http://www.diva-portal.org/oai/mdh/OAI} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
- " {nan, TRUE} | \n",
- " {nan, TRUE} | \n",
+ " {TRUE, nan} | \n",
+ " {TRUE, nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2519,7 +2071,7 @@
" {nan, 59.8667} | \n",
" {nan, 17.6333} | \n",
" {nan, diva} | \n",
- " {nan, geoname_2_SE} | \n",
+ " {geoname_2_SE, nan} | \n",
" {nan, other} | \n",
" {nan} | \n",
" {nan, 2005-12-08 13:15:22} | \n",
@@ -2529,7 +2081,7 @@
" {nan, 0} | \n",
" {nan, 0} | \n",
" {nan, 100} | \n",
- " {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,1... | \n",
+ " {0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,10... | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2639,7 +2191,7 @@
" {nan, subject} | \n",
" {nan} | \n",
" {nan} | \n",
- " {nan, show} | \n",
+ " {show, nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2656,18 +2208,18 @@
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
- " {http://edoc.sub.uni-hamburg.de/klimawandel/, ... | \n",
- " {nan, Klimawandel Dokumentenserver} | \n",
+ " {nan, http://edoc.sub.uni-hamburg.de/klimawand... | \n",
+ " {Klimawandel Dokumentenserver, nan} | \n",
" {nan, http://edoc.sub.uni-hamburg.de/klimawand... | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan, The \"Documentenserver Klimawandel\" (Repo... | \n",
- " {nan, TRUE} | \n",
- " {nan, TRUE} | \n",
- " {nan, TRUE} | \n",
- " {nan, KLIMZUG projects, Helmholtz-Zentrum Gees... | \n",
- " {http://www.climateservicecenter.de/, nan, htt... | \n",
+ " {TRUE, nan} | \n",
+ " {TRUE, nan} | \n",
+ " {TRUE, nan} | \n",
+ " {nan, Helmholtz-Zentrum Geesthacht, KLIMZUG pr... | \n",
+ " {http://www.hzg.de/, nan, http://www.climatese... | \n",
" {nan, de} | \n",
" {nan, Hamburg} | \n",
" {nan, 53.5511} | \n",
@@ -2675,7 +2227,7 @@
" {nan, opus} | \n",
" {nan, geoname_2_DE} | \n",
" {nan, other} | \n",
- " {S1, HD, GF, GE, G1} | \n",
+ " {HD, S1, GF, GE, G1} | \n",
" {nan, 2015-07-02 08:08:31} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2689,7 +2241,7 @@
" {nan} | \n",
" {nan} | \n",
" {nan, celestial, opendoar} | \n",
- " {3408, 5881, nan} | \n",
+ " {nan, 5881, 3408} | \n",
" {nan} | \n",
" {nan} | \n",
" {nan} | \n",
@@ -2786,8 +2338,8 @@
"text/plain": [
" rev_number eprint_status userid importid source \\\n",
"eprintid \n",
- "1 {nan, 633} {nan, archive} {nan, 1} {nan} {nan} \n",
- "10 {nan, 511} {nan, archive} {nan, 1} {nan} {nan} \n",
+ "1 {nan, 633} {nan, archive} {1, nan} {nan} {nan} \n",
+ "10 {nan, 511} {nan, archive} {1, nan} {nan} {nan} \n",
"1000 {274} {archive} {1} {nan} {nan} \n",
"10001 {nan, 20} {nan, archive} {nan, 91} {nan} {nan} \n",
"10008 {11} {archive} {404} {nan} {nan} \n",
@@ -2795,7 +2347,7 @@
" dir datestamp \\\n",
"eprintid \n",
"1 {nan, disk0/00/00/00/01} {nan, 2010-01-06 13:43:48} \n",
- "10 {nan, disk0/00/00/00/10} {nan, 2010-01-06 13:43:48} \n",
+ "10 {disk0/00/00/00/10, nan} {nan, 2010-01-06 13:43:48} \n",
"1000 {disk0/00/00/10/00} {2010-01-06 13:45:01} \n",
"10001 {nan, disk0/00/01/00/01} {nan, 2015-08-08 14:52:11} \n",
"10008 {disk0/00/01/00/08} {2015-08-08 14:52:26} \n",
@@ -2810,10 +2362,10 @@
"\n",
" type succeeds commentary metadata_visibility \\\n",
"eprintid \n",
- "1 {nan, subject} {nan} {nan} {nan, show} \n",
- "10 {nan, institutional} {nan} {nan} {nan, show} \n",
+ "1 {nan, subject} {nan} {nan} {show, nan} \n",
+ "10 {nan, institutional} {nan} {nan} {show, nan} \n",
"1000 {subject} {nan} {nan} {show} \n",
- "10001 {nan, subject} {nan} {nan} {nan, show} \n",
+ "10001 {nan, subject} {nan} {nan} {show, nan} \n",
"10008 {institutional} {nan} {nan} {show} \n",
"\n",
" latitude longitude relation_type relation_uri item_issues_id \\\n",
@@ -2851,17 +2403,17 @@
" exemplar home_page \\\n",
"eprintid \n",
"1 {nan} {nan, http://archivesic.ccsd.cnrs.fr/} \n",
- "10 {nan} {nan, http://www.diva-portal.org/mdh/} \n",
+ "10 {nan} {http://www.diva-portal.org/mdh/, nan} \n",
"1000 {nan} {http://pam.pisharp.org/} \n",
- "10001 {nan} {http://edoc.sub.uni-hamburg.de/klimawandel/, ... \n",
+ "10001 {nan} {nan, http://edoc.sub.uni-hamburg.de/klimawand... \n",
"10008 {nan} {http://creativematter.skidmore.edu/} \n",
"\n",
" title \\\n",
"eprintid \n",
- "1 {@RCHIVESIC , nan} \n",
+ "1 {nan, @RCHIVESIC } \n",
"10 {nan, Academic Archive On-line (Mälardalen Uni... \n",
"1000 {PAM - Portuguese Archive of Mathematics} \n",
- "10001 {nan, Klimawandel Dokumentenserver} \n",
+ "10001 {Klimawandel Dokumentenserver, nan} \n",
"10008 {Creative Matter | Skidmore College Research} \n",
"\n",
" oai_pmh sword_endpoint \\\n",
@@ -2883,17 +2435,17 @@
" description fulltext \\\n",
"eprintid \n",
"1 {nan} {nan} \n",
- "10 {nan} {nan, TRUE} \n",
+ "10 {nan} {TRUE, nan} \n",
"1000 {nan} {TRUE} \n",
- "10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {nan, TRUE} \n",
+ "10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {TRUE, nan} \n",
"10008 {Welcome to Creative Matter, a repository for ... {TRUE} \n",
"\n",
" open_access mandate \\\n",
"eprintid \n",
"1 {nan} {nan} \n",
- "10 {nan, TRUE} {nan} \n",
+ "10 {TRUE, nan} {nan} \n",
"1000 {TRUE} {nan} \n",
- "10001 {nan, TRUE} {nan, TRUE} \n",
+ "10001 {TRUE, nan} {TRUE, nan} \n",
"10008 {FALSE} {FALSE} \n",
"\n",
" organisation_title \\\n",
@@ -2901,7 +2453,7 @@
"1 {nan} \n",
"10 {nan} \n",
"1000 {nan} \n",
- "10001 {nan, KLIMZUG projects, Helmholtz-Zentrum Gees... \n",
+ "10001 {nan, Helmholtz-Zentrum Geesthacht, KLIMZUG pr... \n",
"10008 {Skidmore College} \n",
"\n",
" organisation_home_page location_country \\\n",
@@ -2909,7 +2461,7 @@
"1 {nan} {nan, fr} \n",
"10 {nan} {nan, se} \n",
"1000 {nan} {pt} \n",
- "10001 {http://www.climateservicecenter.de/, nan, htt... {nan, de} \n",
+ "10001 {http://www.hzg.de/, nan, http://www.climatese... {nan, de} \n",
"10008 {http://www.skidmore.edu/} {us} \n",
"\n",
" location_city location_latitude location_longitude \\\n",
@@ -2922,8 +2474,8 @@
"\n",
" software geoname version \\\n",
"eprintid \n",
- "1 {nan, hal} {geoname_2_FR, nan} {nan, other} \n",
- "10 {nan, diva} {nan, geoname_2_SE} {nan, other} \n",
+ "1 {nan, hal} {nan, geoname_2_FR} {nan, other} \n",
+ "10 {nan, diva} {geoname_2_SE, nan} {nan, other} \n",
"1000 {dspace} {geoname_2_PT} {other} \n",
"10001 {nan, opus} {nan, geoname_2_DE} {nan, other} \n",
"10008 {bepress} {geoname_2_US} {other} \n",
@@ -2933,12 +2485,12 @@
"1 {nan} {nan, 2002-05-17 19:24:41} {nan} {nan} \n",
"10 {nan} {nan, 2005-12-08 13:15:22} {nan} {nan} \n",
"1000 {nan} {2006-05-04 10:48:14} {nan} {nan} \n",
- "10001 {S1, HD, GF, GE, G1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n",
+ "10001 {HD, S1, GF, GE, G1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n",
"10008 {nan} {2015-07-06 17:35:50} {nan} {nan} \n",
"\n",
" activity_low activity_medium activity_high recordcount \\\n",
"eprintid \n",
- "1 {nan, 0} {nan, 0} {nan, 0} {nan, 25} \n",
+ "1 {nan, 0} {nan, 0} {nan, 0} {25, nan} \n",
"10 {nan, 0} {nan, 0} {nan, 0} {nan, 100} \n",
"1000 {nan} {nan} {nan} {nan} \n",
"10001 {nan} {nan} {nan} {nan} \n",
@@ -2947,7 +2499,7 @@
" recordhistory fulltexts_total \\\n",
"eprintid \n",
"1 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... {nan} \n",
- "10 {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,1... {nan} \n",
+ "10 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,10... {nan} \n",
"1000 {nan} {nan} \n",
"10001 {nan} {nan} \n",
"10008 {nan} {nan} \n",
@@ -2965,7 +2517,7 @@
"1 {celestial, opendoar} {58, 669} {nan} \n",
"10 {celestial, opendoar} {258, 526} {nan} \n",
"1000 {nan} {nan} {nan} \n",
- "10001 {nan, celestial, opendoar} {3408, 5881, nan} {nan} \n",
+ "10001 {nan, celestial, opendoar} {nan, 5881, 3408} {nan} \n",
"10008 {celestial} {5882} {nan} \n",
"\n",
" submitted_to_name submitted_to_done webometrics_rank \\\n",
@@ -2993,7 +2545,7 @@
"10008 {nan} {nan} {nan} {nan} "
]
},
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -3005,7 +2557,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -3455,8 +3007,8 @@
" TRUE | \n",
" TRUE | \n",
" TRUE | \n",
- " [KLIMZUG projects, Helmholtz-Zentrum Geesthach... | \n",
- " [http://www.climateservicecenter.de/, http://w... | \n",
+ " [Helmholtz-Zentrum Geesthacht, KLIMZUG project... | \n",
+ " [http://www.hzg.de/, http://www.climateservice... | \n",
" de | \n",
" Hamburg | \n",
" 53.5511 | \n",
@@ -3464,7 +3016,7 @@
" opus | \n",
" geoname_2_DE | \n",
" other | \n",
- " [S1, GE, HD, GF, G1] | \n",
+ " [GF, HD, GE, S1, G1] | \n",
" 2015-07-02 08:08:31 | \n",
" NaN | \n",
" NaN | \n",
@@ -3478,7 +3030,7 @@
" NaN | \n",
" NaN | \n",
" [celestial, opendoar] | \n",
- " [3408, 5881] | \n",
+ " [5881, 3408] | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -3682,7 +3234,7 @@
"1 NaN \n",
"10 NaN \n",
"1000 NaN \n",
- "10001 [KLIMZUG projects, Helmholtz-Zentrum Geesthach... \n",
+ "10001 [Helmholtz-Zentrum Geesthacht, KLIMZUG project... \n",
"10008 Skidmore College \n",
"\n",
" organisation_home_page location_country \\\n",
@@ -3690,7 +3242,7 @@
"1 NaN fr \n",
"10 NaN se \n",
"1000 NaN pt \n",
- "10001 [http://www.climateservicecenter.de/, http://w... de \n",
+ "10001 [http://www.hzg.de/, http://www.climateservice... de \n",
"10008 http://www.skidmore.edu/ us \n",
"\n",
" location_city location_latitude location_longitude software \\\n",
@@ -3706,7 +3258,7 @@
"1 geoname_2_FR other NaN 2002-05-17 19:24:41 \n",
"10 geoname_2_SE other NaN 2005-12-08 13:15:22 \n",
"1000 geoname_2_PT other NaN 2006-05-04 10:48:14 \n",
- "10001 geoname_2_DE other [S1, GE, HD, GF, G1] 2015-07-02 08:08:31 \n",
+ "10001 geoname_2_DE other [GF, HD, GE, S1, G1] 2015-07-02 08:08:31 \n",
"10008 geoname_2_US other NaN 2015-07-06 17:35:50 \n",
"\n",
" note suggestions activity_low activity_medium activity_high \\\n",
@@ -3738,7 +3290,7 @@
"1 [celestial, opendoar] [58, 669] NaN NaN \n",
"10 [celestial, opendoar] [258, 526] NaN NaN \n",
"1000 NaN NaN NaN NaN \n",
- "10001 [celestial, opendoar] [3408, 5881] NaN NaN \n",
+ "10001 [celestial, opendoar] [5881, 3408] NaN NaN \n",
"10008 celestial 5882 NaN NaN \n",
"\n",
" submitted_to_done webometrics_rank webometrics_size \\\n",
@@ -3766,7 +3318,7 @@
"10008 NaN NaN NaN "
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -3787,7 +3339,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -3953,8 +3505,8 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " [celestial, roarmap, opendoar] | \n",
- " [69, 166, 1106] | \n",
+ " [celestial, opendoar, roarmap] | \n",
+ " [1106, 69, 166] | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -4021,7 +3573,7 @@
"4188 NaN NaN NaN \n",
"\n",
" registry_name registry_id submit_to \\\n",
- "4188 [celestial, roarmap, opendoar] [69, 166, 1106] NaN \n",
+ "4188 [celestial, opendoar, roarmap] [1106, 69, 166] NaN \n",
"\n",
" submitted_to_name submitted_to_done webometrics_rank webometrics_size \\\n",
"4188 NaN NaN 1 6 \n",
@@ -4036,7 +3588,7 @@
"4188 NaN "
]
},
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -4048,7 +3600,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -4271,7 +3823,7 @@
" 2 | \n",
" 2 | \n",
" 3802 | \n",
- " 3771 | \n",
+ " 3772 | \n",
" 143 | \n",
" 1861 | \n",
" 2887 | \n",
@@ -4292,8 +3844,8 @@
" 118 | \n",
" 134 | \n",
" 117 | \n",
- " 7 | \n",
- " 4257 | \n",
+ " 8 | \n",
+ " 4256 | \n",
" 7 | \n",
" 1 | \n",
" 1 | \n",
@@ -4308,18 +3860,18 @@
" \n",
" \n",
" top | \n",
- " 1259 | \n",
+ " 1 | \n",
" 11 | \n",
" archive | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
- " disk0/00/00/14/07 | \n",
+ " disk0/00/00/00/01 | \n",
" 2010-01-06 13:43:48 | \n",
- " 2016-05-02 05:43:04 | \n",
+ " 2011-07-06 08:24:53 | \n",
" 2010-01-06 13:43:48 | \n",
" institutional | \n",
- " 12637 | \n",
+ " 10164 | \n",
" NaN | \n",
" show | \n",
" NaN | \n",
@@ -4341,9 +3893,9 @@
" http://eprints.upnjatim.ac.id/ | \n",
" Repositorio Institucional | \n",
" http://kce.docressources.info/ws/PMBWs_2 | \n",
- " http://npl.csircentral.net/ | \n",
+ " http://producao.usp.br/sword/servicedocument | \n",
" http://eprints.upnjatim.ac.id/cgi/latest_tool?... | \n",
- " http://twitter.com/bu_ufsc | \n",
+ " http://my.indexcopernicus.com/fredemoreno | \n",
" info:other:archives.eprints.org:import | \n",
" TRUE | \n",
" TRUE | \n",
@@ -4371,13 +3923,13 @@
" 0 | \n",
" 0 | \n",
" [celestial, opendoar] | \n",
- " [2246, 1879] | \n",
- " [celestial, roarmap, opendoar] | \n",
+ " 2479 | \n",
+ " [celestial, opendoar, roarmap] | \n",
" opendoar | \n",
" 2021-01-25 | \n",
- " 367 | \n",
- " 738 | \n",
- " 668 | \n",
+ " 24 | \n",
+ " 46 | \n",
+ " 20 | \n",
" 824 | \n",
" 806 | \n",
" 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | \n",
@@ -5016,7 +4568,7 @@
" eprintid rev_number eprint_status userid importid source \\\n",
"count 5375 5375 5375 5375 0.0 0.0 \n",
"unique 5375 658 1 2135 NaN NaN \n",
- "top 1259 11 archive 1 NaN NaN \n",
+ "top 1 11 archive 1 NaN NaN \n",
"freq 1 332 5375 1333 NaN NaN \n",
"mean NaN NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN NaN \n",
@@ -5029,7 +4581,7 @@
" dir datestamp lastmod \\\n",
"count 5375 5375 5375 \n",
"unique 5375 4127 3966 \n",
- "top disk0/00/00/14/07 2010-01-06 13:43:48 2016-05-02 05:43:04 \n",
+ "top disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"freq 1 16 8 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@@ -5042,7 +4594,7 @@
" status_changed type succeeds commentary \\\n",
"count 5375 5375 107 0.0 \n",
"unique 4158 12 107 NaN \n",
- "top 2010-01-06 13:43:48 institutional 12637 NaN \n",
+ "top 2010-01-06 13:43:48 institutional 10164 NaN \n",
"freq 16 3795 1 NaN \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
@@ -5143,18 +4695,18 @@
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
- " sword_endpoint \\\n",
- "count 176 \n",
- "unique 170 \n",
- "top http://npl.csircentral.net/ \n",
- "freq 2 \n",
- "mean NaN \n",
- "std NaN \n",
- "min NaN \n",
- "25% NaN \n",
- "50% NaN \n",
- "75% NaN \n",
- "max NaN \n",
+ " sword_endpoint \\\n",
+ "count 176 \n",
+ "unique 170 \n",
+ "top http://producao.usp.br/sword/servicedocument \n",
+ "freq 2 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
"\n",
" rss_feed \\\n",
"count 1521 \n",
@@ -5169,70 +4721,70 @@
"75% NaN \n",
"max NaN \n",
"\n",
- " twitter_feed description \\\n",
- "count 115 3782 \n",
- "unique 111 3304 \n",
- "top http://twitter.com/bu_ufsc info:other:archives.eprints.org:import \n",
- "freq 2 112 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
+ " twitter_feed \\\n",
+ "count 115 \n",
+ "unique 111 \n",
+ "top http://my.indexcopernicus.com/fredemoreno \n",
+ "freq 2 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
"\n",
- " fulltext open_access mandate organisation_title \\\n",
- "count 4127 4127 3676 4396 \n",
- "unique 2 2 2 3802 \n",
- "top TRUE TRUE FALSE Chinese Academy of Science (中国科学院) \n",
- "freq 2758 2652 2699 9 \n",
- "mean NaN NaN NaN NaN \n",
- "std NaN NaN NaN NaN \n",
- "min NaN NaN NaN NaN \n",
- "25% NaN NaN NaN NaN \n",
- "50% NaN NaN NaN NaN \n",
- "75% NaN NaN NaN NaN \n",
- "max NaN NaN NaN NaN \n",
+ " description fulltext open_access mandate \\\n",
+ "count 3782 4127 4127 3676 \n",
+ "unique 3304 2 2 2 \n",
+ "top info:other:archives.eprints.org:import TRUE TRUE FALSE \n",
+ "freq 112 2758 2652 2699 \n",
+ "mean NaN NaN NaN NaN \n",
+ "std NaN NaN NaN NaN \n",
+ "min NaN NaN NaN NaN \n",
+ "25% NaN NaN NaN NaN \n",
+ "50% NaN NaN NaN NaN \n",
+ "75% NaN NaN NaN NaN \n",
+ "max NaN NaN NaN NaN \n",
"\n",
- " organisation_home_page location_country location_city \\\n",
- "count 4226 5080 3655 \n",
- "unique 3771 143 1861 \n",
- "top http://www.cas.cn/ us Lima \n",
- "freq 9 886 69 \n",
- "mean NaN NaN NaN \n",
- "std NaN NaN NaN \n",
- "min NaN NaN NaN \n",
- "25% NaN NaN NaN \n",
- "50% NaN NaN NaN \n",
- "75% NaN NaN NaN \n",
- "max NaN NaN NaN \n",
+ " organisation_title organisation_home_page \\\n",
+ "count 4396 4226 \n",
+ "unique 3802 3772 \n",
+ "top Chinese Academy of Science (中国科学院) http://www.cas.cn/ \n",
+ "freq 9 9 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
"\n",
- " location_latitude location_longitude software geoname version \\\n",
- "count 3681 3664 4637 4671 5375 \n",
- "unique 2887 2917 31 126 53 \n",
- "top 34.1607 -118.139 dspace geoname_2_US other \n",
- "freq 25 25 2307 840 4771 \n",
- "mean NaN NaN NaN NaN NaN \n",
- "std NaN NaN NaN NaN NaN \n",
- "min NaN NaN NaN NaN NaN \n",
- "25% NaN NaN NaN NaN NaN \n",
- "50% NaN NaN NaN NaN NaN \n",
- "75% NaN NaN NaN NaN NaN \n",
- "max NaN NaN NaN NaN NaN \n",
+ " location_country location_city location_latitude location_longitude \\\n",
+ "count 5080 3655 3681 3664 \n",
+ "unique 143 1861 2887 2917 \n",
+ "top us Lima 34.1607 -118.139 \n",
+ "freq 886 69 25 25 \n",
+ "mean NaN NaN NaN NaN \n",
+ "std NaN NaN NaN NaN \n",
+ "min NaN NaN NaN NaN \n",
+ "25% NaN NaN NaN NaN \n",
+ "50% NaN NaN NaN NaN \n",
+ "75% NaN NaN NaN NaN \n",
+ "max NaN NaN NaN NaN \n",
"\n",
- " subjects date \\\n",
- "count 1250 5360 \n",
- "unique 906 4830 \n",
- "top K1 2006-05-04 10:48:14 \n",
- "freq 53 99 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
+ " software geoname version subjects date \\\n",
+ "count 4637 4671 5375 1250 5360 \n",
+ "unique 31 126 53 906 4830 \n",
+ "top dspace geoname_2_US other K1 2006-05-04 10:48:14 \n",
+ "freq 2307 840 4771 53 99 \n",
+ "mean NaN NaN NaN NaN NaN \n",
+ "std NaN NaN NaN NaN NaN \n",
+ "min NaN NaN NaN NaN NaN \n",
+ "25% NaN NaN NaN NaN NaN \n",
+ "50% NaN NaN NaN NaN NaN \n",
+ "75% NaN NaN NaN NaN NaN \n",
+ "max NaN NaN NaN NaN NaN \n",
"\n",
" note \\\n",
"count 215 \n",
@@ -5288,7 +4840,7 @@
"\n",
" fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name \\\n",
"count 258 270 258 4603 \n",
- "unique 118 134 117 7 \n",
+ "unique 118 134 117 8 \n",
"top 0 0 0 [celestial, opendoar] \n",
"freq 114 113 114 2114 \n",
"mean NaN NaN NaN NaN \n",
@@ -5299,23 +4851,23 @@
"75% NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN \n",
"\n",
- " registry_id submit_to submitted_to_name \\\n",
- "count 4578 293 205 \n",
- "unique 4257 7 1 \n",
- "top [2246, 1879] [celestial, roarmap, opendoar] opendoar \n",
- "freq 4 92 205 \n",
- "mean NaN NaN NaN \n",
- "std NaN NaN NaN \n",
- "min NaN NaN NaN \n",
- "25% NaN NaN NaN \n",
- "50% NaN NaN NaN \n",
- "75% NaN NaN NaN \n",
- "max NaN NaN NaN \n",
+ " registry_id submit_to submitted_to_name \\\n",
+ "count 4578 293 205 \n",
+ "unique 4256 7 1 \n",
+ "top 2479 [celestial, opendoar, roarmap] opendoar \n",
+ "freq 4 92 205 \n",
+ "mean NaN NaN NaN \n",
+ "std NaN NaN NaN \n",
+ "min NaN NaN NaN \n",
+ "25% NaN NaN NaN \n",
+ "50% NaN NaN NaN \n",
+ "75% NaN NaN NaN \n",
+ "max NaN NaN NaN \n",
"\n",
" submitted_to_done webometrics_rank webometrics_size \\\n",
"count 205 148 148 \n",
"unique 1 148 148 \n",
- "top 2021-01-25 367 738 \n",
+ "top 2021-01-25 24 46 \n",
"freq 205 1 1 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@@ -5328,7 +4880,7 @@
" webometrics_visibility webometrics_rich_files webometrics_scholar \\\n",
"count 148 148 148 \n",
"unique 148 146 143 \n",
- "top 668 824 806 \n",
+ "top 20 824 806 \n",
"freq 1 3 5 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@@ -5365,7 +4917,7 @@
"max NaN "
]
},
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -5376,7 +4928,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -5425,7 +4977,7 @@
"dtype: int64"
]
},
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -5436,7 +4988,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -5480,7 +5032,7 @@
"dtype: int64"
]
},
- "execution_count": 12,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -5489,6 +5041,39 @@
"roar_df.isna().sum()[40:]"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "type\n",
+ "database 74\n",
+ "demonstration 20\n",
+ "institutional 3795\n",
+ "journal 121\n",
+ "learning 77\n",
+ "multi 141\n",
+ "opendata 41\n",
+ "other 409\n",
+ "researchdata 54\n",
+ "subject 294\n",
+ "theses 347\n",
+ "webobservatory 2\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(roar_df.type).groupby('type').size()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 13,
@@ -5497,7 +5082,10 @@
{
"data": {
"text/plain": [
- "array([nan, 'TRUE', 'FALSE'], dtype=object)"
+ "open_access\n",
+ "FALSE 1475\n",
+ "TRUE 2652\n",
+ "dtype: int64"
]
},
"execution_count": 13,
@@ -5506,7 +5094,7 @@
}
],
"source": [
- "roar_df.open_access.unique()"
+ "pd.DataFrame(roar_df.open_access).groupby('open_access').size()"
]
},
{
@@ -5517,9 +5105,10 @@
{
"data": {
"text/plain": [
- "array(['subject', 'institutional', 'researchdata', 'theses', 'database',\n",
- " 'other', 'journal', 'opendata', 'demonstration', 'multi',\n",
- " 'learning', 'webobservatory'], dtype=object)"
+ "mandate\n",
+ "FALSE 2699\n",
+ "TRUE 977\n",
+ "dtype: int64"
]
},
"execution_count": 14,
@@ -5528,27 +5117,7 @@
}
],
"source": [
- "roar_df.type.unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([nan, 'TRUE', 'FALSE'], dtype=object)"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "roar_df.mandate.unique()"
+ "pd.DataFrame(roar_df.mandate).groupby('mandate').size()"
]
},
{
diff --git a/notebooks/01.4-exploration-fairsharing.ipynb b/notebooks/01.4-exploration-fairsharing.ipynb
index 8c12ae4..5182e5f 100644
--- a/notebooks/01.4-exploration-fairsharing.ipynb
+++ b/notebooks/01.4-exploration-fairsharing.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -54,116 +54,468 @@
" \n",
" \n",
" | \n",
- " full_name | \n",
- " short_name | \n",
- " fs_url | \n",
- " url | \n",
- " countries | \n",
- " subjects | \n",
+ " id | \n",
+ " type | \n",
+ " attributes.created-at | \n",
+ " attributes.updated-at | \n",
+ " attributes.metadata.doi | \n",
+ " attributes.metadata.name | \n",
+ " attributes.metadata.status | \n",
+ " attributes.metadata.contacts | \n",
+ " attributes.metadata.homepage | \n",
+ " attributes.metadata.identifier | \n",
+ " attributes.metadata.description | \n",
+ " attributes.metadata.support-links | \n",
+ " attributes.metadata.year-creation | \n",
+ " attributes.metadata.data-processes | \n",
+ " attributes.legacy-ids | \n",
+ " attributes.fairsharing-registry | \n",
+ " attributes.record-type | \n",
+ " attributes.subjects | \n",
+ " attributes.domains | \n",
+ " attributes.taxonomies | \n",
+ " attributes.user-defined-tags | \n",
+ " attributes.countries | \n",
+ " attributes.name | \n",
+ " attributes.abbreviation | \n",
+ " attributes.url | \n",
+ " attributes.doi | \n",
+ " attributes.fairsharing-licence | \n",
+ " attributes.description | \n",
+ " attributes.publications | \n",
+ " attributes.licence-links | \n",
+ " attributes.metadata.citations | \n",
+ " attributes.metadata.abbreviation | \n",
+ " attributes.metadata.access-points | \n",
+ " attributes.metadata.associated-tools | \n",
+ " attributes.metadata.deprecation-date | \n",
+ " attributes.metadata.deprecation-reason | \n",
+ " attributes.metadata.tombstone | \n",
"
\n",
" \n",
"
\n",
" \n",
" 0 | \n",
- " GenBank | \n",
- " GenBank | \n",
- " https://fairsharing.org/10.25504/FAIRsharing.9... | \n",
- " https://www.ncbi.nlm.nih.gov/genbank/ | \n",
- " European Union,Japan,United States | \n",
- " Bioinformatics,Data Management,Data Submission... | \n",
+ " 1723 | \n",
+ " fairsharing-records | \n",
+ " 2014-11-04T15:23:40.000Z | \n",
+ " 2021-09-30T11:39:06.829Z | \n",
+ " 10.25504/FAIRsharing.8t18te | \n",
+ " Cell Image Library | \n",
+ " ready | \n",
+ " [{'contact-name': 'David Orloff', 'contact-ema... | \n",
+ " http://www.cellimagelibrary.org | \n",
+ " 1723 | \n",
+ " This library is a public and easily accessible... | \n",
+ " [{'url': 'http://www.cellimagelibrary.org/page... | \n",
+ " 2010.0 | \n",
+ " [{'name': 'live update', 'type': 'data release... | \n",
+ " [biodbcore-000180, bsg-d000180] | \n",
+ " Database | \n",
+ " repository | \n",
+ " [Cell Biology, Life Science] | \n",
+ " [Cell, Microscopy, Light microscopy, Electron ... | \n",
+ " [All] | \n",
+ " [] | \n",
+ " [United States] | \n",
+ " FAIRsharing record for: Cell Image Library | \n",
+ " None | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.8... | \n",
+ " 10.25504/FAIRsharing.8t18te | \n",
+ " https://creativecommons.org/licenses/by-sa/4.0... | \n",
+ " This FAIRsharing record describes: This librar... | \n",
+ " [{'id': 232, 'pubmed_id': 23203874, 'title': '... | \n",
+ " [{'licence-name': 'Cell Image Library Data Pol... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" 1 | \n",
- " GlycoNAVI | \n",
- " GlycoNAVI | \n",
- " https://fairsharing.org/10.25504/FAIRsharing.w... | \n",
- " https://glyconavi.org/ | \n",
- " Japan | \n",
- " Chemistry,Glycomics,Life Science,Organic Chemi... | \n",
+ " 3101 | \n",
+ " fairsharing-records | \n",
+ " 2020-09-16T08:49:13.000Z | \n",
+ " 2021-09-30T11:36:45.452Z | \n",
+ " NaN | \n",
+ " WHOI Ship Data-Grabber System | \n",
+ " ready | \n",
+ " NaN | \n",
+ " http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html | \n",
+ " 3101 | \n",
+ " The WHOI Ship DataGrabber system provides the ... | \n",
+ " [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... | \n",
+ " 2004.0 | \n",
+ " [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... | \n",
+ " [biodbcore-001609, bsg-d001609] | \n",
+ " Database | \n",
+ " repository | \n",
+ " [Earth Science, Water Research, Oceanography] | \n",
+ " [] | \n",
+ " [Not applicable] | \n",
+ " [subseafloor environments] | \n",
+ " [United States] | \n",
+ " FAIRsharing record for: WHOI Ship Data-Grabber... | \n",
+ " None | \n",
+ " https://fairsharing.org/fairsharing_records/3101 | \n",
+ " None | \n",
+ " https://creativecommons.org/licenses/by-sa/4.0... | \n",
+ " This FAIRsharing record describes: The WHOI Sh... | \n",
+ " [] | \n",
+ " [{'licence-name': 'NDSF Data Archive Policy', ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" 2 | \n",
- " ADHDgene | \n",
- " ADHDgene | \n",
- " https://fairsharing.org/10.25504/FAIRsharing.m... | \n",
- " http://adhd.psych.ac.cn/ | \n",
- " China | \n",
- " Biomedical Science,Genetics | \n",
+ " 2649 | \n",
+ " fairsharing-records | \n",
+ " 2018-08-07T20:23:32.000Z | \n",
+ " 2021-09-30T11:39:07.898Z | \n",
+ " NaN | \n",
+ " Electron Microscope Public Image Archive | \n",
+ " ready | \n",
+ " [{'contact-name': 'General contact', 'contact-... | \n",
+ " https://www.ebi.ac.uk/pdbe/emdb/empiar/ | \n",
+ " 2649 | \n",
+ " EMPIAR, the Electron Microscopy Public Image A... | \n",
+ " [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... | \n",
+ " 2015.0 | \n",
+ " [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | \n",
+ " [biodbcore-001140, bsg-d001140] | \n",
+ " Database | \n",
+ " repository | \n",
+ " [Bioinformatics, Biology] | \n",
+ " [Protein image, Microscopy, Electron microscop... | \n",
+ " [All] | \n",
+ " [] | \n",
+ " [Greece, Czech Republic, United Kingdom, Icela... | \n",
+ " FAIRsharing record for: Electron Microscope Pu... | \n",
+ " EMPIAR | \n",
+ " https://fairsharing.org/fairsharing_records/2649 | \n",
+ " None | \n",
+ " https://creativecommons.org/licenses/by-sa/4.0... | \n",
+ " This FAIRsharing record describes: EMPIAR, the... | \n",
+ " [{'id': 2232, 'pubmed_id': 27067018, 'title': ... | \n",
+ " [{'licence-name': 'EMBL-EBI Terms of Use', 'li... | \n",
+ " [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... | \n",
+ " EMPIAR | \n",
+ " [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | \n",
+ " [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" 3 | \n",
- " Allele frequency resource for research and tea... | \n",
- " ALFRED | \n",
- " https://fairsharing.org/10.25504/FAIRsharing.y... | \n",
- " http://alfred.med.yale.edu | \n",
- " United States | \n",
- " Life Science | \n",
+ " 2657 | \n",
+ " fairsharing-records | \n",
+ " 2018-08-13T15:12:11.000Z | \n",
+ " 2021-09-30T11:37:28.736Z | \n",
+ " 10.25504/FAIRsharing.tnByoG | \n",
+ " ClinicalStudyDataRequest.com | \n",
+ " ready | \n",
+ " [{'contact-email': 'support@clinicalstudydatar... | \n",
+ " https://clinicalstudydatarequest.com/ | \n",
+ " 2657 | \n",
+ " ClinicalStudyDataRequest.com (CSDR) is a conso... | \n",
+ " [{'url': 'https://clinicalstudydatarequest.com... | \n",
+ " 2014.0 | \n",
+ " [{'url': 'https://clinicalstudydatarequest.com... | \n",
+ " [biodbcore-001149, bsg-d001149] | \n",
+ " Database | \n",
+ " repository | \n",
+ " [Preclinical Studies, Biomedical Science] | \n",
+ " [] | \n",
+ " [Homo sapiens] | \n",
+ " [] | \n",
+ " [Worldwide] | \n",
+ " FAIRsharing record for: ClinicalStudyDataReque... | \n",
+ " CSDR | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.t... | \n",
+ " 10.25504/FAIRsharing.tnByoG | \n",
+ " https://creativecommons.org/licenses/by-sa/4.0... | \n",
+ " This FAIRsharing record describes: ClinicalStu... | \n",
+ " [] | \n",
+ " [{'licence-name': 'CSDR Data Sharing Agreement... | \n",
+ " NaN | \n",
+ " CSDR | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
" 4 | \n",
- " Animal Transcription Factor Database | \n",
- " AnimalTFDB | \n",
- " https://fairsharing.org/10.25504/FAIRsharing.e... | \n",
- " http://bioinfo.life.hust.edu.cn/AnimalTFDB/ | \n",
- " China | \n",
- " Life Science | \n",
+ " 2078 | \n",
+ " fairsharing-records | \n",
+ " 2014-11-04T15:23:40.000Z | \n",
+ " 2021-09-30T11:34:43.129Z | \n",
+ " 10.25504/FAIRsharing.3axym7 | \n",
+ " Germplasm Resources Information Network | \n",
+ " ready | \n",
+ " [{'contact-email': 'dbmu@ars-grin.gov'}] | \n",
+ " https://www.ars-grin.gov/ | \n",
+ " 2078 | \n",
+ " GRIN provides National Genetic Resources Progr... | \n",
+ " [{'url': 'https://www.ars-grin.gov/Pages/Colle... | \n",
+ " 2010.0 | \n",
+ " [{'url': 'https://www.ars-grin.gov/', 'name': ... | \n",
+ " [biodbcore-000546, bsg-d000546] | \n",
+ " Database | \n",
+ " repository | \n",
+ " [Life Science] | \n",
+ " [Cell, Cell culture, Germplasm] | \n",
+ " [Bacteria, Metazoa, Viridiplantae] | \n",
+ " [] | \n",
+ " [United States] | \n",
+ " FAIRsharing record for: Germplasm Resources In... | \n",
+ " GRIN | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.3... | \n",
+ " 10.25504/FAIRsharing.3axym7 | \n",
+ " https://creativecommons.org/licenses/by-sa/4.0... | \n",
+ " This FAIRsharing record describes: GRIN provid... | \n",
+ " [] | \n",
+ " [] | \n",
+ " NaN | \n",
+ " GRIN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " full_name short_name \\\n",
- "0 GenBank GenBank \n",
- "1 GlycoNAVI GlycoNAVI \n",
- "2 ADHDgene ADHDgene \n",
- "3 Allele frequency resource for research and tea... ALFRED \n",
- "4 Animal Transcription Factor Database AnimalTFDB \n",
+ " id type attributes.created-at \\\n",
+ "0 1723 fairsharing-records 2014-11-04T15:23:40.000Z \n",
+ "1 3101 fairsharing-records 2020-09-16T08:49:13.000Z \n",
+ "2 2649 fairsharing-records 2018-08-07T20:23:32.000Z \n",
+ "3 2657 fairsharing-records 2018-08-13T15:12:11.000Z \n",
+ "4 2078 fairsharing-records 2014-11-04T15:23:40.000Z \n",
"\n",
- " fs_url \\\n",
- "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
- "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
- "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
- "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
- "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
+ " attributes.updated-at attributes.metadata.doi \\\n",
+ "0 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te \n",
+ "1 2021-09-30T11:36:45.452Z NaN \n",
+ "2 2021-09-30T11:39:07.898Z NaN \n",
+ "3 2021-09-30T11:37:28.736Z 10.25504/FAIRsharing.tnByoG \n",
+ "4 2021-09-30T11:34:43.129Z 10.25504/FAIRsharing.3axym7 \n",
"\n",
- " url \\\n",
- "0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
- "1 https://glyconavi.org/ \n",
- "2 http://adhd.psych.ac.cn/ \n",
- "3 http://alfred.med.yale.edu \n",
- "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
+ " attributes.metadata.name attributes.metadata.status \\\n",
+ "0 Cell Image Library ready \n",
+ "1 WHOI Ship Data-Grabber System ready \n",
+ "2 Electron Microscope Public Image Archive ready \n",
+ "3 ClinicalStudyDataRequest.com ready \n",
+ "4 Germplasm Resources Information Network ready \n",
"\n",
- " countries \\\n",
- "0 European Union,Japan,United States \n",
- "1 Japan \n",
- "2 China \n",
- "3 United States \n",
- "4 China \n",
+ " attributes.metadata.contacts \\\n",
+ "0 [{'contact-name': 'David Orloff', 'contact-ema... \n",
+ "1 NaN \n",
+ "2 [{'contact-name': 'General contact', 'contact-... \n",
+ "3 [{'contact-email': 'support@clinicalstudydatar... \n",
+ "4 [{'contact-email': 'dbmu@ars-grin.gov'}] \n",
"\n",
- " subjects \n",
- "0 Bioinformatics,Data Management,Data Submission... \n",
- "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
- "2 Biomedical Science,Genetics \n",
- "3 Life Science \n",
- "4 Life Science "
+ " attributes.metadata.homepage \\\n",
+ "0 http://www.cellimagelibrary.org \n",
+ "1 http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html \n",
+ "2 https://www.ebi.ac.uk/pdbe/emdb/empiar/ \n",
+ "3 https://clinicalstudydatarequest.com/ \n",
+ "4 https://www.ars-grin.gov/ \n",
+ "\n",
+ " attributes.metadata.identifier \\\n",
+ "0 1723 \n",
+ "1 3101 \n",
+ "2 2649 \n",
+ "3 2657 \n",
+ "4 2078 \n",
+ "\n",
+ " attributes.metadata.description \\\n",
+ "0 This library is a public and easily accessible... \n",
+ "1 The WHOI Ship DataGrabber system provides the ... \n",
+ "2 EMPIAR, the Electron Microscopy Public Image A... \n",
+ "3 ClinicalStudyDataRequest.com (CSDR) is a conso... \n",
+ "4 GRIN provides National Genetic Resources Progr... \n",
+ "\n",
+ " attributes.metadata.support-links \\\n",
+ "0 [{'url': 'http://www.cellimagelibrary.org/page... \n",
+ "1 [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... \n",
+ "2 [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... \n",
+ "3 [{'url': 'https://clinicalstudydatarequest.com... \n",
+ "4 [{'url': 'https://www.ars-grin.gov/Pages/Colle... \n",
+ "\n",
+ " attributes.metadata.year-creation \\\n",
+ "0 2010.0 \n",
+ "1 2004.0 \n",
+ "2 2015.0 \n",
+ "3 2014.0 \n",
+ "4 2010.0 \n",
+ "\n",
+ " attributes.metadata.data-processes \\\n",
+ "0 [{'name': 'live update', 'type': 'data release... \n",
+ "1 [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... \n",
+ "2 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... \n",
+ "3 [{'url': 'https://clinicalstudydatarequest.com... \n",
+ "4 [{'url': 'https://www.ars-grin.gov/', 'name': ... \n",
+ "\n",
+ " attributes.legacy-ids attributes.fairsharing-registry \\\n",
+ "0 [biodbcore-000180, bsg-d000180] Database \n",
+ "1 [biodbcore-001609, bsg-d001609] Database \n",
+ "2 [biodbcore-001140, bsg-d001140] Database \n",
+ "3 [biodbcore-001149, bsg-d001149] Database \n",
+ "4 [biodbcore-000546, bsg-d000546] Database \n",
+ "\n",
+ " attributes.record-type attributes.subjects \\\n",
+ "0 repository [Cell Biology, Life Science] \n",
+ "1 repository [Earth Science, Water Research, Oceanography] \n",
+ "2 repository [Bioinformatics, Biology] \n",
+ "3 repository [Preclinical Studies, Biomedical Science] \n",
+ "4 repository [Life Science] \n",
+ "\n",
+ " attributes.domains \\\n",
+ "0 [Cell, Microscopy, Light microscopy, Electron ... \n",
+ "1 [] \n",
+ "2 [Protein image, Microscopy, Electron microscop... \n",
+ "3 [] \n",
+ "4 [Cell, Cell culture, Germplasm] \n",
+ "\n",
+ " attributes.taxonomies attributes.user-defined-tags \\\n",
+ "0 [All] [] \n",
+ "1 [Not applicable] [subseafloor environments] \n",
+ "2 [All] [] \n",
+ "3 [Homo sapiens] [] \n",
+ "4 [Bacteria, Metazoa, Viridiplantae] [] \n",
+ "\n",
+ " attributes.countries \\\n",
+ "0 [United States] \n",
+ "1 [United States] \n",
+ "2 [Greece, Czech Republic, United Kingdom, Icela... \n",
+ "3 [Worldwide] \n",
+ "4 [United States] \n",
+ "\n",
+ " attributes.name attributes.abbreviation \\\n",
+ "0 FAIRsharing record for: Cell Image Library None \n",
+ "1 FAIRsharing record for: WHOI Ship Data-Grabber... None \n",
+ "2 FAIRsharing record for: Electron Microscope Pu... EMPIAR \n",
+ "3 FAIRsharing record for: ClinicalStudyDataReque... CSDR \n",
+ "4 FAIRsharing record for: Germplasm Resources In... GRIN \n",
+ "\n",
+ " attributes.url \\\n",
+ "0 https://fairsharing.org/10.25504/FAIRsharing.8... \n",
+ "1 https://fairsharing.org/fairsharing_records/3101 \n",
+ "2 https://fairsharing.org/fairsharing_records/2649 \n",
+ "3 https://fairsharing.org/10.25504/FAIRsharing.t... \n",
+ "4 https://fairsharing.org/10.25504/FAIRsharing.3... \n",
+ "\n",
+ " attributes.doi \\\n",
+ "0 10.25504/FAIRsharing.8t18te \n",
+ "1 None \n",
+ "2 None \n",
+ "3 10.25504/FAIRsharing.tnByoG \n",
+ "4 10.25504/FAIRsharing.3axym7 \n",
+ "\n",
+ " attributes.fairsharing-licence \\\n",
+ "0 https://creativecommons.org/licenses/by-sa/4.0... \n",
+ "1 https://creativecommons.org/licenses/by-sa/4.0... \n",
+ "2 https://creativecommons.org/licenses/by-sa/4.0... \n",
+ "3 https://creativecommons.org/licenses/by-sa/4.0... \n",
+ "4 https://creativecommons.org/licenses/by-sa/4.0... \n",
+ "\n",
+ " attributes.description \\\n",
+ "0 This FAIRsharing record describes: This librar... \n",
+ "1 This FAIRsharing record describes: The WHOI Sh... \n",
+ "2 This FAIRsharing record describes: EMPIAR, the... \n",
+ "3 This FAIRsharing record describes: ClinicalStu... \n",
+ "4 This FAIRsharing record describes: GRIN provid... \n",
+ "\n",
+ " attributes.publications \\\n",
+ "0 [{'id': 232, 'pubmed_id': 23203874, 'title': '... \n",
+ "1 [] \n",
+ "2 [{'id': 2232, 'pubmed_id': 27067018, 'title': ... \n",
+ "3 [] \n",
+ "4 [] \n",
+ "\n",
+ " attributes.licence-links \\\n",
+ "0 [{'licence-name': 'Cell Image Library Data Pol... \n",
+ "1 [{'licence-name': 'NDSF Data Archive Policy', ... \n",
+ "2 [{'licence-name': 'EMBL-EBI Terms of Use', 'li... \n",
+ "3 [{'licence-name': 'CSDR Data Sharing Agreement... \n",
+ "4 [] \n",
+ "\n",
+ " attributes.metadata.citations \\\n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "\n",
+ " attributes.metadata.abbreviation \\\n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 EMPIAR \n",
+ "3 CSDR \n",
+ "4 GRIN \n",
+ "\n",
+ " attributes.metadata.access-points \\\n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "\n",
+ " attributes.metadata.associated-tools \\\n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "\n",
+ " attributes.metadata.deprecation-date attributes.metadata.deprecation-reason \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
+ "\n",
+ " attributes.metadata.tombstone \n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 NaN \n",
+ "3 NaN \n",
+ "4 NaN "
]
},
- "execution_count": 2,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n",
- " delimiter='|', header=0,\n",
- " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n",
+ "with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:\n",
+ " lines = f.read().splitlines()\n",
+ " \n",
+ "fairsharing_df = pd.DataFrame(lines)\n",
+ "fairsharing_df.columns = ['json_element']\n",
+ "fairsharing_df['json_element'].apply(json.loads)\n",
+ "fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))\n",
+ "\n",
"fairsharing_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -187,76 +539,831 @@
" \n",
" \n",
" | \n",
- " full_name | \n",
- " short_name | \n",
- " fs_url | \n",
- " url | \n",
- " countries | \n",
- " subjects | \n",
+ " id | \n",
+ " type | \n",
+ " attributes.created-at | \n",
+ " attributes.updated-at | \n",
+ " attributes.metadata.doi | \n",
+ " attributes.metadata.name | \n",
+ " attributes.metadata.status | \n",
+ " attributes.metadata.contacts | \n",
+ " attributes.metadata.homepage | \n",
+ " attributes.metadata.identifier | \n",
+ " attributes.metadata.description | \n",
+ " attributes.metadata.support-links | \n",
+ " attributes.metadata.year-creation | \n",
+ " attributes.metadata.data-processes | \n",
+ " attributes.legacy-ids | \n",
+ " attributes.fairsharing-registry | \n",
+ " attributes.record-type | \n",
+ " attributes.subjects | \n",
+ " attributes.domains | \n",
+ " attributes.taxonomies | \n",
+ " attributes.user-defined-tags | \n",
+ " attributes.countries | \n",
+ " attributes.name | \n",
+ " attributes.abbreviation | \n",
+ " attributes.url | \n",
+ " attributes.doi | \n",
+ " attributes.fairsharing-licence | \n",
+ " attributes.description | \n",
+ " attributes.publications | \n",
+ " attributes.licence-links | \n",
+ " attributes.metadata.citations | \n",
+ " attributes.metadata.abbreviation | \n",
+ " attributes.metadata.access-points | \n",
+ " attributes.metadata.associated-tools | \n",
+ " attributes.metadata.deprecation-date | \n",
+ " attributes.metadata.deprecation-reason | \n",
+ " attributes.metadata.tombstone | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
- " 1752 | \n",
- " 1752 | \n",
- " 1752 | \n",
- " 1752 | \n",
- " 1749 | \n",
- " 1690 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1354 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1678 | \n",
+ " 1797 | \n",
+ " 1797.000000 | \n",
+ " 1797 | \n",
+ " 1608 | \n",
+ " 1492.000000 | \n",
+ " 1565 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1638 | \n",
+ " 1797 | \n",
+ " 1354 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 1797 | \n",
+ " 326 | \n",
+ " 1638 | \n",
+ " 449 | \n",
+ " 618 | \n",
+ " 217 | \n",
+ " 217 | \n",
+ " 1 | \n",
"
\n",
" \n",
" unique | \n",
- " 1752 | \n",
- " 1741 | \n",
- " 1752 | \n",
- " 1752 | \n",
- " 178 | \n",
- " 834 | \n",
+ " 1797 | \n",
+ " 1 | \n",
+ " 1162 | \n",
+ " 1797 | \n",
+ " 1354 | \n",
+ " 1796 | \n",
+ " 4 | \n",
+ " 1576 | \n",
+ " 1797 | \n",
+ " NaN | \n",
+ " 1797 | \n",
+ " 1594 | \n",
+ " NaN | \n",
+ " 1563 | \n",
+ " 1797 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 888 | \n",
+ " 1163 | \n",
+ " 378 | \n",
+ " 384 | \n",
+ " 185 | \n",
+ " 1796 | \n",
+ " 1626 | \n",
+ " 1797 | \n",
+ " 1354 | \n",
+ " 1 | \n",
+ " 1797 | \n",
+ " 1109 | \n",
+ " 1082 | \n",
+ " 320 | \n",
+ " 1626 | \n",
+ " 444 | \n",
+ " 615 | \n",
+ " 55 | \n",
+ " 86 | \n",
+ " 1 | \n",
"
\n",
" \n",
" top | \n",
- " Brassica Information Portal | \n",
+ " 1723 | \n",
+ " fairsharing-records | \n",
+ " 2014-11-04T15:23:40.000Z | \n",
+ " 2021-09-30T11:39:06.829Z | \n",
+ " 10.25504/FAIRsharing.8t18te | \n",
+ " OmicsDB | \n",
+ " ready | \n",
+ " [{'contact-name': 'Sam Hokin', 'contact-email'... | \n",
+ " http://www.cellimagelibrary.org | \n",
+ " NaN | \n",
+ " This library is a public and easily accessible... | \n",
+ " [{'url': 'https://github.com/gbif/ipt/wiki/IPT... | \n",
+ " NaN | \n",
+ " [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... | \n",
+ " [biodbcore-000180, bsg-d000180] | \n",
+ " Database | \n",
+ " repository | \n",
+ " [Life Science] | \n",
+ " [] | \n",
+ " [All] | \n",
+ " [] | \n",
+ " [United States] | \n",
+ " FAIRsharing record for: OmicsDB | \n",
" CGD | \n",
- " https://fairsharing.org/10.25504/FAIRsharing.e... | \n",
- " http://web.iodp.tamu.edu/LORE/ | \n",
- " United States | \n",
- " Life Science | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.8... | \n",
+ " 10.25504/FAIRsharing.8t18te | \n",
+ " https://creativecommons.org/licenses/by-sa/4.0... | \n",
+ " This FAIRsharing record describes: This librar... | \n",
+ " [] | \n",
+ " [] | \n",
+ " [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... | \n",
+ " CGD | \n",
+ " [{'url': 'https://github.com/Ensembl', 'name':... | \n",
+ " [{'url': 'http://www.h-invitational.jp/hinv/bl... | \n",
+ " 2021-9-17 | \n",
+ " This resource is no longer available at the st... | \n",
+ " True | \n",
"
\n",
" \n",
" freq | \n",
" 1 | \n",
+ " 1797 | \n",
+ " 636 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1540 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1797 | \n",
+ " 926 | \n",
+ " 350 | \n",
+ " 265 | \n",
+ " 502 | \n",
+ " 1193 | \n",
+ " 594 | \n",
+ " 2 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
- " 588 | \n",
- " 367 | \n",
+ " 1797 | \n",
+ " 1 | \n",
+ " 661 | \n",
+ " 716 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 84 | \n",
+ " 113 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2446.100167 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2007.636059 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 520.058757 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 10.953269 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1547.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1894.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1996.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2004.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2445.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2010.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2897.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2014.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 3346.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " full_name short_name \\\n",
- "count 1752 1752 \n",
- "unique 1752 1741 \n",
- "top Brassica Information Portal CGD \n",
- "freq 1 3 \n",
+ " id type attributes.created-at \\\n",
+ "count 1797 1797 1797 \n",
+ "unique 1797 1 1162 \n",
+ "top 1723 fairsharing-records 2014-11-04T15:23:40.000Z \n",
+ "freq 1 1797 636 \n",
+ "mean NaN NaN NaN \n",
+ "std NaN NaN NaN \n",
+ "min NaN NaN NaN \n",
+ "25% NaN NaN NaN \n",
+ "50% NaN NaN NaN \n",
+ "75% NaN NaN NaN \n",
+ "max NaN NaN NaN \n",
"\n",
- " fs_url \\\n",
- "count 1752 \n",
- "unique 1752 \n",
- "top https://fairsharing.org/10.25504/FAIRsharing.e... \n",
+ " attributes.updated-at attributes.metadata.doi \\\n",
+ "count 1797 1354 \n",
+ "unique 1797 1354 \n",
+ "top 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te \n",
+ "freq 1 1 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " attributes.metadata.name attributes.metadata.status \\\n",
+ "count 1797 1797 \n",
+ "unique 1796 4 \n",
+ "top OmicsDB ready \n",
+ "freq 2 1540 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " attributes.metadata.contacts \\\n",
+ "count 1678 \n",
+ "unique 1576 \n",
+ "top [{'contact-name': 'Sam Hokin', 'contact-email'... \n",
+ "freq 6 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.homepage attributes.metadata.identifier \\\n",
+ "count 1797 1797.000000 \n",
+ "unique 1797 NaN \n",
+ "top http://www.cellimagelibrary.org NaN \n",
+ "freq 1 NaN \n",
+ "mean NaN 2446.100167 \n",
+ "std NaN 520.058757 \n",
+ "min NaN 1547.000000 \n",
+ "25% NaN 1996.000000 \n",
+ "50% NaN 2445.000000 \n",
+ "75% NaN 2897.000000 \n",
+ "max NaN 3346.000000 \n",
+ "\n",
+ " attributes.metadata.description \\\n",
+ "count 1797 \n",
+ "unique 1797 \n",
+ "top This library is a public and easily accessible... \n",
"freq 1 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
"\n",
- " url countries subjects \n",
- "count 1752 1749 1690 \n",
- "unique 1752 178 834 \n",
- "top http://web.iodp.tamu.edu/LORE/ United States Life Science \n",
- "freq 1 588 367 "
+ " attributes.metadata.support-links \\\n",
+ "count 1608 \n",
+ "unique 1594 \n",
+ "top [{'url': 'https://github.com/gbif/ipt/wiki/IPT... \n",
+ "freq 6 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.year-creation \\\n",
+ "count 1492.000000 \n",
+ "unique NaN \n",
+ "top NaN \n",
+ "freq NaN \n",
+ "mean 2007.636059 \n",
+ "std 10.953269 \n",
+ "min 1894.000000 \n",
+ "25% 2004.000000 \n",
+ "50% 2010.000000 \n",
+ "75% 2014.000000 \n",
+ "max 2021.000000 \n",
+ "\n",
+ " attributes.metadata.data-processes \\\n",
+ "count 1565 \n",
+ "unique 1563 \n",
+ "top [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... \n",
+ "freq 2 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.legacy-ids attributes.fairsharing-registry \\\n",
+ "count 1797 1797 \n",
+ "unique 1797 1 \n",
+ "top [biodbcore-000180, bsg-d000180] Database \n",
+ "freq 1 1797 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " attributes.record-type attributes.subjects attributes.domains \\\n",
+ "count 1797 1797 1797 \n",
+ "unique 3 888 1163 \n",
+ "top repository [Life Science] [] \n",
+ "freq 926 350 265 \n",
+ "mean NaN NaN NaN \n",
+ "std NaN NaN NaN \n",
+ "min NaN NaN NaN \n",
+ "25% NaN NaN NaN \n",
+ "50% NaN NaN NaN \n",
+ "75% NaN NaN NaN \n",
+ "max NaN NaN NaN \n",
+ "\n",
+ " attributes.taxonomies attributes.user-defined-tags \\\n",
+ "count 1797 1797 \n",
+ "unique 378 384 \n",
+ "top [All] [] \n",
+ "freq 502 1193 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " attributes.countries attributes.name \\\n",
+ "count 1797 1797 \n",
+ "unique 185 1796 \n",
+ "top [United States] FAIRsharing record for: OmicsDB \n",
+ "freq 594 2 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " attributes.abbreviation \\\n",
+ "count 1638 \n",
+ "unique 1626 \n",
+ "top CGD \n",
+ "freq 3 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.url \\\n",
+ "count 1797 \n",
+ "unique 1797 \n",
+ "top https://fairsharing.org/10.25504/FAIRsharing.8... \n",
+ "freq 1 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.doi \\\n",
+ "count 1354 \n",
+ "unique 1354 \n",
+ "top 10.25504/FAIRsharing.8t18te \n",
+ "freq 1 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.fairsharing-licence \\\n",
+ "count 1797 \n",
+ "unique 1 \n",
+ "top https://creativecommons.org/licenses/by-sa/4.0... \n",
+ "freq 1797 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.description \\\n",
+ "count 1797 \n",
+ "unique 1797 \n",
+ "top This FAIRsharing record describes: This librar... \n",
+ "freq 1 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.publications attributes.licence-links \\\n",
+ "count 1797 1797 \n",
+ "unique 1109 1082 \n",
+ "top [] [] \n",
+ "freq 661 716 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " attributes.metadata.citations \\\n",
+ "count 326 \n",
+ "unique 320 \n",
+ "top [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... \n",
+ "freq 6 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.abbreviation \\\n",
+ "count 1638 \n",
+ "unique 1626 \n",
+ "top CGD \n",
+ "freq 3 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.access-points \\\n",
+ "count 449 \n",
+ "unique 444 \n",
+ "top [{'url': 'https://github.com/Ensembl', 'name':... \n",
+ "freq 3 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.associated-tools \\\n",
+ "count 618 \n",
+ "unique 615 \n",
+ "top [{'url': 'http://www.h-invitational.jp/hinv/bl... \n",
+ "freq 2 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.deprecation-date \\\n",
+ "count 217 \n",
+ "unique 55 \n",
+ "top 2021-9-17 \n",
+ "freq 84 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.deprecation-reason \\\n",
+ "count 217 \n",
+ "unique 86 \n",
+ "top This resource is no longer available at the st... \n",
+ "freq 113 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " attributes.metadata.tombstone \n",
+ "count 1 \n",
+ "unique 1 \n",
+ "top True \n",
+ "freq 1 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN "
]
},
- "execution_count": 3,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -267,22 +1374,53 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "full_name 0\n",
- "short_name 0\n",
- "fs_url 0\n",
- "url 0\n",
- "countries 3\n",
- "subjects 62\n",
+ "id 0\n",
+ "type 0\n",
+ "attributes.created-at 0\n",
+ "attributes.updated-at 0\n",
+ "attributes.metadata.doi 443\n",
+ "attributes.metadata.name 0\n",
+ "attributes.metadata.status 0\n",
+ "attributes.metadata.contacts 119\n",
+ "attributes.metadata.homepage 0\n",
+ "attributes.metadata.identifier 0\n",
+ "attributes.metadata.description 0\n",
+ "attributes.metadata.support-links 189\n",
+ "attributes.metadata.year-creation 305\n",
+ "attributes.metadata.data-processes 232\n",
+ "attributes.legacy-ids 0\n",
+ "attributes.fairsharing-registry 0\n",
+ "attributes.record-type 0\n",
+ "attributes.subjects 0\n",
+ "attributes.domains 0\n",
+ "attributes.taxonomies 0\n",
+ "attributes.user-defined-tags 0\n",
+ "attributes.countries 0\n",
+ "attributes.name 0\n",
+ "attributes.abbreviation 159\n",
+ "attributes.url 0\n",
+ "attributes.doi 443\n",
+ "attributes.fairsharing-licence 0\n",
+ "attributes.description 0\n",
+ "attributes.publications 0\n",
+ "attributes.licence-links 0\n",
+ "attributes.metadata.citations 1471\n",
+ "attributes.metadata.abbreviation 159\n",
+ "attributes.metadata.access-points 1348\n",
+ "attributes.metadata.associated-tools 1179\n",
+ "attributes.metadata.deprecation-date 1580\n",
+ "attributes.metadata.deprecation-reason 1580\n",
+ "attributes.metadata.tombstone 1796\n",
"dtype: int64"
]
},
- "execution_count": 4,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -291,6 +1429,30 @@
"fairsharing_df.isna().sum()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "attributes.record-type\n",
+ "knowledgebase 774\n",
+ "knowledgebase_and_repository 97\n",
+ "repository 926\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fairsharing_df.groupby('attributes.record-type').size()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,