diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb
index b79a854..dcdad8e 100644
--- a/notebooks/01.2-exploration-opendoar.ipynb
+++ b/notebooks/01.2-exploration-opendoar.ipynb
@@ -34,13 +34,6 @@
"## Loading datasets"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**re3data**"
- ]
- },
{
"cell_type": "code",
"execution_count": 3,
@@ -67,105 +60,206 @@
" \n",
" \n",
" | \n",
- " re3data_id | \n",
+ " openaire_id | \n",
+ " opendoar_id | \n",
" repository_name | \n",
+ " additional_name | \n",
+ " repository_url | \n",
+ " description | \n",
" type | \n",
+ " update_date | \n",
+ " start_date | \n",
" subject | \n",
- " provider_type | \n",
- " keyword | \n",
+ " content_type | \n",
" institution | \n",
+ " metadata_policy | \n",
+ " data_policy | \n",
+ " submission_policy | \n",
+ " content_policy | \n",
+ " software | \n",
+ " api | \n",
"
\n",
" \n",
"
\n",
" \n",
" 0 | \n",
- " r3d100000001 | \n",
- " Odum Institute Archive Dataverse | \n",
- " [disciplinary] | \n",
- " [1 Humanities and Social Sciences, 111 Social ... | \n",
- " [dataProvider] | \n",
- " [FAIR, Middle East, crime, demography, economy... | \n",
- " [[Odum Institute for Research in Social Scienc... | \n",
+ " opendoar____::38b3eff8baf56627478ec76a704e9b52 | \n",
+ " 101 | \n",
+ " utrecht university repository | \n",
+ " [] | \n",
+ " http://dspace.library.uu.nl | \n",
+ " this site is a university repository providing... | \n",
+ " institutional | \n",
+ " 2021-04-16 15:22:03 | \n",
+ " 2006-01-13 12:55:13 | \n",
+ " [multidisciplinary] | \n",
+ " [journal_articles, conference_and_workshop_pap... | \n",
+ " [[university of utrecht, [universiteit utrecht... | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " dspace | \n",
+ " true | \n",
"
\n",
" \n",
" 1 | \n",
- " r3d100000002 | \n",
- " Access to Archival Databases | \n",
- " [disciplinary] | \n",
- " [1 Humanities and Social Sciences, 102 History... | \n",
- " [dataProvider] | \n",
- " [US History] | \n",
- " [[The U.S. National Archives and Records Admin... | \n",
+ " opendoar____::2b44928ae11fb9384c4cf38708677c48 | \n",
+ " 115 | \n",
+ " dspace at indian institute of management kozhi... | \n",
+ " [dspace@iimk] | \n",
+ " http://dspace.iimk.ac.in/ | \n",
+ " this site is a subject based university reposi... | \n",
+ " institutional | \n",
+ " 2021-02-18 17:36:43 | \n",
+ " 2006-01-04 11:54:34 | \n",
+ " [ecology and environment, social sciences gene... | \n",
+ " [journal_articles, conference_and_workshop_pap... | \n",
+ " [[indian institute of management kozhikode, [i... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " dspace 4.1 | \n",
+ " true | \n",
"
\n",
" \n",
" 2 | \n",
- " r3d100000004 | \n",
- " Datenbank Gesprochenes Deutsch | \n",
- " [disciplinary] | \n",
- " [1 Humanities and Social Sciences, 104 Linguis... | \n",
- " [dataProvider, serviceProvider] | \n",
- " [Australian German, FOLK, German dialects, Pfe... | \n",
- " [[Institut für Deutsche Sprache, Archiv für Ge... | \n",
+ " opendoar____::3416a75f4cea9109507cacd8e2f2aefc | \n",
+ " 41 | \n",
+ " caltech engineering and science online | \n",
+ " [] | \n",
+ " http://calteches.library.caltech.edu/ | \n",
+ " the caltech archives holds approximately 220 c... | \n",
+ " institutional | \n",
+ " 2021-02-18 17:36:28 | \n",
+ " 2006-01-04 14:47:04 | \n",
+ " [biology and biochemistry, chemistry and chemi... | \n",
+ " [journal_articles, conference_and_workshop_pap... | \n",
+ " [[california institute of technology, [caltech... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " eprints 3.1.3 | \n",
+ " true | \n",
"
\n",
" \n",
" 3 | \n",
- " r3d100000005 | \n",
- " UNC Dataverse | \n",
- " [institutional] | \n",
- " [1 Humanities and Social Sciences, 111 Social ... | \n",
- " [dataProvider, serviceProvider] | \n",
- " [FAIR, census, demographic survey, demography,... | \n",
- " [[Odum Institute for Research in Social Scienc... | \n",
+ " opendoar____::07e1cd7dca89a1678042477183b7ac3f | \n",
+ " 119 | \n",
+ " dcu online research access service | \n",
+ " [doras] | \n",
+ " http://doras.dcu.ie/ | \n",
+ " this site is an institutional repository provi... | \n",
+ " institutional | \n",
+ " 2021-02-18 17:36:44 | \n",
+ " 2006-01-04 11:15:19 | \n",
+ " [multidisciplinary] | \n",
+ " [journal_articles, conference_and_workshop_pap... | \n",
+ " [[dublin city university, [dcu], ie, [], , htt... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " eprints 3.0.5 | \n",
+ " true | \n",
"
\n",
" \n",
" 4 | \n",
- " r3d100000006 | \n",
- " Archaeology Data Service | \n",
- " [disciplinary] | \n",
- " [1 Humanities and Social Sciences, 101 Ancient... | \n",
- " [dataProvider, serviceProvider] | \n",
- " [FAIR, archaeology, cultural heritage, prehist... | \n",
- " [[Arts and Humanities Research Council, [AHRC]... | \n",
+ " opendoar____::d1f491a404d6854880943e5c3cd9ca25 | \n",
+ " 129 | \n",
+ " earth-prints repository | \n",
+ " [] | \n",
+ " http://www.earth-prints.org/ | \n",
+ " a subject based repository providing open acce... | \n",
+ " disciplinary | \n",
+ " 2021-04-19 08:28:38 | \n",
+ " 2006-01-30 16:43:11 | \n",
+ " [earth and planetary sciences] | \n",
+ " [journal_articles, conference_and_workshop_pap... | \n",
+ " [[istituto nazionale di geofisica e vulcanolog... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " dspace 5.8.1-snapshot | \n",
+ " true | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " re3data_id repository_name type \\\n",
- "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
- "1 r3d100000002 Access to Archival Databases [disciplinary] \n",
- "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
- "3 r3d100000005 UNC Dataverse [institutional] \n",
- "4 r3d100000006 Archaeology Data Service [disciplinary] \n",
+ " openaire_id opendoar_id \\\n",
+ "0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 \n",
+ "1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 \n",
+ "2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 \n",
+ "3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 \n",
+ "4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 \n",
+ "\n",
+ " repository_name additional_name \\\n",
+ "0 utrecht university repository [] \n",
+ "1 dspace at indian institute of management kozhi... [dspace@iimk] \n",
+ "2 caltech engineering and science online [] \n",
+ "3 dcu online research access service [doras] \n",
+ "4 earth-prints repository [] \n",
+ "\n",
+ " repository_url \\\n",
+ "0 http://dspace.library.uu.nl \n",
+ "1 http://dspace.iimk.ac.in/ \n",
+ "2 http://calteches.library.caltech.edu/ \n",
+ "3 http://doras.dcu.ie/ \n",
+ "4 http://www.earth-prints.org/ \n",
+ "\n",
+ " description type \\\n",
+ "0 this site is a university repository providing... institutional \n",
+ "1 this site is a subject based university reposi... institutional \n",
+ "2 the caltech archives holds approximately 220 c... institutional \n",
+ "3 this site is an institutional repository provi... institutional \n",
+ "4 a subject based repository providing open acce... disciplinary \n",
+ "\n",
+ " update_date start_date \\\n",
+ "0 2021-04-16 15:22:03 2006-01-13 12:55:13 \n",
+ "1 2021-02-18 17:36:43 2006-01-04 11:54:34 \n",
+ "2 2021-02-18 17:36:28 2006-01-04 14:47:04 \n",
+ "3 2021-02-18 17:36:44 2006-01-04 11:15:19 \n",
+ "4 2021-04-19 08:28:38 2006-01-30 16:43:11 \n",
"\n",
" subject \\\n",
- "0 [1 Humanities and Social Sciences, 111 Social ... \n",
- "1 [1 Humanities and Social Sciences, 102 History... \n",
- "2 [1 Humanities and Social Sciences, 104 Linguis... \n",
- "3 [1 Humanities and Social Sciences, 111 Social ... \n",
- "4 [1 Humanities and Social Sciences, 101 Ancient... \n",
+ "0 [multidisciplinary] \n",
+ "1 [ecology and environment, social sciences gene... \n",
+ "2 [biology and biochemistry, chemistry and chemi... \n",
+ "3 [multidisciplinary] \n",
+ "4 [earth and planetary sciences] \n",
"\n",
- " provider_type \\\n",
- "0 [dataProvider] \n",
- "1 [dataProvider] \n",
- "2 [dataProvider, serviceProvider] \n",
- "3 [dataProvider, serviceProvider] \n",
- "4 [dataProvider, serviceProvider] \n",
+ " content_type \\\n",
+ "0 [journal_articles, conference_and_workshop_pap... \n",
+ "1 [journal_articles, conference_and_workshop_pap... \n",
+ "2 [journal_articles, conference_and_workshop_pap... \n",
+ "3 [journal_articles, conference_and_workshop_pap... \n",
+ "4 [journal_articles, conference_and_workshop_pap... \n",
"\n",
- " keyword \\\n",
- "0 [FAIR, Middle East, crime, demography, economy... \n",
- "1 [US History] \n",
- "2 [Australian German, FOLK, German dialects, Pfe... \n",
- "3 [FAIR, census, demographic survey, demography,... \n",
- "4 [FAIR, archaeology, cultural heritage, prehist... \n",
+ " institution metadata_policy \\\n",
+ "0 [[university of utrecht, [universiteit utrecht... True \n",
+ "1 [[indian institute of management kozhikode, [i... True \n",
+ "2 [[california institute of technology, [caltech... True \n",
+ "3 [[dublin city university, [dcu], ie, [], , htt... True \n",
+ "4 [[istituto nazionale di geofisica e vulcanolog... True \n",
"\n",
- " institution \n",
- "0 [[Odum Institute for Research in Social Scienc... \n",
- "1 [[The U.S. National Archives and Records Admin... \n",
- "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
- "3 [[Odum Institute for Research in Social Scienc... \n",
- "4 [[Arts and Humanities Research Council, [AHRC]... "
+ " data_policy submission_policy content_policy software \\\n",
+ "0 True False True dspace \n",
+ "1 True True True dspace 4.1 \n",
+ "2 True True True eprints 3.1.3 \n",
+ "3 True True True eprints 3.0.5 \n",
+ "4 True True True dspace 5.8.1-snapshot \n",
+ "\n",
+ " api \n",
+ "0 true \n",
+ "1 true \n",
+ "2 true \n",
+ "3 true \n",
+ "4 true "
]
},
"execution_count": 3,
@@ -173,262 +267,6 @@
"output_type": "execute_result"
}
],
- "source": [
- "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n",
- " converters={'subject': ast.literal_eval,\n",
- " 'keyword': ast.literal_eval,\n",
- " 'additional_name': ast.literal_eval,\n",
- " 'repository_id': ast.literal_eval,\n",
- " 'type': ast.literal_eval,\n",
- " 'content_type': ast.literal_eval,\n",
- " 'provider_type': ast.literal_eval,\n",
- " 'institution': ast.literal_eval\n",
- " },\n",
- " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n",
- "re3data_df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "re3data_df = re3data_df.explode('provider_type')\n",
- "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "