diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..6298079 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,5 @@ +--- +sidebar_position: 5 +--- + +# Public API \ No newline at end of file diff --git a/docs/data-model.md b/docs/data-model.md new file mode 100644 index 0000000..affd97e --- /dev/null +++ b/docs/data-model.md @@ -0,0 +1,5 @@ +--- +sidebar_position: 3 +--- + +# Data model \ No newline at end of file diff --git a/docs/data-sources.md b/docs/data-sources.md new file mode 100644 index 0000000..c4ad3fe --- /dev/null +++ b/docs/data-sources.md @@ -0,0 +1,5 @@ +--- +sidebar_position: 2 +--- + +# Data sources \ No newline at end of file diff --git a/docs/download.md b/docs/download.md new file mode 100644 index 0000000..5ac6ed0 --- /dev/null +++ b/docs/download.md @@ -0,0 +1,11 @@ +--- +sidebar_position: 4 +--- + +# Bulk downloads + +In order to facilitate users, different dumps are available. All are available under the Zenodo community called [OpenAIRE Research Graph](https://zenodo.org/communities/openaire-research-graph). +Here we provide detailed documentation about the full dump: + +* Json dump: https://doi.org/10.5281/zenodo.3516917 +* Json schema: https://doi.org/10.5281/zenodo.4238938 \ No newline at end of file diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 0000000..8530506 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,7 @@ +--- +sidebar_position: 8 +--- + +# FAQ + +https://support.openaire.eu/projects/docs/wiki/FAQ \ No newline at end of file diff --git a/docs/graph-provision/_category_.json b/docs/graph-provision/_category_.json new file mode 100644 index 0000000..1329c49 --- /dev/null +++ b/docs/graph-provision/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Graph provision", + "position": 6, + "link": { + "type": "generated-index", + "description": "5 minutes to learn the most important Docusaurus concepts." 
+ } +} \ No newline at end of file diff --git a/docs/graph-provision/assets/dedup-results.png b/docs/graph-provision/assets/dedup-results.png new file mode 100644 index 0000000..d8fdda2 Binary files /dev/null and b/docs/graph-provision/assets/dedup-results.png differ diff --git a/docs/graph-provision/deduplication/_category_.json b/docs/graph-provision/deduplication/_category_.json new file mode 100644 index 0000000..f67ffbe --- /dev/null +++ b/docs/graph-provision/deduplication/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Deduplication", + "position": 1, + "link": { + "type": "generated-index", + "description": "5 minutes to learn the most important Docusaurus concepts." + } +} \ No newline at end of file diff --git a/docs/graph-provision/deduplication/clustering-functions.md b/docs/graph-provision/deduplication/clustering-functions.md new file mode 100644 index 0000000..9fcbc31 --- /dev/null +++ b/docs/graph-provision/deduplication/clustering-functions.md @@ -0,0 +1,21 @@ +--- +sidebar_position: 3 +--- +# Clustering functions +TODO + +## NgramPairs +It produces a list of concatenations of a pair of ngrams generated from different words.
+*Example:*
+Input string: `“Search for the Standard Model Higgs Boson”`
+Parameters: ngram length = 3
+List of ngrams: `“sea”`, `“sta”`, `“mod”`, `“hig”`
+Ngram pairs: `“seasta”`, `“stamod”`, `“modhig”` + +## SuffixPrefix + +It produces ngram pairs in a particular way: it concatenates the suffix of a word with the prefix of the next word in the input string.
+*Example:*
+Input string: `“Search for the Standard Model Higgs Boson”`
+Parameters: suffix and prefix length = 3
+Output list: `“ardmod”` (suffix of the word `“Standard”` + prefix of the word `“Model”`), `“rchsta”` (suffix of the word `“Search”` + prefix of the word `“Standard”`) \ No newline at end of file diff --git a/docs/graph-provision/deduplication/organizations.md b/docs/graph-provision/deduplication/organizations.md new file mode 100644 index 0000000..6b43e54 --- /dev/null +++ b/docs/graph-provision/deduplication/organizations.md @@ -0,0 +1,6 @@ +--- +sidebar_position: 2 +--- + +# Organizations +TODO diff --git a/docs/graph-provision/deduplication/research-products.md b/docs/graph-provision/deduplication/research-products.md new file mode 100644 index 0000000..b673d92 --- /dev/null +++ b/docs/graph-provision/deduplication/research-products.md @@ -0,0 +1,49 @@ +--- +sidebar_position: 1 +--- + +# Research results + +Metadata records about the same scholarly work can be collected from different providers. Each metadata record can possibly carry different information because, for example, some providers are not aware of links to projects, keywords or other details. Another common case is when OpenAIRE collects one metadata record from a repository about a pre-print and another record from a journal about the published article. For the provision of statistics, OpenAIRE must identify those cases and “merge” the two metadata records, so that the scholarly work is counted only once in the statistics OpenAIRE produces. + +Duplicates among research results are identified among results of the same type (publications, datasets, software, other research products). If two duplicate results are aggregated one as a dataset and one as a software, for example, they will never be compared and they will never be identified as duplicates. +OpenAIRE supports different deduplication strategies based on the type of results. 
+ +## Methodology overview + +The deduplication process can be divided into three different phases: +* Candidate identification (clustering) +* Decision tree +* Creation of representative record + +The implementation of each phase is different based on the type of results that are being processed. + +### Publications + +#### Candidate identification (clustering) + +Clustering is a common heuristic used to overcome the N x N complexity required to match all pairs of objects to identify the equivalent ones. The challenge is to identify a [clustering function](./clustering-functions) that maximizes the chance of comparing only records that may lead to a match, while minimizing the number of records that will not be matched while being equivalent. Since the equivalence function is to some level tolerant to minimal errors (e.g. switching of characters in the title, or minimal difference in letters), we need this function to be not too precise (e.g. a hash of the title), but also not too flexible (e.g. random ngrams of the title). On the other hand, reality tells us that in some cases equality of two records can only be determined by their PIDs (e.g. DOI) as the metadata properties are very different across different versions and no [clustering function](./clustering-functions) will ever bring them into the same cluster. To match these requirements OpenAIRE clustering for products works with two functions: +DOI: the function generates the DOI when this is provided as part of the record properties; +Title-based function: the function generates a key that depends on (i) number of significant words in the title (normalized, stemming, etc.), (ii) modulo 10 of the number of characters of such words, and (iii) a string obtained as an alternation of the function prefix(3) and suffix(3) (and vice versa) of the first 3 words (2 words if the title only has 2). 
For example, the title “Entity deduplication in big data graphs for scholarly communication” becomes “entity deduplication big data graphs scholarly communication” with the two keys “7.1entionbig” and “7.1itydedbig” (where 1 is modulo 10 of 54 characters of the normalized title). + +#### Decision tree + +For each pair of publications in a cluster the following strategy (depicted in the figure below) is applied. +Cross comparison of the pid lists (in the `pid` and `alternateid` elements). If 50% common pids, Levenshtein distance on titles with low threshold (0.9). +Otherwise, check if the number of authors and the title version are equal. If so, Levenshtein distance on titles with higher threshold (0.99). +The publications are matched as duplicates if the distance is higher than the threshold; in every other case they are considered as distinct publications. + +![Example banner](../assets/dedup-results.png) + +#### Creation of representative record +TODO + +### Datasets +TODO + +### Software +TODO + +### Other types of research products +TODO + diff --git a/docs/graph-provision/inference/_category_.json b/docs/graph-provision/inference/_category_.json new file mode 100644 index 0000000..c1c56c4 --- /dev/null +++ b/docs/graph-provision/inference/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Inference and annotations", + "position": 2, + "link": { + "type": "generated-index", + "description": "5 minutes to learn the most important Docusaurus concepts." 
+ } +} \ No newline at end of file diff --git a/docs/graph-provision/inference/impact-scores.md b/docs/graph-provision/inference/impact-scores.md new file mode 100644 index 0000000..b38c5de --- /dev/null +++ b/docs/graph-provision/inference/impact-scores.md @@ -0,0 +1,5 @@ +--- +sidebar_position: 2 +--- + +# Impact scores \ No newline at end of file diff --git a/docs/graph-provision/inference/mining.md b/docs/graph-provision/inference/mining.md new file mode 100644 index 0000000..9c1f8f5 --- /dev/null +++ b/docs/graph-provision/inference/mining.md @@ -0,0 +1,6 @@ +--- +sidebar_position: 1 +--- + +# Mining algorithms +TODO diff --git a/docs/intro.md b/docs/intro.md index 8a2e69d..d5d3f65 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -2,46 +2,23 @@ sidebar_position: 1 --- -# Tutorial Intro +# Welcome! -Let's discover **Docusaurus in less than 5 minutes**. +The OpenAIRE Research Graph is one of the largest open scholarly record collections worldwide, key in fostering Open Science and establishing its practices in the daily research activities. +Conceived as a public and transparent good, populated out of data sources trusted by scientists, the Graph aims at bringing discovery, monitoring, and assessment of science back in the hands of the scientific community. -## Getting Started +Imagine a vast collection of research products all linked together, contextualised and openly available. For the past ten years OpenAIRE has been working to gather this valuable record. It is a massive collection of metadata and links between scientific products such as articles, datasets, software, and other research products, entities like organisations, funders, funding streams, projects, communities, and data sources. -Get started by **creating a new site**. 
+As of today, the OpenAIRE Research Graph aggregates around 450Mi metadata records with links collected from 10K data sources trusted by scientists, including: -Or **try Docusaurus immediately** with **[docusaurus.new](https://docusaurus.new)**. +* Repositories registered in OpenDOAR or re3data.org (soon FAIRSharing.org) +* Open Access journals registered in DOAJ +* Crossref +* Unpaywall +* ORCID +* Microsoft Academic Graph +* Datacite -### What you'll need - -- [Node.js](https://nodejs.org/en/download/) version 16.14 or above: - - When installing Node.js, you are recommended to check all checkboxes related to dependencies. - -## Generate a new site - -Generate a new Docusaurus site using the **classic template**. - -The classic template will automatically be added to your project after you run the command: - -```bash -npm init docusaurus@latest my-website classic -``` - -You can type this command into Command Prompt, Powershell, Terminal, or any other integrated terminal of your code editor. - -The command also installs all necessary dependencies you need to run Docusaurus. - -## Start your site - -Run the development server: - -```bash -cd my-website -npm run start -``` - -The `cd` command changes the directory you're working with. In order to work with your newly created Docusaurus site, you'll need to navigate the terminal there. - -The `npm run start` command builds your website locally and serves it through a development server, ready for you to view at http://localhost:3000/. - -Open `docs/intro.md` (this page) and edit some lines: the site **reloads automatically** and displays your changes. 
+After cleaning, deduplication, enrichment and full-text mining processes, the graph is analysed to produce statistics for the [OpenAIRE MONITOR](https://monitor.openaire.eu), the [Open Science Observatory](https://osobservatory.openaire.eu), made discoverable via the [OpenAIRE EXPLORE](https://explore.openaire.eu) and programmatically accessible as described at +https://develop.openaire.eu. +Json dumps are also published on Zenodo. diff --git a/docs/license.md b/docs/license.md new file mode 100644 index 0000000..2b4cac6 --- /dev/null +++ b/docs/license.md @@ -0,0 +1,5 @@ +--- +sidebar_position: 9 +--- + +# License \ No newline at end of file diff --git a/docs/services.md b/docs/services.md new file mode 100644 index 0000000..4534a3c --- /dev/null +++ b/docs/services.md @@ -0,0 +1,6 @@ +--- +sidebar_position: 7 +--- + +# Graph-based services +TODO diff --git a/docs/tutorial-basics/_category_.json b/docs/tutorial-basics/_category_.json index 2e6db55..6763b64 100644 --- a/docs/tutorial-basics/_category_.json +++ b/docs/tutorial-basics/_category_.json @@ -1,6 +1,6 @@ { "label": "Tutorial - Basics", - "position": 2, + "position": 15, "link": { "type": "generated-index", "description": "5 minutes to learn the most important Docusaurus concepts." 
diff --git a/docs/tutorial-extras/_category_.json b/docs/tutorial-extras/_category_.json index a8ffcc1..b7b9670 100644 --- a/docs/tutorial-extras/_category_.json +++ b/docs/tutorial-extras/_category_.json @@ -1,6 +1,6 @@ { "label": "Tutorial - Extras", - "position": 3, + "position": 16, "link": { "type": "generated-index" } diff --git a/docusaurus.config.js b/docusaurus.config.js index 3bb257c..2c32c3c 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -6,8 +6,8 @@ const darkCodeTheme = require('prism-react-renderer/themes/dracula'); /** @type {import('@docusaurus/types').Config} */ const config = { - title: 'OpenAIRE Docs', - tagline: 'Dinosaurs are cool', + title: 'OpenAIRE Documentation', + tagline: 'Open Access Infrastructure for Research in Europe', url: 'http://snf-23385.ok-kno.grnetcloud.net', baseUrl: '/openaire/', onBrokenLinks: 'throw', @@ -16,8 +16,8 @@ const config = { // GitHub pages deployment config. // If you aren't using GitHub pages, you don't need these. - organizationName: 'facebook', // Usually your GitHub org/user name. - projectName: 'docusaurus', // Usually your repo name. + organizationName: 'schatzopoulos', // Usually your GitHub org/user name. + projectName: 'openaire-docs', // Usually your repo name. // Even if you don't use internalization, you can use this field to set useful // metadata like html lang. For example, if your site is Chinese, you may want @@ -33,18 +33,16 @@ const config = { /** @type {import('@docusaurus/preset-classic').Options} */ ({ docs: { + routeBasePath: '/', // serve the docs at the site's route + sidebarPath: require.resolve('./sidebars.js'), // Please change this to your repo. // Remove this to remove the "edit this page" links. 
- editUrl: - 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', + // editUrl: + // 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', }, blog: { showReadingTime: true, - // Please change this to your repo. - // Remove this to remove the "edit this page" links. - editUrl: - 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', }, theme: { customCss: require.resolve('./src/css/custom.css'), @@ -57,30 +55,30 @@ const config = { /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ ({ navbar: { - title: 'OpenAIRE Docs', + // title: 'OpenAIRE Documentation', logo: { - alt: 'My Site Logo', - src: 'img/logo.svg', + alt: 'OpenAIRE', + src: 'img/logo.png', }, items: [ { type: 'doc', docId: 'intro', position: 'left', - label: 'Graph', - }, - { - type: 'doc', - docId: 'intro', - position: 'left', - label: 'Guidelines', - }, - {to: '/blog', label: 'Blog', position: 'left'}, - { - href: 'https://github.com/facebook/docusaurus', - label: 'GitHub', - position: 'right', + label: 'Research graph', }, + // { + // type: 'doc', + // docId: 'intro', + // position: 'left', + // label: 'docs', + // }, + // {to: '/blog', label: 'Blog', position: 'left'}, + // { + // href: 'https://github.com/facebook/docusaurus', + // label: 'GitHub', + // position: 'right', + // }, ], }, footer: { @@ -90,43 +88,60 @@ const config = { title: 'Docs', items: [ { - label: 'Tutorial', - to: '/docs/intro', + label: 'Research Graph', + to: '/docs/category/research-graph', + }, + ], + }, + { + title: 'Dashboards', + items: [ + { + label: 'Explore', + href: 'https://explore.openaire.eu/', + }, + { + label: 'Provide', + href: 'https://provide.openaire.eu/', + }, + { + label: 'Connect', + href: 'https://connect.openaire.eu/', + }, + { + label: 'Monitor', + href: 'https://monitor.openaire.eu/', + }, + { + label: 'Develop', + href: 'https://graph.openaire.eu/', }, ], }, { 
title: 'Community', items: [ - { - label: 'Stack Overflow', - href: 'https://stackoverflow.com/questions/tagged/docusaurus', + { + label: 'Facebook', + href: 'http://www.facebook.com/groups/openaire/' }, { - label: 'Discord', - href: 'https://discordapp.com/invite/docusaurus', + label: 'Linkedin', + href: 'https://www.linkedin.com/company/openaire-eu/', }, { label: 'Twitter', - href: 'https://twitter.com/docusaurus', - }, - ], - }, - { - title: 'More', - items: [ - { - label: 'Blog', - to: '/blog', - }, - { - label: 'GitHub', - href: 'https://github.com/facebook/docusaurus', + href: 'https://twitter.com/OpenAIRE_eu', + }, + { + label: 'Youtube', + href: 'https://www.youtube.com/channel/UChFYqizc-S6asNjQSoWuwjw', }, ], }, + ], - copyright: `Copyright © ${new Date().getFullYear()} My Project, Inc. Built with Docusaurus.`, + copyright: `Copyright © ${new Date().getFullYear()} OpenAIRE`, }, prism: { theme: lightCodeTheme, diff --git a/src/css/custom.css b/src/css/custom.css index 7f4ad3e..27cefe3 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -55,3 +55,8 @@ --ifm-color-primary-lightest: #aed6f1; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); } + + +.todo { + background-color: yellow; +} \ No newline at end of file diff --git a/src/pages/index.js b/src/pages/index.js index affcd90..0f0cae9 100644 --- a/src/pages/index.js +++ b/src/pages/index.js @@ -17,8 +17,8 @@ function HomepageHeader() {
- Docusaurus Tutorial - 5min ⏱️ + to="intro"> + Get started {/*- 5min ⏱️*/}
@@ -31,7 +31,7 @@ export default function Home() { return ( + description="OpenAIRE Documentation">
diff --git a/static/img/favicon.ico b/static/img/favicon.ico index c01d54b..cd00609 100644 Binary files a/static/img/favicon.ico and b/static/img/favicon.ico differ diff --git a/static/img/logo.png b/static/img/logo.png new file mode 100644 index 0000000..aeffc3d Binary files /dev/null and b/static/img/logo.png differ diff --git a/static/img/logo.svg b/static/img/logo.svg deleted file mode 100644 index 9db6d0d..0000000 --- a/static/img/logo.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file