diff --git a/docs/changelog.md b/docs/changelog.md index 23fd806..25789d1 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -19,6 +19,22 @@ This section documents all notable changes for each graph version. --- +### v6.0.0 +_Start Date: 2023-07-26 • Release Date: 2023-08-16 • Dump release: **yes**_ + +#### Changed + +- [Relationship data model](/data-model/relationships/relationship-object): flattened properties source, sourceType, target, targetType +- BIP! indicators are now serialised as an array; see the updated model [here](/data-model/entities/other#bipindicators) +- Crossref dump from June 2023 +- ORCID works without a DOI from June 2023 +- Usage counts from June 2023 +- Datacite contents from June 2023 +- OpenCitations relations from January 2023 +- BIP! indicators from June 2023 +- New Datasources/Services were added, collected from an updated EOSC Service catalogue endpoint + + ### v5.2.0 _Start Date: 2023-07-03 • Release Date: 2023-07-17 • Dump release: **no**_ diff --git a/docs/data-model/entities/other.md b/docs/data-model/entities/other.md index 6e792b3..a5ddcf1 100644 --- a/docs/data-model/entities/other.md +++ b/docs/data-model/entities/other.md @@ -201,6 +201,55 @@ Scheme of reference for access right code. Currently, always set to COAR access "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/" ``` +## BipIndicator + +The different impact indicators as computed by [BIP!](https://bip.imsi.athenarc.gr/). + + +### indicator +_Type: String • Cardinality: ONE_ + +The name of indicator; it can be either one of: +* `influence`: it reflects the overall/total impact of an article in the research community at large, based on the underlying citation network (diachronically). +* `influence_alt`: it is an alternative to the "Influence" indicator, which also reflects the overall/total impact of an article in the research community at large, based on the underlying citation network (diachronically). 
+* `popularity`: it reflects the "current" impact/attention (the "hype") of an article in the research community at large, based on the underlying citation network. +* `popularity_alt`: it is an alternative to the "Popularity" indicator, which also reflects the "current" impact/attention (the "hype") of an article in the research community at large, based on the underlying citation network. +* `impulse`: it reflects the initial momentum of an article directly after its publication, based on the underlying citation network. + +For more details on how these indicators are calculated, please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators). + +```json +"influence": { + "score": "123", + "class": "C2" +} +``` + +### class +_Type: String • Cardinality: ONE_ + +The impact class assigned based on the indicator score. + +To facilitate comprehension, BIP! also offers impact classes for articles, to group together those that have similar impact. The following 5 classes are provided: +* `C1`: Top 0.01% +* `C2`: Top 0.1% +* `C3`: Top 1% +* `C4`: Top 10% +* `C5`: Bottom 90% + +```json +"class": "C2" +``` + +### score +_Type: String • Cardinality: ONE_ + +The actual indicator score. + +```json +"score": "1234" +``` + ## Container This field has information about the conference or journal where the result has been presented or published. @@ -610,36 +659,41 @@ These are indicators computed for a specific OpenAIRE result. Each Indicator object is composed of the following properties: -### impactMeasures -_Type: [ImpactMeasures](#impactmeasures-1) • Cardinality: ONE_ +### bipIndicators +_Type: [BipIndicator](#bipindicator) • Cardinality: MANY_ These impact-based indicators, provided by [BIP!](https://bip.imsi.athenarc.gr/), estimate the impact of a result. For details about their calculation, please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators). 
```json -"impactMeasures": { - "influence": { +"bipIndicators": [ + { + "indicator": "influence", "score": "123", "class": "C2" }, - "influence_alt" : { - "score": "456", - "class": "C3" + { + "indicator": "influence_alt", + "score": "456", + "class": "C3" }, - "popularity": { - "score": "234", - "class": "C1" + { + "indicator": "popularity", + "score": "234", + "class": "C1" }, - "popularity_alt": { - "score": "345", - "class": "C5" + { + "indicator": "popularity_alt", + "score": "345", + "class": "C5" }, - "impulse": { + { + "indicator": "impulse", "score": "987", "class": "C3" } -} +] ``` ### usageCounts @@ -647,6 +701,8 @@ _Type: [UsageCounts](#usagecounts-1) • Cardinality: ONE_ These measures, computed by the [UsageCounts Service](https://usagecounts.openaire.eu/), are based on usage statistics. +Please refer [here](/graph-production-workflow/indicators-ingestion/usage-counts) for more details. + ```json "usageCounts":{ "downloads": "10", @@ -674,76 +730,6 @@ Language label in English. "label": "English" ``` -## ImpactMeasures - -The different impact-based indicators as computed by [BIP!](https://bip.imsi.athenarc.gr/). - -### influence -_Type: [Score](#score) • Cardinality: ONE_ - -This indicator reflects the overall/total impact of an article in the research community at large, based on the underlying citation network (diachronically). -For more details please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators#pagerank-pr). - -```json -"influence": { - "score": "123", - "class": "C2" -} -``` - -### influence_alt -_Type: [Score](#score) • Cardinality: ONE_ - -This is an alternative to the "Influence" indicator, which also reflects the overall/total impact of an article in the research community at large, based on the underlying citation network (diachronically). -For more details please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators#citation-count-cc). 
- -```json -"influence_alt" :{ - "score": "456", - "class": "C3" -} -``` - -### popularity -_Type: [Score](#score) • Cardinality: ONE_ - -This indicator reflects the "current" impact/attention (the "hype") of an article in the research community at large, based on the underlying citation network. -For more details please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators#attrank). - -```json -"popularity":{ - "score": "234", - "class": "C1" -} - -``` - -### popularity_alt -_Type: [Score](#score) • Cardinality: ONE_ - -This is an alternative to the "Popularity" indicator, which also reflects the "current" impact/attention (the "hype") of an article in the research community at large, based on the underlying citation network. -For more details please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators#ram). - -```json -"popularity_alt":{ - "score": "345", - "class": "C5" -} - -``` - -### impulse -_Type: [Score](#score) • Cardinality: ONE_ - -This indicator reflects the initial momentum of an article directly after its publication, based on the underlying citation network. -For more details please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators#incubation-citation-count-icc). - -```json -"impulse":{ - "score": "987", - "class": "C3" -} -``` ## OrganizationPid @@ -827,33 +813,6 @@ The value expressed in the scheme (i.e. 10.1000/182). "value": "10.21511/bbs.13(3).2018.13" ``` -## Score -The specific score object for each calculated impact measure calculated by [BIP!](https://bip.imsi.athenarc.gr/). - -### score -_Type: String • Cardinality: ONE_ - -The actual indicator score. - -```json -"score": "1234" -``` - -### class -_Type: String • Cardinality: ONE_ - -The impact class assigned based on the indicator score. - -To facilitate comprehension, BIP! also offers impact classes for articles, to group together those that have similar impact. 
The following 5 classes are provided: -* `C1`: Top 0.01% -* `C2`: Top 0.1% -* `C3`: Top 1% -* `C4`: Top 10% -* `C5`: Bottom 90% - -```json -"class": "C2" -``` ## Subject Represents keywords associated to the result. diff --git a/docs/data-model/entities/result.md b/docs/data-model/entities/result.md index 05b9008..b84e49e 100644 --- a/docs/data-model/entities/result.md +++ b/docs/data-model/entities/result.md @@ -184,35 +184,43 @@ Date when the embargo ends and this result turns Open Access. ``` ### indicators -_Type: [Indicator](other#indicator) • Cardinality: ONE_ +_Type: [Indicator](other#indicator-1) • Cardinality: ONE_ The indicators computed for this result; -currently, the following two types of indicators are supported: [impact indicators](/graph-production-workflow/indicators-ingestion/impact-indicators) and [usage statistics indicators](/graph-production-workflow/indicators-ingestion/usage-counts). +currently, the following types of indicators are supported: + +* [Impact indicators by BIP!](other#bipindicators) +* [Usage Statistics indicators](other#usagecounts) ```json "indicators": { - "impactMeasures": { - "influence": { + "bipIndicators": [ + { + "indicator": "influence", "score": "123", "class": "C2" }, - "influence_alt" : { + { + "indicator": "influence_alt", "score": "456", "class": "C3" }, - "popularity": { + { + "indicator": "popularity", "score": "234", "class": "C1" }, - "popularity_alt": { + { + "indicator": "popularity_alt", "score": "345", "class": "C5" }, - "impulse": { + { + "indicator": "impulse", "score": "987", "class": "C3" } - }, + ], "usageCounts": { "downloads": "10", "views": "20" diff --git a/docs/downloads/beginners-kit.md b/docs/downloads/beginners-kit.md index e9f4ae8..39421cf 100644 --- a/docs/downloads/beginners-kit.md +++ b/docs/downloads/beginners-kit.md @@ -4,9 +4,9 @@ sidebar_position: 2 # Beginner's kit -:::caution + The large size of the OpenAIRE Graph is a major impediment for beginners to familiarise with the 
underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure which cannot be easily accessible to everyone. diff --git a/docs/downloads/full-graph.md b/docs/downloads/full-graph.md index 11e46f1..e009aed 100644 --- a/docs/downloads/full-graph.md +++ b/docs/downloads/full-graph.md @@ -4,9 +4,9 @@ sidebar_position: 1 # Full graph dump -:::caution + You can download the full OpenAIRE Graph Dump as well as its schema from the following links: diff --git a/docs/downloads/subgraphs.md b/docs/downloads/subgraphs.md index 7fd5e97..3f6668b 100644 --- a/docs/downloads/subgraphs.md +++ b/docs/downloads/subgraphs.md @@ -4,9 +4,9 @@ sidebar_position: 3 # Sub-graph dumps -:::caution + In order to facilitate users, different dumps are available under the Zenodo community called [OpenAIRE Graph](https://zenodo.org/communities/openaire-research-graph). This page lists all alternative dumps currently available. diff --git a/docs/graph-production-workflow/indicators-ingestion/impact-indicators.md b/docs/graph-production-workflow/indicators-ingestion/impact-indicators.md index 6c90186..a1bfdc3 100644 --- a/docs/graph-production-workflow/indicators-ingestion/impact-indicators.md +++ b/docs/graph-production-workflow/indicators-ingestion/impact-indicators.md @@ -1,12 +1,12 @@ # Impact indicators -This page summarises all calculated impact indicators, provided by [BIP!](https://bip.imsi.athenarc.gr/), which are included in the [impactMeasures](/data-model/entities/other#impactmeasures) property (found under the [indicators](/data-model/entities/result#indicators) property of the result). +This page summarises all calculated impact indicators, provided by [BIP!](https://bip.imsi.athenarc.gr/), which are included in the [bipIndicators](/data-model/entities/other#bipindicators) property (found under the [indicators](/data-model/entities/result#indicators) property of the result). 
It should be noted that the impact indicators are being calculated on the level of the research output. Below we explain their main intuition, the way they are calculated, and their most important limitations, in an attempt help avoiding common pitfalls and misuses. -## Citation Count (CC) +## Citation Count (CC) • influence_alt ***Short description:*** This is the most widely used scientific impact indicator, which sums all citations received by each article. @@ -31,7 +31,7 @@ Also, since some indicators require the publication year for their calculation, ***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) -## "Incubation" Citation Count (iCC) +## "Incubation" Citation Count (iCC) • impulse ***Short description:*** This measure is essentially a time-restricted version of the citation count, where the time window is distinct for each paper, i.e., @@ -58,7 +58,7 @@ Also, since some indicators require the publication year for their calculation, ***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) - ## PageRank (PR) + ## PageRank (PR) • influence ***Short description:*** Originally developed to rank Web pages, PageRank has been also widely used to rank publications in citation @@ -97,7 +97,7 @@ Also, since some indicators require the publication year for their calculation, ***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) -## RAM +## RAM • popularity_alt ***Short description:*** RAM is essentially a modified Citation Count, where recent citations are considered of higher importance compared to older ones. @@ -131,7 +131,7 @@ Also, since some indicators require the publication year for their calculation, ***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! 
Ranker](https://github.com/athenarc/Bip-Ranker) -## AttRank +## AttRank • popularity ***Short description:*** AttRank is a PageRank variant that alleviates its bias against recent publications (i.e., it is tailored to capture popularity). diff --git a/src/css/custom.css b/src/css/custom.css index b6455c2..81feb5f 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -41,6 +41,10 @@ background-color: yellow; } +.bip-indicator-names { + color: #B9BBB6; +} + @media (min-width: 996px) { .left-badge { diff --git a/versioned_docs/version-6.0.0/api/_category_.json b/versioned_docs/version-6.0.0/api/_category_.json new file mode 100644 index 0000000..36617e4 --- /dev/null +++ b/versioned_docs/version-6.0.0/api/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Public API", + "position": 4, + "link": { + "type": "doc", + "id": "api" + } +} \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/api/api.md b/versioned_docs/version-6.0.0/api/api.md new file mode 100644 index 0000000..1cf4b7f --- /dev/null +++ b/versioned_docs/version-6.0.0/api/api.md @@ -0,0 +1,6 @@ +--- +sidebar_position: 5 +--- + +# Public API +TODO: https://graph.openaire.eu/develop/overview.html \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/assets/badges/openaire-badge-1.png b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-1.png new file mode 100644 index 0000000..fc2ade4 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-1.png differ diff --git a/versioned_docs/version-6.0.0/assets/badges/openaire-badge-1.zip b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-1.zip new file mode 100644 index 0000000..3aa0a97 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-1.zip differ diff --git a/versioned_docs/version-6.0.0/assets/badges/openaire-badge-2.png b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-2.png new file mode 100644 index 0000000..42153ff Binary files 
/dev/null and b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-2.png differ diff --git a/versioned_docs/version-6.0.0/assets/badges/openaire-badge-2.zip b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-2.zip new file mode 100644 index 0000000..d89a26d Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-2.zip differ diff --git a/versioned_docs/version-6.0.0/assets/badges/openaire-badge-3.png b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-3.png new file mode 100644 index 0000000..11e9b94 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-3.png differ diff --git a/versioned_docs/version-6.0.0/assets/badges/openaire-badge-3.zip b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-3.zip new file mode 100644 index 0000000..b4f3599 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/badges/openaire-badge-3.zip differ diff --git a/versioned_docs/version-6.0.0/assets/img/aggregation.png b/versioned_docs/version-6.0.0/assets/img/aggregation.png new file mode 100644 index 0000000..e0f04e5 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/aggregation.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/architecture.png b/versioned_docs/version-6.0.0/assets/img/architecture.png new file mode 100644 index 0000000..06358c5 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/architecture.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/data-model-2.png b/versioned_docs/version-6.0.0/assets/img/data-model-2.png new file mode 100644 index 0000000..fd897ea Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/data-model-2.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/data-model.png b/versioned_docs/version-6.0.0/assets/img/data-model.png new file mode 100644 index 0000000..a797502 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/data-model.png 
differ diff --git a/versioned_docs/version-6.0.0/assets/img/decisiontree-dataset-orp.png b/versioned_docs/version-6.0.0/assets/img/decisiontree-dataset-orp.png new file mode 100644 index 0000000..e2184f0 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/decisiontree-dataset-orp.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/decisiontree-organization.png b/versioned_docs/version-6.0.0/assets/img/decisiontree-organization.png new file mode 100644 index 0000000..7b71e12 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/decisiontree-organization.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/decisiontree-publication.png b/versioned_docs/version-6.0.0/assets/img/decisiontree-publication.png new file mode 100644 index 0000000..50d173d Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/decisiontree-publication.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/decisiontree-software.png b/versioned_docs/version-6.0.0/assets/img/decisiontree-software.png new file mode 100644 index 0000000..7cdb108 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/decisiontree-software.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/dedup-relation-fixup.png b/versioned_docs/version-6.0.0/assets/img/dedup-relation-fixup.png new file mode 100644 index 0000000..8bfe434 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/dedup-relation-fixup.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/dedup-results.png b/versioned_docs/version-6.0.0/assets/img/dedup-results.png new file mode 100644 index 0000000..8ab84d1 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/dedup-results.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/deduplication-workflow.png b/versioned_docs/version-6.0.0/assets/img/deduplication-workflow.png new file mode 100644 index 0000000..ed4acf6 Binary files /dev/null and 
b/versioned_docs/version-6.0.0/assets/img/deduplication-workflow.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_datasource.png b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_datasource.png new file mode 100644 index 0000000..2b78501 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_datasource.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_selconstraints.png b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_selconstraints.png new file mode 100644 index 0000000..d1cff89 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_selconstraints.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_subject.png b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_subject.png new file mode 100644 index 0000000..2ea815a Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_subject.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_zenodo.png b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_zenodo.png new file mode 100644 index 0000000..d8626f1 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/bulktagging_zenodo.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/organization_tree.png b/versioned_docs/version-6.0.0/assets/img/enrichment/organization_tree.png new file mode 100644 index 0000000..c4d3df3 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/organization_tree.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_affiliationistrepo.png b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_affiliationistrepo.png new file mode 100644 index 0000000..6961a9a Binary files /dev/null and 
b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_affiliationistrepo.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_country.png b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_country.png new file mode 100644 index 0000000..8f90911 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_country.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_orcid.png b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_orcid.png new file mode 100644 index 0000000..49a230e Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_orcid.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_organizationsemrel.png b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_organizationsemrel.png new file mode 100644 index 0000000..caef457 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_organizationsemrel.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttocommunitythroughorganization.png b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttocommunitythroughorganization.png new file mode 100644 index 0000000..18dcc42 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttocommunitythroughorganization.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttocommunitythroughsemrel.png b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttocommunitythroughsemrel.png new file mode 100644 index 0000000..9c6e0eb Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttocommunitythroughsemrel.png differ diff --git a/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttoproject.png 
b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttoproject.png new file mode 100644 index 0000000..4960b42 Binary files /dev/null and b/versioned_docs/version-6.0.0/assets/img/enrichment/propagation_resulttoproject.png differ diff --git a/versioned_docs/version-6.0.0/changelog.md b/versioned_docs/version-6.0.0/changelog.md new file mode 100644 index 0000000..25789d1 --- /dev/null +++ b/versioned_docs/version-6.0.0/changelog.md @@ -0,0 +1,134 @@ +--- +sidebar_position: 12 +--- + +# Versions & changelog + +## Versioning + +Our versioning policy follows the [Semantic Versioning specification](https://semver.org/). +In our case, given a version `MAJOR.MINOR.PATCH`, we increment the: + +* `MAJOR` version when the data model of the Graph changes +* `MINOR` version when the pipeline (e.g., different deduplication method, different implementation for an enrichment process) or major data sources change +* `PATCH` version when the graph data are updated + +## Changelog + +This section documents all notable changes for each graph version. + +--- + +### v6.0.0 +_Start Date: 2023-07-26 • Release Date: 2023-08-16 • Dump release: **yes**_ + +#### Changed + +- [Relationship data model](/data-model/relationships/relationship-object): flattened properties source, sourceType, target, targetType +- BIP! indicators are now serialised as an array; see the updated model [here](/data-model/entities/other#bipindicators) +- Crossref dump from June 2023 +- ORCID works without a DOI from June 2023 +- Usage counts from June 2023 +- Datacite contents from June 2023 +- OpenCitations relations from January 2023 +- BIP! 
indicators from June 2023 +- New Datasources/Services were added, collected from an updated EOSC Service catalogue endpoint + + +### v5.2.0 +_Start Date: 2023-07-03 • Release Date: 2023-07-17 • Dump release: **no**_ + +#### Added +- Citations imported from Crossref & MAG +- FoS and SDG classifications introduced for ~16Mi research products + +#### Changed + +- Removed the numerical prefix from the OpenAIRE identifiers (```"20|openorgs____::..." --> "openorgs____::..."```) +- Dataset file names in the Zenodo depositions changed from `dump` to `dataset` +- Crossref dump from May 2023 +- ORCID works without a DOI from June 2023 +- Usage counts from April 2023 +- Datacite contents from June 2023 +- OpenCitations relations from January 2023 +- Deduplication of the datasource +- Avoid duplicated organisation PIDs + +### v5.1.3 +_Start Date: 2023-05-22 • Release Date: 2023-06-12 • Dump release: **no**_ + +#### Added +- Datasource and project level usage counts + +#### Changed + +- Crossref dump from April 2023 +- ORCID works without a DOI from May 2023 +- Usage counts from April 2023 +- Datacite contents from May 2023 +- OpenCitations relations from January 2023 +- Deduplication of the datasource + +### v5.1.2 +_Start Date: 2023-03-20 • Release Date: 2023-04-04 • Dump release: **no**_ + +#### Changed + +- Crossref dump from February 2023 +- ORCID works without a DOI from March 2023 +- Usage counts from February 2023 (+76% Downloads per Datasource for 2023) +- Datacite contents from mid March 2023 +- OpenCitations relations from January 2023 + +### v5.1.1 +_Start Date: 2023-02-13 • Release Date: 2023-03-01 • Dump release: **no**_ + +#### Added + +- Revised SDG classification: improved coverage (+600K classified DOIs) +- General increase of the funded scientific outputs, thanks to the full text mining scanning new OpenAccess publications +- Integrated contents from + - [EMBL-EBIs Protein Data Bank in Europe](/graph-production-workflow/aggregation/non-compatible-sources/ebi) 
+ - [UniProtKB/Swiss-Prot](/graph-production-workflow//aggregation/non-compatible-sources/uniprot) + +#### Changed + +- Crossref dump from January 2023 +- ORCID works without a DOI from January 2023 +- Usage counts from January 2023 +- Datacite contents from mid February 2023 +- OpenCitations relations from December 2022 + +### v5.1.0 +_Start Date: 2023-01-16 • Release Date: 2023-01-30 • Dump release: **no**_ + +#### Added + +- Revised SDG classification: better accuracy, lower coverage (will improve in the next months) + +#### Changed + +- Crossref dump from December 2022 +- ORCID works without a DOI from January 2023 +- Usage counts from December 2022 +- DataCite contents from January 2023 + +--- + +### v5.0.0 + +_Start Date: 2022-12-19 • Release Date: 2022-12-28 • Dump release: **yes**_ + +#### Added + +- [Impact & Usage indicators](/data-model/entities/result#indicators) at the level of the Result +- [Beginner's kit](/downloads/beginners-kit) in the Downloads section +- New relationship types were introduced; see the complete list [here](/data-model/relationships/relationship-types) + +#### Changed + +- FOS and SDGs were removed from the [result subjects](/data-model/entities/result#subjects) +- Measures were removed from the [result instance](/data-model/entities/result#instance) +- Updated DOIBoost to include publications from Crossref and the works from ORCID with a DOI until November 2022 +- Added ORCID works without a DOI from November 2022 + diff --git a/versioned_docs/version-6.0.0/data-model/_category_.json b/versioned_docs/version-6.0.0/data-model/_category_.json new file mode 100644 index 0000000..730ffb4 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Data model", + "position": 3, + "link": { + "type": "doc", + "id": "data-model" + } +} \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/data-model/data-model.md b/versioned_docs/version-6.0.0/data-model/data-model.md new 
file mode 100644 index 0000000..7f4da93 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/data-model.md @@ -0,0 +1,25 @@ +# Data model + +The OpenAIRE Graph comprises several types of [entities](../category/entities) and [relationships](/category/relationships) among them. + +The latest version of the JSON schema can be found on the [Downloads](../downloads/full-graph) section. + +

+ Data model +

+ +The figure above presents the graph's data model. +Its main entities are described in brief below: + +* [Results](/data-model/entities/result) represent the outcomes (or products) of research activities. +* [Data Sources](/data-model/entities/data-source) are the sources from which the metadata of graph objects are collected. +* [Organizations](/data-model/entities/organization) correspond to companies or research institutions involved in projects, +responsible for operating data sources or constituting the affiliations of Product creators. +* [Projects](/data-model/entities/project) are research project grants funded by a Funding Stream of a Funder. +* [Communities](/data-model/entities/community) are groups of people with a common research intent (e.g. research infrastructures, university alliances). + +:::note Further reading + +A detailed report on the OpenAIRE Graph Data Model can be found on [Zenodo](https://zenodo.org/record/2643199). +::: + diff --git a/versioned_docs/version-6.0.0/data-model/entities/_category_.json b/versioned_docs/version-6.0.0/data-model/entities/_category_.json new file mode 100644 index 0000000..8161451 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Entities", + "position": 1, + "link": { + "type": "generated-index", + "description": "The main entities of the OpenAIRE Graph are listed below." 
+ } +} \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/data-model/entities/community.md b/versioned_docs/version-6.0.0/data-model/entities/community.md new file mode 100644 index 0000000..a93febe --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/community.md @@ -0,0 +1,82 @@ +--- +sidebar_position: 6 +--- + +# Communities + +Research communities and research initiatives are intended as groups of people with a common research intent and can be of two types: research initiatives or research communities: + +* Research initiatives are intended to capture a view of the information space that is "research impact"-oriented, i.e. all products generated due to my research initiative; +* Research communities, in contrast, are “research activity”-oriented, i.e. all products that may be of interest or related to my research initiative. + +For example, the organizations supporting a research infrastructure fall in the first category, while the researchers involved in a discipline fall in the second. + +## The `Community` object + +### id +_Type: String • Cardinality: ONE_ + +The OpenAIRE id for the community/research infrastructure, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). + +```json + "id": "context_____::5b7f9fa40bdc12072249204cedfa7808" +``` + +### acronym +_Type: String • Cardinality: ONE_ + +The acronym of the community. + +```json +"acronym": "covid-19" +``` + +### description +_Type: String • Cardinality: ONE_ + +Description of the research community/research infrastructure + +```json +"description": "This portal provides access to publications, research data, projects and software that may be relevant to the Corona Virus Disease (COVID-19). The OpenAIRE COVID-19 Gateway aggregates COVID-19 related records, links them and provides a single access point for discovery and navigation. We tag content from the OpenAIRE Graph (10,000+ data sources) and additional sources. 
All COVID-19 related research results are linked to people, organizations and projects, providing a contextualized navigation." +``` + +### name +_Type: String • Cardinality: ONE_ + +The long name of the community. + +```json +"name": "Corona Virus Disease" +``` + +### subject +_Type: String • Cardinality: MANY_ + +The list of the subjects associated to the research community (only applies to research communities). + +```json +"subject": [ + "COVID19", + "SARS-CoV", + "HCoV-19", + ... +] +``` + +### type +_Type: String • Cardinality: ONE_ + +The type of the community; one of `{ Research Community, Research infrastructure }`. + +```json +"type": "Research Community" +``` + +### zenodo_community +_Type: String • Cardinality: ONE_ + +The URL of the Zenodo community associated to the Research community/Research infrastructure. + +```json +"zenodo_community": "https://zenodo.org/communities/covid-19" +``` diff --git a/versioned_docs/version-6.0.0/data-model/entities/data-source.md b/versioned_docs/version-6.0.0/data-model/entities/data-source.md new file mode 100644 index 0000000..1678c16 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/data-source.md @@ -0,0 +1,294 @@ +--- +sidebar_position: 2 +--- + +# Data sources + +OpenAIRE entity instances are created out of data collected from various data sources of different kinds, such as publication repositories, dataset archives, CRIS systems, funder databases, etc. Data sources export information packages (e.g., XML records, HTTP responses, RDF data, JSON) that may contain information on one or more of such entities and possibly relationships between them. + +For example, a metadata record about a project carries information for the creation of a Project entity and its participants (as Organization entities).
It is important, once each piece of information is extracted from such packages and inserted into the OpenAIRE information space as an entity, for such pieces to keep provenance information relative to the originating data source. This is to give visibility to the data source, but also to enable the reconstruction of the very same piece of information if problems arise. + +--- + +## The `DataSource` object + +### id +_Type: String • Cardinality: ONE_ + +The OpenAIRE id of the data source, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). + +```json +"id": "issn___print::22c514d022b199c346e7f29ca06efc95" +``` + +### originalId +_Type: String • Cardinality: MANY_ + +The list of original identifiers associated to the datasource. + +```json +"originalId": [ + "issn___print::2451-8271", + ... +] +``` + +### pid + +_Type: [ControlledField](other#controlledfield) • Cardinality: MANY_ + +The persistent identifiers for the datasource. + +```json +"pid": [ + { + "scheme": "DOI", + "value": "10.5281/zenodo.4707307" + }, + ... +] +``` + +### datasourcetype +_Type: [ControlledField](other#controlledfield) • Cardinality: ONE_ + +The datasource type; see the vocabulary [dnet:datasource_typologies](https://api.openaire.eu/vocabularies/dnet:datasource_typologies). + +```json +"datasourcetype": { + "scheme": "pubsrepository::journal", + "value": "Journal" +} +``` + +### openairecompatibility +_Type: String • Cardinality: ONE_ + +The OpenAIRE compatibility of the ingested results, indicates which guidelines they are compliant according to the vocabulary [dnet:datasourceCompatibilityLevel](https://api.openaire.eu/vocabularies/dnet:datasourceCompatibilityLevel). + +```json +"openairecompatibility": "collected from a compatible aggregator" +``` + +### officialname +_Type: String • Cardinality: ONE_ + +The official name of the datasource. 
+ +```json +"officialname": "Recent Patents and Topics on Medical Imaging" +``` + +### englishname +_Type: String • Cardinality: ONE_ + +The English name of the datasource. + +```json +"englishname": "Recent Patents and Topics on Medical Imaging" +``` + +### websiteurl +_Type: String • Cardinality: ONE_ + +The URL of the website of the datasource. + +```json +"websiteurl": "http://dspace.unict.it/" +``` + +### logourl +_Type: String • Cardinality: ONE_ + +The URL of the logo for the datasource. + +```json +"logourl": "https://impactum-journals.uc.pt/public/journals/26/pageHeaderLogoImage_en_US.png" +``` + +### dateofvalidation +_Type: String • Cardinality: ONE_ + +The date of validation against the OpenAIRE guidelines for the datasource records. + +```json +"dateofvalidation": "2016-10-10" +``` + +### description +_Type: String • Cardinality: ONE_ + +The description for the datasource. + +```json +"description": "Recent Patents on Medical Imaging publishes review and research articles, and guest edited single-topic issues on recent patents in the field of medical imaging. It provides an important and reliable source of current information on developments in the field. The journal is essential reading for all researchers involved in Medical Imaging." +``` + +### subjects +_Type: String • Cardinality: MANY_ + +List of subjects associated to the datasource + +```json +"subjects": [ + "Medicine", + "Imaging", + ... +] +``` + +### languages +_Type: String • Cardinality: MANY_ + +The languages present in the data source's content, as defined by OpenDOAR. + +```json +"languages":[ + "eng", + ... +] +``` + +### contenttypes +_Type: String • Cardinality: MANY_ + +Types of content in the data source, as defined by OpenDOAR + +```json +"contenttypes": [ + "Journal articles", + ... +] +``` + +### releasestartdate +_Type: String • Cardinality: ONE_ + +Releasing date of the data source, as defined by re3data.org. 
+ +```json +"releasestartdate": "2010-07-24" +``` + +### releaseenddate +_Type: String • Cardinality: ONE_ + +Date when the data source went offline or stopped ingesting new research data, as defined by re3data.org. + +```json +"releaseenddate": "2016-03-28" +``` + +### accessrights +_Type: String • Cardinality: ONE_ + +Type of access to the data source, as defined by re3data.org. Possible values: `{ open, restricted, closed }`. + +```json +"accessrights": "open" +``` + +### uploadrights +_Type: String • Cardinality: ONE_ + +Type of data upload, as defined by re3data.org; one of `{ open, restricted, closed }`. + +```json +"uploadrights": "closed" +``` + +### databaseaccessrestriction +_Type: String • Cardinality: ONE_ + +Access restrictions to the research data repository. Allowed values are: `{ feeRequired, registration, other }`. + +This field applies only to re3data data sources; see [re3data schema specification](https://gfzpublic.gfz-potsdam.de/rest/items/item_758898_6/component/file_775891/content) for more details. + +```json +"databaseaccessrestriction": "registration" +``` + +### datauploadrestriction +_Type: String • Cardinality: ONE_ + +Upload restrictions applied by the datasource, as defined by re3data.org. One of `{ feeRequired, registration, other }`. + +This field applies only to re3data data sources; see [re3data schema specification](https://gfzpublic.gfz-potsdam.de/rest/items/item_758898_6/component/file_775891/content) for more details. + +```json +"datauploadrestriction": "feeRequired registration" +``` + +### versioning +_Type: Boolean • Cardinality: ONE_ + +Whether the research data repository supports versioning: +`true` if the data source supports versioning, `false` otherwise. + +This field applies only to re3data data sources; see [re3data schema specification](https://gfzpublic.gfz-potsdam.de/rest/items/item_758898_6/component/file_775891/content) for more details.
+ +```json +"versioning": true +``` + +### citationguidelineurl +_Type: String • Cardinality: ONE_ + +The URL of the data source providing information on how to cite its items. The DataCite citation format is recommended (http://www.datacite.org/whycitedata). + +This field applies only to re3data data sources; see [re3data schema specification](https://gfzpublic.gfz-potsdam.de/rest/items/item_758898_6/component/file_775891/content) for more details. + +```json +"citationguidelineurl": "https://physionet.org/about/#citation" +``` + +### pidsystems +_Type: String • Cardinality: ONE_ + +The persistent identifier system that is used by the data source, as defined by re3data.org. + +```json +"pidsystems": "hdl" +``` + +### certificates +_Type: String • Cardinality: ONE_ + +The certificate, seal or standard the data source complies with, as defined by re3data.org. + +```json +"certificates": "WDS" +``` + +### policies +_Type: String • Cardinality: MANY_ + +Policies of the data source, as defined in OpenDOAR. + +### journal +_Type: [Container](other#container) • Cardinality: ONE_ + +Information about the journal, if this data source is of type Journal. + +```json +"journal": { + "edition": "", + "iss": "5", + "issnLinking": "", + "issnOnline": "1873-7625", + "issnPrinted": "2451-8271", + "name": "Recent Patents and Topics on Imaging", + "sp": "12", + "ep": "22", + "vol": "50" +} +``` + +### missionstatementurl +_Type: String • Cardinality: ONE_ + +The URL of a mission statement describing the designated community of the data source.
As defined by re3data.org. + +```json +"missionstatementurl": "https://www.sigma2.no/content/nird-research-data-archive" +``` \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/data-model/entities/organization.md b/versioned_docs/version-6.0.0/data-model/entities/organization.md new file mode 100644 index 0000000..0b5f1d0 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/organization.md @@ -0,0 +1,94 @@ +--- +sidebar_position: 3 +--- + +# Organizations + +Organizations include companies, research centers or institutions involved as project partners or responsible for operating data sources. Information about organizations is collected from funder databases like CORDA, registries of data sources like OpenDOAR and re3data, and CRIS systems, where they are related to projects or data sources. + + +--- + +## The `Organization` object + +### id +_Type: String • Cardinality: ONE_ + +The OpenAIRE id for the organization, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). + +```json +"id": "openorgs____::b84450f9864182c67b8611b5593f4250" +``` + +### legalshortname +_Type: String • Cardinality: ONE_ + +The legal name in short form of the organization. + +```json +"legalshortname": "ARC" +``` + +### legalname +_Type: String • Cardinality: ONE_ + +The legal name of the organization. + +```json +"legalname": "Athena Research and Innovation Center In Information Communication & Knowledge Technologies" +``` + +### alternativenames +_Type: String • Cardinality: MANY_ + +Alternative names that identify the organization. + +```json +"alternativenames": [ + "Athena Research and Innovation Center In Information Communication & Knowledge Technologies", + "Athena RIC", + "ARC", + ... +] +``` + +### websiteurl +_Type: String • Cardinality: ONE_ + +The URL of the organization's website.
+ +```json +"websiteurl": "https://www.athena-innovation.gr/el/announce/pressreleases.html" +``` + +### country +_Type: [Country](other#country) • Cardinality: ONE_ + +The country where the organization is located. + +```json +"country":{ + "code": "GR", + "label": "Greece" +} +``` + +### pid +_Type: [OrganizationPid](other#organizationpid) • Cardinality: MANY_ + +The list of persistent identifiers for the organization. + +```json +"pid": [ + { + "scheme": "ISNI", + "value": "0000 0004 0393 5688" + }, + { + "scheme": "GRID", + "value": + "grid.19843.37" + }, + ... +] +``` \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/data-model/entities/other.md b/versioned_docs/version-6.0.0/data-model/entities/other.md new file mode 100644 index 0000000..a5ddcf1 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/other.md @@ -0,0 +1,884 @@ +--- +sidebar_position: 7 +--- + +# Other component objects + +Here, we describe other component objects that are used as part of the main graph entities. + +## AccessRight + +Subclass of [BestAccessRight](#bestaccessright), indicates information about rights held in and over the resource and the open Access Route. + +### openAccessRoute +_Type: One of `{ gold, green, hybrid, bronze }` • Cardinality: ONE_ + +Indicates the OpenAccess status. Values are set according to the [Unpaywall methodology](https://support.unpaywall.org/support/solutions/articles/44001777288-what-do-the-types-of-oa-status-green-gold-hybrid-and-bronze-mean-). + +```json +"openAccessRoute": "gold" +``` + +## AlternateIdentifier +Type used to represent the information associated to persistent identifiers associated to the result that have not been forged by an authority for that pid type. For example we collect metadata from an institutional repository that provides as identifier for the result also the doi. + +### scheme +_Type: String • Cardinality: ONE_ + +Vocabulary reference. 
+ +```json +"scheme": "doi" +``` + +### value +_Type: String • Cardinality: ONE_ + +Value from the given scheme/vocabulary. + +```json +"value": "10.1016/j.respol.2021.104226" +``` + +## APC +Indicates the money spent to make a book or article available in Open Access. Sources for this information include the OpenAPC initiative. + +### currency +_Type: String • Cardinality: ONE_ + +The system of money in which the amount is expressed (e.g. EUR, USD). + +```json +"currency": "EUR" +``` + +### amount +_Type: String • Cardinality: ONE_ + +The quantity of money. + +```json +"amount": "1000" +``` + +## Author + +Represents the result author. + +### fullname +_Type: String • Cardinality: ONE_ + +Author's full name. + +```json +"fullname": "Turunen, Heidi" +``` + +### name +_Type: String • Cardinality: ONE_ + +Author's given name. + +```json +"name": "Heidi" +``` + +### surname +_Type: String • Cardinality: ONE_ + +Author's family name. + +```json +"surname": "Turunen" +``` + +### rank +_Type: String • Cardinality: ONE_ + +Author's order in the list of authors for the given result. + +```json +"rank": 1 +``` + +### pid +_Type: [AuthorPid](#authorpid) • Cardinality: ONE_ + +Persistent identifier associated with this author. + +```json +"pid": { + "id": { + "scheme": "orcid", + "value": "0000-0001-7169-1177" + }, + "provenance": { + "provenance": "Harvested", + "trust": "0.9" + } +} +``` + +## AuthorPid + +The author's persistent identifier. + +### id +_Type: [AuthorPidSchemaValue](#authorpidschemavalue) • Cardinality: ONE_ + +```json +"id": { + "scheme": "orcid", + "value": "0000-0001-7169-1177" +} +``` + +### provenance +_Type: [Provenance](#provenance-2) • Cardinality: ONE_ + +The reason why the pid was associated to the author. + +```json +"provenance": { + "provenance": "Inferred by OpenAIRE", + "trust": "0.85" +} +``` + +## AuthorPidSchemaValue +Type used to represent the scheme and value for the author's pid.
+ +### scheme +_Type: String • Cardinality: ONE_ + +The author's pid scheme. OpenAIRE currently supports ORCID. + +```json +"scheme": "orcid" +``` + +### value +_Type: String • Cardinality: ONE_ + +The author's pid value in that scheme. + +```json +"value": "0000-1111-2222-3333" +``` + +## BestAccessRight +Indicates the most open access right\* available among the result Instances. + +\* Openness is defined by the following ordering of the access right terms: +``` +OPEN SOURCE > OPEN > EMBARGO (6MONTHS) > EMBARGO (12MONTHS) > RESTRICTED > CLOSED > UNKNOWN +``` + +### code +_Type: String • Cardinality: ONE_ + +COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/. + +```json +"code": "c_16ec" +``` + +### label +_Type: String • Cardinality: ONE_ + +Label for the access mode. + +```json +"label": "RESTRICTED" +``` + +### scheme +_Type: String • Cardinality: ONE_ + +Scheme of reference for access right code. Currently, always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/. + +```json +"scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/" +``` + +## BipIndicator + +The different impact indicators as computed by [BIP!](https://bip.imsi.athenarc.gr/). + + +### indicator +_Type: String • Cardinality: ONE_ + +The name of the indicator; one of: +* `influence`: it reflects the overall/total impact of an article in the research community at large, based on the underlying citation network (diachronically). +* `influence_alt`: it is an alternative to the "Influence" indicator, which also reflects the overall/total impact of an article in the research community at large, based on the underlying citation network (diachronically). +* `popularity`: it reflects the "current" impact/attention (the "hype") of an article in the research community at large, based on the underlying citation network.
+* `popularity_alt`: it is an alternative to the "Popularity" indicator, which also reflects the "current" impact/attention (the "hype") of an article in the research community at large, based on the underlying citation network. +* `impulse`: it reflects the initial momentum of an article directly after its publication, based on the underlying citation network. + +For more details on how these indicators are calculated, please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators). + +```json +"indicator": "influence" +``` + +### class +_Type: String • Cardinality: ONE_ + +The impact class assigned based on the indicator score. + +To facilitate comprehension, BIP! also offers impact classes for articles, to group together those that have similar impact. The following 5 classes are provided: +* `C1`: Top 0.01% +* `C2`: Top 0.1% +* `C3`: Top 1% +* `C4`: Top 10% +* `C5`: Bottom 90% + +```json +"class": "C2" +``` + +### score +_Type: String • Cardinality: ONE_ + +The actual indicator score. + +```json +"score": "1234" +``` + +## Container +This field has information about the conference or journal where the result has been presented or published. + +### name +_Type: String • Cardinality: ONE_ + +Name of the journal or conference. + +```json +"name": "Research Policy" +``` + +### issnPrinted +_Type: String • Cardinality: ONE_ + +The journal printed issn. + +```json +"issnPrinted": "0048-7333" +``` + +### issnOnline +_Type: String • Cardinality: ONE_ + +The journal online issn. + +```json +"issnOnline": "1873-7625" +``` + +### issnLinking +_Type: String • Cardinality: ONE_ + +The journal linking issn. + +### iss +_Type: String • Cardinality: ONE_ + +The journal issue. + +```json +"iss": "5" +``` + +### sp +_Type: String • Cardinality: ONE_ + +The start page. + +```json +"sp": "12" +``` + +### ep +_Type: String • Cardinality: ONE_ + +The end page.
+ +```json +"ep": "22" +``` + +### vol +_Type: String • Cardinality: ONE_ + +The journal volume. + +```json +"vol": "50" +``` + +### edition +_Type: String • Cardinality: ONE_ + +The edition of the journal or conference. + +### conferenceplace +_Type: String • Cardinality: ONE_ + +The place of the conference. + +```json +"conferenceplace": "Padua, Italy" +``` + +### conferencedate +_Type: String • Cardinality: ONE_ + +The date of the conference. + +```json +"conferencedate": "2022-09-22" +``` + +## ControlledField + + +Generic type used to represent the information described by a scheme and a value in that scheme (i.e. pid). + +### scheme +_Type: String • Cardinality: ONE_ + +Vocabulary reference. + +```json +"scheme": "DOI" +``` + +### value +_Type: String • Cardinality: ONE_ + +Value from the given scheme/vocabulary. + +```json +"value": "10.5281/zenodo.4707307" +``` + +## Country +To represent the generic country code and label. + +### code +_Type: String • Cardinality: ONE_ + +ISO 3166-1 alpha-2 country code. + +```json +"code" : "IT" +``` + +### label +_Type: String • Cardinality: ONE_ + +The country label. + +```json +"label": "Italy" +``` + +## Funding +Funding information for a project. + +### funding_stream +_Type: [FundingStream](#fundingstream) • Cardinality: ONE_ + +Funding information for the project. + +```json +"funding_stream": { + "description": "Horizon 2020 Framework Programme - Research and Innovation action", + "id": "EC::H2020::RIA" +} +``` +### jurisdiction +_Type: String • Cardinality: ONE_ + +Geographical jurisdiction (e.g. for European Commission is EU, for Croatian Science Foundation is HR). + +```json +"jurisdiction": "EU" +``` + +### name +_Type: String • Cardinality: ONE_ + +The name of the funder. + +```json +"name": "European Commission" +``` + +### shortName +_Type: String • Cardinality: ONE_ + +The short name of the funder. + +```json +"shortName": "EC" +``` + +## FundingStream +Description of a funding stream. 
+ +### id +_Type: String • Cardinality: ONE_ + +The identifier of the funding stream. + +```json +"id": "EC::H2020::RIA" +``` + +### description +_Type: String • Cardinality: ONE_ + +Short description of the funding stream. + +```json +"description": "Horizon 2020 Framework Programme - Research and Innovation action" +``` + +## GeoLocation +Represents the geolocation information. + +### point +_Type: String • Cardinality: ONE_ + +A point with Latitude and Longitude. + +```json +"point": "7.72486 50.1084" +``` + +### box +_Type: String • Cardinality: ONE_ + +A specified bounding box defined by two longitudes (min and max) and two latitudes (min and max). + + +```json +"box": "18.569386 54.468973 18.066832 54.83707" +``` + +### place +_Type: String • Cardinality: ONE_ + +The name of a specific place. + +```json +"place": "Tübingen, Baden-Württemberg, Southern Germany" +``` + +## Grant +The money granted to a project. + +### currency +_Type: String • Cardinality: ONE_ + +The currency of the granted amount (e.g. EUR). + +```json +"currency": "EUR" +``` + +### fundedamount +_Type: Number • Cardinality: ONE_ + +The funded amount. + +```json +"fundedamount": 1.0E7 +``` + +### totalcost +_Type: Number • Cardinality: ONE_ + +The total cost of the project. + +```json +"totalcost": 1.0E7 +``` + +## H2020Programme +The H2020 programme funding a project. + +### code +_Type: String • Cardinality: ONE_ + +The code of the programme. + +```json +"code": "H2020-EU.1.4.1.3." +``` + +### description +_Type: String • Cardinality: ONE_ + +The description of the programme. + +```json +"description": "Development, deployment and operation of ICT-based e-infrastructures" +``` + +## Instance +An instance is one specific materialization or version of the result. For example, you can have one result with three instances as result of deduplication: + +* one is the pre-print +* one is the post-print +* one is the published version + +Each instance is characterized by the properties that follow. 
+ +### accessright +_Type: [AccessRight](#accessright) • Cardinality: ONE_ + +Maps [dc:rights](https://www.dublincore.org/specifications/dublin-core/dcmi-terms/elements11/rights/), describes the access rights of the web resources relative to this instance. + +```json +"accessright": { + "code": "c_abf2", + "label": "OPEN", + "openAccessRoute": "gold", + "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/" +} +``` + +### alternateIdentifier +_Type: [AlternateIdentifier](#alternateidentifier) • Cardinality: MANY_ + +All the identifiers associated to the result other than the authoritative ones. + +```json +"alternateIdentifier": [ + { + "scheme": "doi", + "value": "10.1016/j.respol.2021.104226" + }, + ... +] +``` + +### articleprocessingcharge +_Type: [APC](#apc) • Cardinality: ONE_ + +The money spent to make this book or article available in Open Access. Source for this information is the OpenAPC initiative. + +```json +"articleprocessingcharge": { + "currency": "EUR", + "amount": "1000" +} +``` + +### license +_Type: String • Cardinality: ONE_ + +The license URL. + +```json +"license": "http://creativecommons.org/licenses/by-nc/4.0" +``` + +### pid +_Type: [ResultPid](#resultpid) • Cardinality: MANY_ + +The set of persistent identifiers associated to this instance that have been collected from an authority for the pid type (i.e. Crossref/Datacite for doi). See the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers) for more information. + +```json +"pid": [ + { + "scheme": "pmc", + "value": "PMC8024784" + }, + ... +] +``` + +### publicationdate +_Type: String • Cardinality: ONE_ + +The publication date of the research product. + +```json +"publicationdate": "2009-02-12" +``` + +### refereed +_Type: String • Cardinality: ONE_ + +Describes if this instance has been peer-reviewed or not. 
Allowed values are peerReviewed, nonPeerReviewed, UNKNOWN (as defined in https://api.openaire.eu/vocabularies/dnet:review_levels). For example: + +* peerReviewed: https://api.openaire.eu/vocabularies/dnet:review_levels/0001 +* nonPeerReviewed: https://api.openaire.eu/vocabularies/dnet:review_levels/0002 + +These values are assigned based on the guidelines and cover the following vocabularies: + +* [DRIVER Guidelines 2.0 - info:eu-repo/semantic](https://wiki.surfnet.nl/download/attachments/10851536/DRIVER_Guidelines_v2_Final_2008-11-13.pdf) (OpenAIRE v1.0 till v3.0 - Literature) +* [COAR Vocabulary v2.0 and v3.0](https://vocabularies.coar-repositories.org/resource_types/) (OpenAIRE v4 - Inst.+Them.) + +```json +"refereed": "UNKNOWN" +``` + +### type +_Type: String • Cardinality: ONE_ + +The specific sub-type of this instance (see https://api.openaire.eu/vocabularies/dnet:result_typologies and follow the links). + +```json +"type": "Article" +``` + +### url +_Type: String • Cardinality: MANY_ + +URLs to the instance. They may link to the actual full-text or to the landing page at the hosting source. + +```json +"url": [ + "https://periodicos2.uesb.br/index.php/folio/article/view/4296", + ... +] +``` + +## Indicator + +These are indicators computed for a specific OpenAIRE result. + +Each Indicator object is composed of the following properties: + +### bipIndicators +_Type: [BipIndicator](#bipindicator) • Cardinality: MANY_ + +These impact-based indicators, provided by [BIP!](https://bip.imsi.athenarc.gr/), estimate the impact of a result. + +For details about their calculation, please refer [here](/graph-production-workflow/indicators-ingestion/impact-indicators).
+ +```json +"bipIndicators": [ + { + "indicator": "influence", + "score": "123", + "class": "C2" + }, + { + "indicator": "influence_alt", + "score": "456", + "class": "C3" + }, + { + "indicator": "popularity", + "score": "234", + "class": "C1" + }, + { + "indicator": "popularity_alt", + "score": "345", + "class": "C5" + }, + { + "indicator": "impulse", + "score": "987", + "class": "C3" + } +] +``` + +### usageCounts +_Type: [UsageCounts](#usagecounts-1) • Cardinality: ONE_ + +These measures, computed by the [UsageCounts Service](https://usagecounts.openaire.eu/), are based on usage statistics. + +Please refer [here](/graph-production-workflow/indicators-ingestion/usage-counts) for more details. + +```json +"usageCounts":{ + "downloads": "10", + "views": "20" +} +``` +## Language +Represents information for the language of the result + +### code +_Type: String • Cardinality: ONE_ + +Alpha-3/ISO 639-2 code of the language. Values controlled by the [dnet:languages vocabulary](https://api.openaire.eu/vocabularies/dnet:languages). + +```json +"code": "eng" +``` + +### label +_Type: String • Cardinality: ONE_ + +Language label in English. + +```json +"label": "English" +``` + + +## OrganizationPid + +The schema and value for identifiers of the organization. + +### scheme +_Type: String • Cardinality: ONE_ + +Vocabulary reference (i.e. isni). + +```json +"scheme" : "GRID" +``` + +### value +_Type: String • Cardinality: ONE_ + +Value from the given scheme/vocabulary (i.e. 0000000090326370). + +```json +"value" : "grid.7119.e" +``` + +## Provenance +Indicates the process that produced (or provided) the information, and the trust associated to the information. + +### provenance +_Type: String • Cardinality: ONE_ + +Provenance term from the vocabulary [dnet:provenanceActions](https://api.openaire.eu/vocabularies/dnet:provenanceActions). 
+ +```json +"provenance": "Harvested" +``` + +### trust +_Type: String • Cardinality: ONE_ + +Trust, expressed as a number in the range [0-1]. + +```json +"trust": "0.9" +``` + +## ResultCountry +It is for the country associated to the result. +It is a subclass of [Country](#country) and extends it with provenance information. + +### provenance +_Type: [Provenance](#provenance-2) • Cardinality: ONE_ + +Indicates the reason why this country is associated to this result. + +```json +"provenance": { + "provenance": "inferred by OpenAIRE", + "trust": "0.85" +} +``` + +## ResultPid +Type used to represent the information associated to persistent identifiers for the result that have been forged by an authority for that pid type. + + + +### scheme +_Type: String • Cardinality: ONE_ + +The scheme of the persistent identifier for the result (i.e. doi). If the pid is here it means the information for the pid has been collected from an authority for that pid type (i.e. Crossref/Datacite for doi). The set of authoritative pid is: `doi` when collected from Crossref or Datacite, `pmid` when collected from EuroPubmed, `arxiv` when collected from arXiv, `handle` from the repositories. + +```json +"scheme": "doi" +``` + +### value +_Type: String • Cardinality: ONE_ + +The value expressed in the scheme (i.e. 10.1000/182). + +```json +"value": "10.21511/bbs.13(3).2018.13" +``` + +## Subject +Represents keywords associated to the result. + +### subject +_Type: [SubjectSchemeValue](#subjectschemevalue) • Cardinality: ONE_ + +Contains the subject term: subject type (keyword, MeSH, etc) and the subject term (medicine, chemistry, etc.). + +```json +"subject": { + "scheme": "keyword", + "value": "SVOC" +} +``` + +### provenance +_Type: [Provenance](#provenance-2) • Cardinality: ONE_ + +Contains provenance information for the subject term. 
+ +```json +"provenance": { + "provenance": "Harvested", + "trust": "0.9" +} +``` + +## SubjectSchemeValue +Subject classification against a vocabulary + +### scheme +_Type: String • Cardinality: ONE_ + +OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies). + +```json +"scheme" : "keyword" +``` + +### value +_Type: String • Cardinality: ONE_ + +The value for the subject in the selected scheme. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary). + +```json +"value" : "pyrolysis-oil" +``` + +## UsageCounts + +The usage counts indicator computed for this result. + +### views +_Type: String • Cardinality: ONE_ + +The number of views for this result. + +```json +"views": "10" +``` + +### downloads +_Type: String • Cardinality: ONE_ + +The number of downloads for this result. + +```json +"downloads": "5" +``` \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/data-model/entities/project.md b/versioned_docs/version-6.0.0/data-model/entities/project.md new file mode 100644 index 0000000..facada4 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/project.md @@ -0,0 +1,171 @@ +--- +sidebar_position: 4 +--- + +# Projects + +Of crucial interest to OpenAIRE is also the identification of the funders (e.g. European Commission, WellcomeTrust, FCT Portugal, NWO The Netherlands) that co-funded the projects that have led to a given result. Projects are characterized by a list of funding streams (e.g. FP7, H2020 for the EC), which identify the strands of fundings. Funding streams can be nested to form a tree of sub-funding streams. + +--- + +## The `Project` object + +### id +_Type: String • Cardinality: ONE_ + +Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). 
+ +```json +"id": "corda__h2020::70ea22400fd890c5033cb31642c4ae68" +``` + +### code +_Type: String • Cardinality: ONE_ + +The grant agreement code of the project. + +```json +"code": "777541" +``` + +### acronym +_Type: String • Cardinality: ONE_ + +Project's acronym. + +```json +"acronym": "OpenAIRE-Advance" +``` + +### title +_Type: String • Cardinality: ONE_ + +Project's title. + +```json +"title": "OpenAIRE Advancing Open Scholarship" +``` + +### callidentifier +_Type: String • Cardinality: ONE_ + +The identifier of the research call. + +```json +"callidentifier": "H2020-EINFRA-2017" +``` + +### funding +_Type: [Funding](other#funding) • Cardinality: MANY_ + +Funding information for the project. + +```json +"funding": [ + { + "funding_stream": { + "description": "Horizon 2020 Framework Programme - Research and Innovation action", + "id": "EC::H2020::RIA" + }, + "jurisdiction": "EU", + "name": "European Commission", + "shortName": "EC" + } +] +``` +### granted +_Type: [Grant](other#grant) • Cardinality: ONE_ + +The money granted to the project. + +```json +"granted": { + "currency": "EUR", + "fundedamount": 1.0E7, + "totalcost": 1.0E7 +} +``` + +### h2020programme +_Type: [H2020Programme](other#h2020programme) • Cardinality: MANY_ + +The H2020 programme funding the project. + +```json +"h2020programme":[ + { + "code": "H2020-EU.1.4.1.3.", + "description": "Development, deployment and operation of ICT-based e-infrastructures" + } +] +``` +### keywords +_Type: String • Cardinality: ONE_ + +```json +"keywords": [ + "Open Science", + ... +] +``` + +### openaccessmandatefordataset +_Type: Boolean • Cardinality: ONE_ + +```json +"openaccessmandatefordataset": true +``` + +### openaccessmandateforpublications +_Type: Boolean • Cardinality: ONE_ + +```json +"openaccessmandateforpublications": true +``` + +### startdate +_Type: String • Cardinality: ONE_ + +The start date of the project.
+ +```json +"startdate": "2018-01-01" +``` + +### enddate +_Type: String • Cardinality: ONE_ + +The end year of the project. + +```json +"enddate": "2021-02-28" +``` + +### subject +_Type: String • Cardinality: MANY_ + +The subjects of the project + +```json +"subject": [ + "Data and Distributed Computing e-infrastructures for Open Science", + ... +] +``` +### summary +_Type: String • Cardinality: ONE_ + +Short summary of the project. + +```json +"summary": "OpenAIRE-Advance continues the mission of OpenAIRE to support the Open Access/Open Data mandates in Europe. By sustaining the current successful infrastructure, comprised of a human network and robust technical services, it consolidates its achievements while working to shift the momentum among its communities to Open Science, aiming to be a trusted e-Infrastructurewithin the realms of the European Open Science Cloud.In this next phase, OpenAIRE-Advance strives to empower its National Open Access Desks (NOADs) so they become a pivotal part within their own national data infrastructures, positioningOA and open science onto national agendas. 
The capacity building activities bring together experts ontopical task groups in thematic areas(open policies, RDM, legal issues, TDM), promoting a train the trainer approach, strengthening and expanding the pan-European Helpdesk with support and training toolkits, training resources and workshops.It examines key elements of scholarly communication, i.e., co-operative OA publishing and next generation repositories, to develop essential building blocks of the scholarly commons.On the technical level OpenAIRE-Advance focuses on the operation and maintenance of the OpenAIRE technical TRL8/9 services,and radically improvesthe OpenAIRE services on offer by: a) optimizing their performance and scalability, b) refining their functionality based on end-user feedback, c) repackagingthem into products, taking a professional marketing approach with well-defined KPIs, d)consolidating the range of services/products into a common e-Infra catalogue to enable a wider uptake.OpenAIRE-Advancesteps up its outreach activities with concrete pilots with three major RIs,citizen science initiatives, and innovators via a rigorous Open Innovation programme. Finally, viaits partnership with COAR, OpenAIRE-Advance consolidatesOpenAIRE’s global roleextending its collaborations with Latin America, US, Japan, Canada, and Africa." +``` + +### websiteurl +_Type: String • Cardinality: ONE_ + +The website of the project + +```json +"websiteurl": "https://www.openaire.eu/advance/" +``` diff --git a/versioned_docs/version-6.0.0/data-model/entities/result.md b/versioned_docs/version-6.0.0/data-model/entities/result.md new file mode 100644 index 0000000..b84e49e --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/entities/result.md @@ -0,0 +1,497 @@ +--- +sidebar_position: 1 +--- + +# Results + +Results are intended as digital objects, described by metadata, resulting from a scientific process. +In this page, we describe the properties of the `Result` object. 
+ +Moreover, there are the following sub-types of a `Result`, that inherit all its properties and further extend it: +* [Publication](#publication) +* [Dataset](#dataset) +* [Software](#software) +* [Other research product](#other-research-product) + +--- + +## The `Result` object + +### id +_Type: String • Cardinality: ONE_ + +Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). + +```json +"id": "doi_dedup___::80f29c8c8ba18c46c88a285b7e739dc3" +``` + +### type +_Type: String • Cardinality: ONE_ + +Type of the result. Possible types: + +* `publication` +* `dataset` +* `software` +* `other` + +as declared in the terms from the [dnet:result_typologies vocabulary](https://api.openaire.eu/vocabularies/dnet:result_typologies). + +```json +"type": "publication" +``` + +### originalId +_Type: String • Cardinality: MANY_ + +Identifiers of the record at the original sources. + +```json +"originalId": [ + "oai:pubmedcentral.nih.gov:8024784", + "S0048733321000305", + "10.1016/j.respol.2021.104226", + "3136742816" +] +``` + +### maintitle +_Type: String • Cardinality: ONE_ + +A name or title by which a scientific result is known. May be the title of a publication, of a dataset or the name of a piece of software. + +```json +"maintitle": "The fall of the innovation empire and its possible rise through open science" +``` + +### subtitle + +_Type: String • Cardinality: ONE_ + +Explanatory or alternative name by which a scientific result is known. + +```json +"subtitle": "An analysis of cases from 1980 - 2020" +``` + +### author +_Type: [Author](other#author) • Cardinality: MANY_ + +The main researchers involved in producing the data, or the authors of the publication. + +```json +"author": [ + { + "fullname": "E. 
Richard Gold", + "rank": 1, + "name": "Richard", + "surname": "Gold", + "pid": { + "id": { + "scheme": "orcid", + "value": "0000-0002-3789-9238" + }, + "provenance": { + "provenance": "Harvested", + "trust": "0.9" + } + } + }, + ... +] +``` +### bestaccessright +_Type: [BestAccessRight](other#bestaccessright) • Cardinality: ONE_ + +The most open access right associated to the manifestations of this research result. + +```json +"bestaccessright": { + "code": "c_abf2", + "label": "OPEN", + "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/" +} +``` + +### contributor +_Type: String • Cardinality: MANY_ + +The institution or person responsible for collecting, managing, distributing, or otherwise contributing to the development of the resource. + +```json +"contributor": [ + "University of Zurich", + "Wright, Aidan G C", + "Hallquist, Michael", + ... +] +``` + +### country +_Type: [ResultCountry](other#resultcountry) • Cardinality: MANY_ + +Country associated with the result because it is the country of the organisation that manages the institutional repository or national aggregator or CRIS system from which this record was collected. +Country of affiliations of authors can be found instead in the affiliation relationship. + +```json +"country": [ + { + "code": "CH", + "label": "Switzerland", + "provenance": { + "provenance": "Inferred by OpenAIRE", + "trust": "0.85" + } + }, + ... +] +``` + +### coverage +_Type: String • Cardinality: MANY_ + +### dateofcollection +_Type: String • Cardinality: ONE_ + +When OpenAIRE collected the record the last time. + +```json +"dateofcollection": "2021-06-09T11:37:56.248Z" +``` + +### description +_Type: String • Cardinality: MANY_ + +A brief description of the resource and the context in which the resource was created. + +```json +"description": [ + "Open science partnerships (OSPs) are one mechanism to reverse declining efficiency. 
OSPs are public-private partnerships that openly share publications, data and materials.", + "There is growing concern that the innovation system's ability to create wealth and attain social benefit is declining in effectiveness. This article explores the reasons for this decline and suggests a structure, the open science partnership, as one mechanism through which to slow down or reverse this decline.", + "The article examines the empirical literature of the last century to document the decline. This literature suggests that the cost of research and innovation is increasing exponentially, that researcher productivity is declining, and, third, that these two phenomena have led to an overall flat or declining level of innovation productivity.", + ... +] +``` + +### embargoenddate +_Type: String • Cardinality: ONE_ + +Date when the embargo ends and this result turns Open Access. + +```json +"embargoenddate": "2017-01-01" +``` + +### indicators +_Type: [Indicator](other#indicator-1) • Cardinality: ONE_ + +The indicators computed for this result; +currently, the following types of indicators are supported: + +* [Impact indicators by BIP!](other#bipindicators) +* [Usage Statistics indicators](other#usagecounts) + +```json +"indicators": { + "bipIndicators": [ + { + "indicator": "influence", + "score": "123", + "class": "C2" + }, + { + "indicator": "influence_alt", + "score": "456", + "class": "C3" + }, + { + "indicator": "popularity", + "score": "234", + "class": "C1" + }, + { + "indicator": "popularity_alt", + "score": "345", + "class": "C5" + }, + { + "indicator": "impulse", + "score": "987", + "class": "C3" + } + ], + "usageCounts": { + "downloads": "10", + "views": "20" + } +} +``` + +### instance +_Type: [Instance](other#instance) • Cardinality: MANY_ + +Specific materialization or version of the result. For example, you can have one result with three instances: one is the pre-print, one is the post-print, one is the published version. 
+ +```json +"instance": [ + { + "accessright": { + "code": "c_abf2", + "label": "OPEN", + "openAccessRoute": "gold", + "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/" + }, + "alternateIdentifier": [ + { + "scheme": "doi", + "value": "10.1016/j.respol.2021.104226" + }, + ... + ], + "articleprocessingcharge": { + "amount": "4063.93", + "currency": "EUR" + }, + "license": "http://creativecommons.org/licenses/by-nc/4.0", + "pid": [ + { + "scheme": "pmc", + "value": "PMC8024784" + }, + ... + ], + + "publicationdate": "2021-01-01", + "refereed": "UNKNOWN", + "type": "Article", + "url": [ + "http://europepmc.org/articles/PMC8024784" + ] + }, + ... +] +``` + +### language +_Type: [Language](other#language) • Cardinality: ONE_ + +The alpha-3/ISO 639-2 code of the language. Values controlled by the [dnet:languages vocabulary](https://api.openaire.eu/vocabularies/dnet:languages). + +```json +"language": { + "code": "eng", + "label": "English" +} +``` +### lastupdatetimestamp +_Type: Long • Cardinality: ONE_ + +Timestamp of last update of the record in OpenAIRE. + +```json +"lastupdatetimestamp": 1652722279987 +``` + +### pid +_Type: [ResultPid](other#resultpid) • Cardinality: MANY_ + +Persistent identifiers of the result. See also the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers) to learn more. + +```json +"pid": [ + { + "scheme": "pmc", + "value": "PMC8024784" + }, + { + "scheme": "doi", + "value": "10.1016/j.respol.2021.104226" + }, + ... +] +``` + +### publicationdate +_Type: String • Cardinality: ONE_ + +Main date of the research product: typically the publication or issued date. In case of a research result with different versions with different dates, the date of the result is selected as the most frequent well-formatted date. If not available, then the most recent and complete date among those that are well-formatted. 
For statistics, the year is extracted and the result is counted only among the result of that year. Example: Pre-print date: 2019-02-03, Article date provided by repository: 2020-02, Article date provided by Crossref: 2020, OpenAIRE will set as date 2019-02-03, because it’s the most recent among the complete and well-formed dates. If then the repository updates the metadata and set a complete date (e.g. 2020-02-12), then this will be the new date for the result because it becomes the most recent most complete date. However, if OpenAIRE then collects the pre-print from another repository with date 2019-02-03, then this will be the “winning date” because it becomes the most frequent well-formatted date. + +```json +"publicationdate": "2021-03-18" +``` + +### publisher +_Type: String • Cardinality: ONE_ + +The name of the entity that holds, archives, publishes prints, distributes, releases, issues, or produces the resource. + +```json +"publisher": "Elsevier, North-Holland Pub. Co" +``` + +### source +_Type: String • Cardinality: MANY_ + +A related resource from which the described resource is derived. See definition of Dublin Core field [dc:source](https://www.dublincore.org/specifications/dublin-core/dcmi-terms/elements11/source). + +```json +"source": [ + "Research Policy", + "Crossref", + ... +] +``` + +### subjects +_Type: [Subject](other#subject) • Cardinality: MANY_ + +Subject, keyword, classification code, or key phrase describing the resource. + +```json +"subjects": [ + { + "provenance": { + "provenance": "Harvested", + "trust": "0.9" + }, + "subject": { + "scheme": "keyword", + "value": "Open science" + } + }, + ... +] +``` +--- + +## Sub-types + +There are the following sub-types of `Result`. Each inherits all its fields and extends them with the following. + +### Publication + +Metadata records about research literature (includes types of publications listed [here](http://api.openaire.eu/vocabularies/dnet:result_typologies/publication)). 
+ +#### container +_Type: [Container](other#container) • Cardinality: ONE_ + +Container has information about the conference or journal where the result has been presented or published. + +```json +"container": { + "edition": "", + "iss": "5", + "issnLinking": "", + "issnOnline": "1873-7625", + "issnPrinted": "0048-7333", + "name": "Research Policy", + "sp": "12", + "ep": "22", + "vol": "50" +} +``` +### Dataset + +Metadata records about research data (includes the subtypes listed [here](http://api.openaire.eu/vocabularies/dnet:result_typologies/dataset)). + +#### size +_Type: String • Cardinality: ONE_ + +The declared size of the dataset. + +```json +"size": "10129818" +``` + +#### version +_Type: String • Cardinality: ONE_ + +The version of the dataset. + +```json +"version": "v1.3" +``` + +#### geolocation +_Type: [GeoLocation](other#geolocation) • Cardinality: MANY_ + +The list of geolocations associated with the dataset. + +```json +"geolocation": [ + { + "box": "18.569386 54.468973 18.066832 54.83707", + "place": "Tübingen, Baden-Württemberg, Southern Germany", + "point": "7.72486 50.1084" + }, + ... +] +``` + +### Software + +Metadata records about research software (includes the subtypes listed [here](http://api.openaire.eu/vocabularies/dnet:result_typologies/software)). + +#### documentationUrl +_Type: String • Cardinality: MANY_ + +The URLs to the software documentation. + +```json +"documentationUrl": [ + "https://github.com/openaire/iis/blob/master/README.markdown", + ... +] +``` + +#### codeRepositoryUrl +_Type: String • Cardinality: ONE_ + +The URL to the repository with the source code. + +```json +"codeRepositoryUrl": "https://github.com/openaire/iis" +``` + +#### programmingLanguage +_Type: String • Cardinality: ONE_ + +The programming language. 
+ +```json +"programmingLanguage": "Java" +``` + +### Other research product + +Metadata records about research products that cannot be classified as research literature, data or software (includes types of products listed [here](http://api.openaire.eu/vocabularies/dnet:result_typologies/other)). + +#### contactperson +_Type: String • Cardinality: MANY_ + +Information on the person responsible for providing further information regarding the resource. + +```json +"contactperson": [ + "Noémie Dominguez", + ... +] +``` + +#### contactgroup +_Type: String • Cardinality: MANY_ + +Information on the group responsible for providing further information regarding the resource. + +```json +"contactgroup": [ + "Networked Multimedia Information Systems (NeMIS)", + ... +] +``` + +#### tool +_Type: String • Cardinality: MANY_ + +Information about tool useful for the interpretation and/or re-use of the research product. diff --git a/versioned_docs/version-6.0.0/data-model/pids-and-identifiers.md b/versioned_docs/version-6.0.0/data-model/pids-and-identifiers.md new file mode 100644 index 0000000..c613366 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/pids-and-identifiers.md @@ -0,0 +1,81 @@ +# PIDs and identifiers + +One of the challenges towards the stability of the contents in the OpenAIRE Graph consists of making its identifiers and records stable over time. +The barriers to this scenario are many, as the Graph keeps a map of data sources that is subject to constant variations: records in repositories vary in content, +original IDs, and PIDs, may disappear or reappear, and the same holds for the repository or the metadata collection it exposes. +Not only, but the mappings applied to the original contents may also change and improve over time to catch up with the changes in the input records. + +## PID Authorities + +One of the fronts regards the attribution of the identity to the objects populating the graph. 
The basic idea is to build the identifiers of the objects in the graph from the PIDs available in some authoritative sources while considering all the other sources as by definition “unstable”. Examples of authoritative sources are Crossref and DataCite. Examples of non-authoritative ones are institutional repositories, aggregators, etc. PIDs from the authoritative sources would form the stable OpenAIRE ID skeleton of the Graph, precisely because they are immutable by construction. + +Such a policy defines a list of data sources that are considered authoritative for a specific type of PID they provide, whose effect is twofold: +* OpenAIRE IDs depend on persistent IDs when they are provided by the authority responsible for creating them; +* PIDs are included in the graph according to a tight criterion: the PID Types declared in the table below are considered to be mapped as PIDs only when they are collected from the relative PID authority data source. + +| PID Type | Authority | +|-----------|-----------------------------------------------------------------------------------------------------| +| doi | [Crossref](https://www.crossref.org), [Datacite](https://datacite.org) | +| pmc, pmid | [Europe PubMed Central](https://europepmc.org/), [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc) | +| arXiv | [arXiv.org e-Print Archive](https://arxiv.org/) | +| uniprot | [Protein Data Bank](http://www.pdb.org/) | +| ena | [Protein Data Bank](http://www.pdb.org/) | +| pdb | [Protein Data Bank](http://www.pdb.org/) | + + +There is an exception though: Handle(s) are minted by several repositories; as listing them all would not be a viable option, to avoid losing them as PIDs, Handles bypass the PID authority filtering rule. +In all other cases, PIDs are included in the graph as alternate Identifiers. 
+ +## Delegated authorities + +When a record is aggregated from multiple sources considered authoritative for minting specific PIDs, different mappings could be applied to them and, depending on the case, +this could result in inconsistencies in the attribution of the field values. +To overcome the issue, the intuition is to include such records only once in the graph. To do so, the concept of "delegated authorities" defines a list of datasources that +assign PIDs to their scientific products from a given PID minter. + +This "selection" can be performed when the entities in the graph sharing the same identifier are grouped together. The list of the delegated authorities currently includes: + +| Datasource delegated | Datasource delegating | Pid Type | +|--------------------------------------|----------------------------------|-----------| +| [Zenodo](https://zenodo.org) | [Datacite](https://datacite.org) | doi | +| [RoHub](https://reliance.rohub.org/) | [W3ID](https://w3id.org/) | w3id | + + +## Identifiers in the Graph + +OpenAIRE assigns internal identifiers for each object it collects. +By default, the internal identifier is generated as `sourcePrefix::md5(localId)` where: + +* `sourcePrefix` is a namespace prefix of 12 chars assigned to the data source at registration time +* `localId` is the identifier assigned to the object by the data source + +After years of operation, we can say that: + +* `localId` values are generally unstable +* objects can disappear from sources +* PIDs provided by sources that are not PID agencies (authoritative sources for a specific type of PID) are often wrong (e.g. 
pre-print with the DOI of the published version, DOIs with typos) + +Therefore, when the record is collected from an authoritative source: + +* the identity of the record is forged using the PID, like `pidTypePrefix::md5(lowercase(doi))` +* the PID is added in a `pid` element of the data model + +When the record is collected from a source which is not authoritative for any type of PID: +* the identity of the record is forged as usual using the local identifier +* the PID, if available, is added as `alternateIdentifier` + +Currently, the following data sources are used as "PID authorities": + +| PID Type | Prefix (12 chars) | Authority | +|-----------|------------------------|-------------------------------------------| +| doi | `doi_________` | Crossref, Datacite, Zenodo | +| pmc | `pmc_________` | Europe PubMed Central, PubMed Central | +| pmid | `pmid________` | Europe PubMed Central, PubMed Central | +| arXiv | `arXiv_______` | arXiv.org e-Print Archive | +| handle | `handle______` | any repository | +| ena | `ena_________` | EMBL-EBI | +| pdb | `pdb_________` | EMBL-EBI | +| uniprot | `uniprot_____` | EMBL-EBI | + +OpenAIRE also performs duplicate identification (see the [dedicated section for details](/graph-production-workflow/deduplication)). +All duplicates are **merged** together in a **representative record** which must be assigned a dedicated OpenAIRE identifier (i.e. it cannot have the identifier of one of the aggregated records). 
diff --git a/versioned_docs/version-6.0.0/data-model/relationships/relationship-object.md b/versioned_docs/version-6.0.0/data-model/relationships/relationship-object.md new file mode 100644 index 0000000..7a6bd15 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/relationships/relationship-object.md @@ -0,0 +1,109 @@ +--- +title: The Relationship object +--- + +# The `Relationship` object + +A relationship in the Graph is represented with the data type presented in this page, which aims to model a directed edge between two nodes, providing information about its semantics, provenance and validation. + +### source +_Type: String • Cardinality: ONE_ + +OpenAIRE identifier of the node in the graph. + +```json +"source": "openorgs____::1cb75a3ad756e4c83e455e3e7347643b" +``` + +### sourceType +_Type: String • Cardinality: ONE_ + +Graph node type. + +```json +"sourceType": "organization" +``` + +### target +_Type: String • Cardinality: ONE_ + +OpenAIRE identifier of the node in the graph. + +```json +"target": "doajarticles::022409068174087a003647ff46070f7f" +``` + +### targetType +_Type: String • Cardinality: ONE_ + +Graph node type. + +```json +"targetType": "datasource" +``` + +### reltype +_Type: [RelType](#the-reltype-object) • Cardinality: ONE_ + +Represent the semantics of the relation between two nodes of the graph. + +```json +"reltype": { + "name": "provides", + "type": "provision" +} +``` +### provenance +_Type: [Provenance](/data-model/entities/other#provenance-1) • Cardinality: ONE_ + +Indicates the process that produced (or provided) the information. + +```json +"provenance": { + "provenance": "Harvested", + "trust":"0.900" +} +``` + +### validated +_Type: Boolean • Cardinality: ONE_ + +Indicates whether or not the relation was validated. + +```json +"validated": true +``` + +### validationDate +_Type: String • Cardinality: ONE_ + +Indicates the validation date of the relation - applies only when the validated flag is set to true. 
+ +```json +"validationDate": "2022-09-02" +``` + +--- + +## The `RelType` object + +The RelType data type models the semantic of the relationship among two nodes. + +### type +_Type: String • Cardinality: ONE_ + +Relation category, e.g. affiliation, citation, see table Relation typologies. + +```json +"type": "provision" +``` + +### name +_Type: String • Cardinality: ONE_ + +Further specifies the relation semantic, indicating the relation direction, e.g. Cites, isCitedBy. + +```json +"name": "provides" +``` +--- \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/data-model/relationships/relationship-types.md b/versioned_docs/version-6.0.0/data-model/relationships/relationship-types.md new file mode 100644 index 0000000..55378b3 --- /dev/null +++ b/versioned_docs/version-6.0.0/data-model/relationships/relationship-types.md @@ -0,0 +1,37 @@ +# Relationship types + +The following table lists all the possible relation semantics found in the graph dump. + +Note: the labels used to specify the semantic of the relationships are (for the most part) inherited from the [DataCite metadata kernel](https://schema.datacite.org/meta/kernel-4.4/doc/DataCite-MetadataKernel_v4.4.pdf), which provides a description for them. 
+ +| # | Source entity type | Target entity type | Relation name / inverse | Provenance | +|:--:|:--------------------------------------:|:--------------------------------------:|:----------------------------------------------------------:|:-----------------------------------------------:| +| 1 | [Project](/data-model/entities/project) | [Result](/data-model/entities/result) | produces / isProducedBy | Harvested, Inferred by OpenAIRE, Linked by user | +| 2 | [Project](/data-model/entities/project) | [Organization](/data-model/entities/organization) | hasParticipant / isParticipant | Harvested | +| 3 | [Project](/data-model/entities/project) | [Community](/data-model/entities/community) | IsRelatedTo / IsRelatedTo | Linked by user | +| 4 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsAmongTopNSimilarDocuments / HasAmongTopNSimilarDocuments | Inferred by OpenAIRE | +| 5 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsSupplementTo / IsSupplementedBy | Harvested | +| 6 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsRelatedTo / IsRelatedTo | Harvested, Inferred by OpenAIRE, Linked by user | +| 7 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsPartOf / HasPart | Harvested | +| 8 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsDocumentedBy / Documents | Harvested | +| 9 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsObsoletedBy / Obsoletes | Harvested | +| 10 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsSourceOf / IsDerivedFrom | Harvested | +| 11 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsCompiledBy / Compiles | Harvested | +| 12 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsRequiredBy / Requires | Harvested | +| 13 | 
[Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsCitedBy / Cites | Harvested, Inferred by OpenAIRE | +| 14 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsReferencedBy / References | Harvested | +| 15 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsReviewedBy / Reviews | Harvested | +| 16 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsOriginalFormOf / IsVariantFormOf | Harvested | +| 17 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsVersionOf / HasVersion | Harvested | +| 18 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsIdenticalTo / IsIdenticalTo | Harvested | +| 19 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsPreviousVersionOf / IsNewVersionOf | Harvested | +| 20 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsContinuedBy / Continues | Harvested | +| 21 | [Result](/data-model/entities/result) | [Result](/data-model/entities/result) | IsDescribedBy / Describes | Harvested | +| 22 | [Result](/data-model/entities/result) | [Organization](/data-model/entities/organization) | hasAuthorInstitution / isAuthorInstitutionOf | Harvested, Inferred by OpenAIRE | +| 23 | [Result](/data-model/entities/result) | [Data source](/data-model/entities/data-source) | isHostedBy / hosts | Harvested, Inferred by OpenAIRE | +| 24 | [Result](/data-model/entities/result) | [Data source](/data-model/entities/data-source) | isProvidedBy / provides | Harvested | +| 25 | [Result](/data-model/entities/result) | [Community](/data-model/entities/community) | IsRelatedTo / IsRelatedTo | Harvested, Inferred by OpenAIRE, Linked by user | +| 26 | [Organization](/data-model/entities/organization) | [Community](/data-model/entities/community) | IsRelatedTo / IsRelatedTo | Linked by user | +| 27 | 
[Organization](/data-model/entities/organization) | [Organization](/data-model/entities/organization) | IsChildOf / IsParentOf | Linked by user | +| 28 | [Data source](/data-model/entities/data-source) | [Community](/data-model/entities/community) | IsRelatedTo / IsRelatedTo | Linked by user | +| 29 | [Data source](/data-model/entities/data-source) | [Organization](/data-model/entities/organization) | isProvidedBy / provides | Harvested | diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/cfhb.md b/versioned_docs/version-6.0.0/downloads/alternative-model/cfhb.md new file mode 100644 index 0000000..4d9863d --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/cfhb.md @@ -0,0 +1,30 @@ +--- + +sidebar_position: 1 + +--- + +# CfHbKeyValue + +Information about the sources from which the record has been collected. + + + @JsonSchema(description = "the OpenAIRE identifier of the data source") +### key +_Type: String • Cardinality: ONE_ + +the OpenAIRE identifier of the data source + +```json +"key":"openaire____::081b82f96300b6a6e3d282bad31cb6e2" +``` + +### value +_Type: String • Cardinality: ONE_ + +The name of the data source. + +```json +"value":"Crossref" +``` + diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/communityInstance.md b/versioned_docs/version-6.0.0/downloads/alternative-model/communityInstance.md new file mode 100644 index 0000000..7639602 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/communityInstance.md @@ -0,0 +1,37 @@ +--- + +sidebar_position: 1 + +--- + +# CommunityInstance + +It is a subclass of [Instance](../../data-model/entities/result#instance) extended with information regarding the collection and hosting source for this materialization of the result. + +### hostedby +_Type: [CfHbKeyValue](./cfhb) • Cardinality: ONE_ + +Information about the source from which the instance can be viewed or downloaded. 
+ +```json + +"hostedby": { + "key": "issn___print::35ee75a5ad42581d604be113a8f56427", + "value": "New Phytologist" + }, + +``` + +### collectedfrom +_Type: [CfHbKeyValue](./cfhb) • Cardinality: ONE_ + +Information about the source from which the record has been collected + + +```json + +"collectedfrom": { + "key": "openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value": "Crossref" + } +``` \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/context.md b/versioned_docs/version-6.0.0/downloads/alternative-model/context.md new file mode 100644 index 0000000..e08ea69 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/context.md @@ -0,0 +1,46 @@ +--- + +sidebar_position: 1 + +--- + +# Context + +Information related to research initiative/community (RI/RC) related to the result. + +### code +_Type: String • Cardinality: ONE_ + +Code identifying the RI/RC. + +```json +"code":"sdsn-gr" + +``` + + +### label +_Type: String • Cardinality: ONE_ + +Label of the RI/RC. + +```json +"label":"SDSN - Greece" +``` + +### provenance +_Type: [Provenance](/data-model/entities/other#provenance-2) • Cardinality: MANY_ + +Why this result is associated to the RI/RC. + +```json + +"provenance":[{ + "provenance":"Inferred by OpenAIRE", + "trust":"0.9" + }, + ... + ] + +``` + diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/extendedresult.md b/versioned_docs/version-6.0.0/downloads/alternative-model/extendedresult.md new file mode 100644 index 0000000..7c09443 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/extendedresult.md @@ -0,0 +1,141 @@ +--- + +sidebar_position: 1 + +--- + + +# Extended Result + + +It is a subclass of [Result](/data-model/entities/result) extended with information regarding projects (and funders), research communities/infrastructure and related data sources. 
+ + + +### projects + +_Type: [Project](project.md) • Cardinality: MANY_ + + +List of projects (i.e. grants) that (co-)funded the production of the research results. + + +```json + + +"projects": [ + { + "id": "corda__h2020::94c4a066401e22002c4811a301bb4655", + "code": "727929", + "acronym": "TomRes", + "title": "A NOVEL AND INTEGRATED APPROACH TO INCREASE MULTIPLE AND COMBINED STRESS TOLERANCE IN PLANTS USING TOMATO AS A MODEL", + "funder": { + "shortName": "EC", + "name": "European Commission", + "jurisdiction": "EU", + "fundingStream": "H2020" + }, + "provenance": { + "provenance": "Harvested", + "trust": "0.900000000000000022" + }, + "validated": { + "validationDate": "2021-0101", + "validatedByFunder": true + } + }, + ... + ] + +``` + +### context + +_Type: [Context](./context) • Cardinality: MANY_ + + +Reference to relevant research infrastructure, initiative or communities (RI/RC) among those collaborating with OpenAIRE. Please see https://connect.openaire.eu that are publicly visible. + + +```json + + +"context":[ + { + "code":"sdsn-gr", + "label":"SDSN - Greece", + "provenance":[ + { + "provenance":"Inferred by OpenAIRE", + "trust":"0.9" + } + ] + }, + ... + ] + +``` + + + +### collectedfrom + +_Type: [CfHbKeyValue](./cfhb) • Cardinality: MANY_ + + +Information about the sources from which the record has been collected. + + +```json + +"collectedfrom":[ + { + "key":"openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value":"Crossref" + }, + ... + ] + +``` + + +### instance + +_Type: [CommunityInstance](./communityInstance) • Cardinality: MANY_ + +Information about the source from which the instance can be viewed or downloaded. 
+ +```json + + +"instance": [ + { + "license": "http://doi.wiley.com/10.1002/tdm_license_1.1", + "accessright": { + "code": "c_16ec", + "label": "RESTRICTED", + "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/", + "openAccessRoute": null + }, + "type": "Article", + "url": [ + "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1111%2Fnph.15014", + "http://onlinelibrary.wiley.com/wol1/doi/10.1111/nph.15014/fullpdf", + "http://dx.doi.org/10.1111/nph.15014" + ], + "publicationdate": "2018-02-09", + "refereed": "UNKNOWN", + "hostedby": { + "key": "issn___print::35ee75a5ad42581d604be113a8f56427", + "value": "New Phytologist" + }, + "collectedfrom": { + "key": "openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value": "Crossref" + } + }, + ... + ] + + +``` diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/funder.md b/versioned_docs/version-6.0.0/downloads/alternative-model/funder.md new file mode 100644 index 0000000..1da93a9 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/funder.md @@ -0,0 +1,72 @@ +--- + +sidebar_position: 1 + +--- + +# Funder + + +Information about the funder funding the project. + + +### fundingStream + +_Type: String • Cardinality: ONE_ + + +Funding information for the project. + + +```json + +"fundingStream": "H2020" + + +``` + +### jurisdiction + +_Type: String • Cardinality: ONE_ + + +Geographical jurisdiction (e.g. EU for the European Commission, HR for the Croatian Science Foundation). + + +```json + +"jurisdiction": "EU" + +``` + + +### name + +_Type: String • Cardinality: ONE_ + + +The name of the funder. + + +```json + +"name": "European Commission" + +``` + + +### shortName + +_Type: String • Cardinality: ONE_ + + +The short name of the funder. 
+ + +```json + +"shortName": "EC" + +``` + + diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/project.md b/versioned_docs/version-6.0.0/downloads/alternative-model/project.md new file mode 100644 index 0000000..ce10bb5 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/project.md @@ -0,0 +1,134 @@ +--- + +sidebar_position: 1 + +--- + + + +# Project + + +The information about the projects related to the result. + + +### id + +_Type: String • Cardinality: ONE_ + + +Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](../../data-model/pids-and-identifiers). + + +```json + +"id": "corda__h2020::70ea22400fd890c5033cb31642c4ae68" + +``` + + +### code + +_Type: String • Cardinality: ONE_ + + +The grant agreement code of the project. + + +```json + +"code": "777541" + +``` + + +### acronym + +_Type: String • Cardinality: ONE_ + + +Project's acronym. + + +```json + +"acronym": "OpenAIRE-Advance" + +``` + + +### title + +_Type: String • Cardinality: ONE_ + + +Project's title. + + +```json + +"title": "OpenAIRE Advancing Open Scholarship" + +``` + + +### funder + +_Type: [Funder](funder.md) • Cardinality: ONE_ + + +Information about the funder funding the project. + + +```json + + +"funder": { + "shortName": "EC", + "name": "European Commission", + "jurisdiction": "EU", + "fundingStream": "H2020" + } + + +``` + +### provenance + + +_Type: [Provenance](../../data-model/entities/other#provenance-2) • Cardinality: ONE_ + + +The reason why the project is associated to the result. + + +```json + + +"provenance": { + "provenance": "Harvested", + "trust": "0.900000000000000022" + } + +``` + + +### validated + + +_Type: [Validated](validated.md) • Cardinality: ONE_ + + +Specifies if the association between the project and the result was validated. 
+ + +```json + + +"validated": { + "validationDate": "2021-0101", + "validatedByFunder": true + } + +``` + diff --git a/versioned_docs/version-6.0.0/downloads/alternative-model/validated.md b/versioned_docs/version-6.0.0/downloads/alternative-model/validated.md new file mode 100644 index 0000000..e92b2c9 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/alternative-model/validated.md @@ -0,0 +1,41 @@ +--- + +sidebar_position: 1 + +--- + +# Validated + + +Information about the validation of the association between the result and the funding information. + + +### validationDate + +_Type: String • Cardinality: ONE_ + + +When OpenAIRE collected the association between the funding and the result from an authoritative source (i.e. Sygma). + + +```json + +"validationDate": "2021-0101" + +``` + + +### validatedByFunder + +_Type: Boolean • Cardinality: ONE_ + + +Specifies if the validation comes from the funder. + + +```json + + +"validatedByFunder": true + +``` \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/downloads/beginners-kit.md b/versioned_docs/version-6.0.0/downloads/beginners-kit.md new file mode 100644 index 0000000..39421cf --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/beginners-kit.md @@ -0,0 +1,16 @@ +--- +sidebar_position: 2 +--- + +# Beginner's kit + + + +The large size of the OpenAIRE Graph is a major impediment for beginners who want to familiarise themselves with the underlying data model and explore its contents. +Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure, which is not easily accessible to everyone. +[The OpenAIRE Beginner’s Kit](https://doi.org/10.5281/zenodo.7490192) aims to address this issue. It consists of two components: + +* A subset of the Graph composed of the research products published between 2022-06-29 and 2022-12-29, all the entities connected to them and the respective relationships. 
+* A Zeppelin notebook that demonstrates how you can use PySpark to analyse the Graph and get answers to some interesting research questions. A guide to Apache Zeppelin can be found [here](https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.6.5/bk_zeppelin-component-guide/content/ch_overview.html). \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/downloads/full-graph.md b/versioned_docs/version-6.0.0/downloads/full-graph.md new file mode 100644 index 0000000..e009aed --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/full-graph.md @@ -0,0 +1,52 @@ +--- +sidebar_position: 1 +--- + +# Full graph dump + + + +You can download the full OpenAIRE Graph Dump as well as its schema from the following links: + + Dataset: https://doi.org/10.5281/zenodo.3516917 + + Schema: https://doi.org/10.5281/zenodo.4238938 + +The schema used to dump this dataset mirrors the one described in the [Data Model](/data-model). +This dataset is licensed under a Creative Commons Attribution 4.0 International License. +It is composed of several files so that you can download the parts you are interested in. The files are named after the entity they store (e.g. publication, dataset). Each file is at most 10GB and it is +a tar archive containing gz files, each with one json per line. + +## How to acknowledge this work + +Open Science services are open and transparent and survive thanks to your active support and to the visibility and reward they gather. If you use one of the [OpenAIRE Graph dumps](https://doi.org/10.5281/zenodo.3516917) for your research, please provide a proper citation following the recommendation that you find on the dump's Zenodo page or as provided below. 
+ +:::note How to cite + +Manghi P., Atzori C., Bardi A., Baglioni M., Schirrwagen J., Dimitropoulos H., La Bruzzo S., Foufoulas I., Mannocci A., Horst M., Czerniak A., Iatropoulou K., Kokogiannaki A., De Bonis M., Artini M., Lempesis A., Ioannidis A., Manola N., Principe P., Vergoulis T., Chatzopoulos S., Pierrakos D. (2022). "OpenAIRE Research Graph Dump", *Dataset*, Zenodo. [doi:10.5281/zenodo.3516917](https://doi.org/10.5281/zenodo.3516917) ([BibTex](/bibtex/OpenAIRE_Research_Graph_dump.bib)) +::: + +Please also consider citing [other relevant research products](/publications#relevant-research-products) that can be of interest. + +Also consider adding one of the following badges to your service with the appropriate link to [our website](https://graph.openaire.eu); click on the badges below to download the respective badge image files. + + +
+
+ + Openaire badge + +
+
+ + Openaire badge + +
+
+ + Openaire badge + +
+
diff --git a/versioned_docs/version-6.0.0/downloads/related-datasets.md b/versioned_docs/version-6.0.0/downloads/related-datasets.md new file mode 100644 index 0000000..b342307 --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/related-datasets.md @@ -0,0 +1,30 @@ +--- +sidebar_position: 4 +--- + +# Other related datasets + +In this page, we list other related datasets; please refer to their respective schema definitions for the data model they follow. + +## The dump of ScholeXplorer + + Dataset: https://doi.org/10.5281/zenodo.6338616 + + Schema (Scholix version 3): https://doi.org/10.5281/zenodo.1120275 + + Schema (Scholix version 4): https://doi.org/10.5281/zenodo.6351557 + +This dataset is licensed under a CC0 1.0 Universal (CC0 1.0) Public Domain Dedication. +The dataset contains the GZ-compressed dump of the Scholix links exposed by the OpenAIRE ScholeXplorer service. + +## The OpenAIRE LOD dump + +Dataset (RDF dump): https://doi.org/10.5281/zenodo.609943 + +LOD Ontology: http://lod.openaire.eu/vocab + +SPARQL Endpoint: http://lod.openaire.eu/sparql + + +The OpenAIRE Linked Open Data (LOD) Services and their integration with the OpenAIRE information space have been released as a beta version. The LOD exporting process started with a specification of the OpenAIRE data model as an RDF vocabulary, and then mapping of the OpenAIRE data to the graph-based RDF data model. To interlink the OpenAIRE data with related data on the Web, we have identified a list of potential datasets to interlinked with, including the DBpedia dataset extracted from Wikipedia and the publication databases DBLP and CiteSeer. +Please refer [here](http://lod.openaire.eu/documentation) for more details on the LOD documentation. 
\ No newline at end of file diff --git a/versioned_docs/version-6.0.0/downloads/subgraphs.md b/versioned_docs/version-6.0.0/downloads/subgraphs.md new file mode 100644 index 0000000..3f6668b --- /dev/null +++ b/versioned_docs/version-6.0.0/downloads/subgraphs.md @@ -0,0 +1,72 @@ +--- +sidebar_position: 3 +--- + +# Sub-graph dumps + + + +In order to facilitate users, different dumps are available under the Zenodo community called [OpenAIRE Graph](https://zenodo.org/communities/openaire-research-graph). +This page lists all alternative dumps currently available. + + +## The OpenAIRE COVID-19 dump + + Dataset: https://doi.org/10.5281/zenodo.3980490 + + Schema: https://doi.org/10.5281/zenodo.3974225 + + This dataset is licensed under a Creative Commons Attribution 4.0 International License. + It contains metadata records of publications, research data, software and projects on the topic of Corona Virus and COVID-19. +This dump is part of the activities of OpenAIRE to support the fight against COVID-19 together with the OpenAIRE COVID-19 Gateway. +The dump consists of a tar archive containing gzip files with one json per line. Please refer [here](#alternative-sub-graph-data-model) for details on the data model of this dump. + +## The dump of funded products + + Dataset: https://doi.org/10.5281/zenodo.4559725 + + Schema: https://doi.org/10.5281/zenodo.3974225 + + This dataset is licensed under a Creative Commons Attribution 4.0 International License. +It contains metadata records of research products (research literature, data, software, other types of research products) with funding +information available in the OpenAIRE Graph. Records are grouped by funder in a dedicated archive file. Each tar archive contains +gzip files, each with one json record per line. The model of this dump differs from the one of the whole graph. +Please refer [here](#alternative-sub-graph-data-model) for details on the data model of this dump. 
+ +## The dump of delta projects + + Dataset: https://doi.org/10.5281/zenodo.6419021 + + Schema: https://doi.org/10.5281/zenodo.4238938 + + This dataset is licensed under a Creative Commons Attribution 4.0 International License. + It contains the metadata records of projects collected by OpenAIRE in a given time frame. Usually one deposition of collected projects is done for each release of the OpenAIRE Graph + The deposition is one tar archive containing gzip files, each with one json record per line. + +## The dumps about research communities, initiatives and infrastructures + + Dataset: https://doi.org/10.5281/zenodo.3974604 + + Schema: https://doi.org/10.5281/zenodo.3974225 + + This dataset is licensed under a Creative Commons Attribution 4.0 International License. +The dataset contains one file per community/initiative/infrastructure collaborating with OpenAIRE. Check out also their community gateways on + CONNECT. Each file is a tar archive containing gzip files with one json per line. The only communities/research initiative/infrastructure we dump are those visible to everyone. + The model of this dump differs from the one of the whole graph. +Please refer [here](#alternative-sub-graph-data-model) for details on the data model of this dump. + + --- + + ## Alternative sub-graph data model + + It should be noted that the dumps for research communities, infrastructures, and products related to projects do not strictly follow the main data model of the OpenAIRE Graph. In particular, they differ in the following: + + * only research products are dumped (no relations, and entities different from results) + * the dumped results are extended with information that can be inferred in the whole dump namely: + * funding information if present + * associated research community/infrastructure + * associated data sources + +So they have just one entity type, that is the [Extended Result](alternative-model/extendedresult.md). 
diff --git a/versioned_docs/version-6.0.0/faq.md b/versioned_docs/version-6.0.0/faq.md new file mode 100644 index 0000000..ace8840 --- /dev/null +++ b/versioned_docs/version-6.0.0/faq.md @@ -0,0 +1,7 @@ +--- +sidebar_position: 10 +--- + +# FAQ + +https://support.openaire.eu/projects/docs/wiki/FAQ \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/_category_.json b/versioned_docs/version-6.0.0/graph-production-workflow/_category_.json new file mode 100644 index 0000000..8da8ce0 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Graph production workflow", + "position": 6, + "link": { + "type": "doc", + "id": "graph-production-workflow" + } +} \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/aggregation.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/aggregation.md new file mode 100644 index 0000000..ac966fc --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/aggregation.md @@ -0,0 +1,58 @@ +--- +sidebar_position: 1 +--- + +# Aggregation + +OpenAIRE materializes an open, participatory research graph (the OpenAIRE Graph) where products of the research life-cycle (e.g. scientific literature, research data, project, software) are semantically linked to each other and carry information about their access rights (i.e. if they are Open Access, Restricted, Embargoed, or Closed) and the sources from which they have been collected and where they are hosted. The OpenAIRE Graph is materialised via a set of autonomic, orchestrated workflows operating in a regimen of continuous data aggregation and integration. [1] + +## What does OpenAIRE collect? 
+ +OpenAIRE aggregates metadata records describing objects of the research life-cycle from content providers compliant to the [OpenAIRE guidelines](https://guidelines.openaire.eu/) and from entity registries (i.e. data sources offering authoritative lists of entities, like [OpenDOAR](https://v2.sherpa.ac.uk/opendoar/), [re3data](https://www.re3data.org/), [DOAJ](https://doaj.org/), and various funder databases). After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Graph, accessible from the [OpenAIRE EXPLORE portal](https://explore.openaire.eu) and the [APIs](https://graph.openaire.eu/develop/). + +The transformation process includes the application of cleaning functions whose goal is to ensure that values are harmonised according to a common format (e.g. dates as YYYY-MM-dd) and, whenever applicable, to a common controlled vocabulary. The controlled vocabularies used for cleansing are accessible at [api.openaire.eu/vocabularies](https://api.openaire.eu/vocabularies/). Each vocabulary features a set of controlled terms, each with one code, one label, and a set of synonyms. If a synonym is found as field value, the value is updated with the corresponding term. +In addition, the OpenAIRE Graph is extended with other relevant scholarly communication sources that need special handling, either because they do not strictly follow the OpenAIRE Guidelines or due to the vast amount of data they offer (e.g. DOIBoost, which merges Crossref, ORCID, Microsoft Academic Graph, and Unpaywall). + +

+ Aggregation +

+ +The OpenAIRE aggregation system collects information about objects of the research life-cycle compliant to the [OpenAIRE acquisition policy](https://www.openaire.eu/content-acquisition-policy) from [different types of data sources](https://explore.openaire.eu/search/find/dataproviders): + +1. Scientific literature metadata and full-texts from institutional and thematic repositories, CRIS (Current Research Information Systems), Open Access journals and publishers; +2. Dataset metadata from data repositories and data journals; +3. Scientific literature, data and software metadata from Zenodo; +4. Metadata about data sources, organizations, projects, and funding programs from entity registries, i.e. authoritative sources such as CORDA and other funder databases for projects, OpenDOAR for publication repositories, re3data for data repositories, DOAJ for Open Access journals; +5. Metadata of open source research software from software repositories and Software Heritage; +6. Metadata about other types of research products, like workflows, protocols, methods, research packages. + +Relationships between objects are collected from the data sources, but also automatically detected by [inference algorithms](https://www.openaire.eu/blogs/text-mining-services-in-openaire-1) and added by authenticated users, who can insert links between literature, datasets, software and projects via [the “Link” procedure available from the OpenAIRE explore portal](https://explore.openaire.eu). More information about the linking functionality can be found [here](https://www.openaire.eu/linking). + +## What kind of data sources are in OpenAIRE? + +Objects and relationships in the OpenAIRE Graph are extracted from information packages, i.e. 
metadata records, collected from data sources of the following kinds: + +- *Literature, Institutional and thematic repositories*: Information systems where scientists upload the bibliographic metadata and full-texts of their articles, due to obligations from their organization or due to community practices (e.g. ArXiv, Europe PMC); +- *Open Access Publishers and journals*: Information systems of open access publishers or relative journals, which offer bibliographic metadata and PDFs of their published articles; +- *Data archives*: Information systems where scientists deposit descriptive metadata and files about their research data (also known as scientific data, datasets, etc.); +- *Hybrid repositories/archives*: Information systems where scientists deposit metadata and files of any kind of scientific product, including scientific literature, research data and research software (e.g. Zenodo); +- *Aggregator services*: Information systems that collect descriptive metadata about publications or datasets from multiple sources in order to enable cross-data source discovery of given research products. Examples are DataCite, BASE, DOAJ; +- *Entity Registries*: Information systems created with the intent of maintaining authoritative registries of given entities in the scholarly communication, such as OpenDOAR for the institutional repositories, re3data for the data repositories, CORDA and other funder databases for projects and funding information; +- *CRIS*: Information systems adopted by research and academic organizations to keep track of their research administration records and relative results; examples of CRIS content are articles or datasets funded by projects, their principal investigators, facilities acquired thanks to funding, etc.; +- *Research Graphs*: Services that maintain an information space of (possibly interlinked) scholarly communication objects. Examples are CrossRef, ScholeXplorer and OpenAIRE itself. + +## How does OpenAIRE collect metadata records? 
+ +OpenAIRE collects metadata records describing objects of the research life-cycle from content providers compliant to the OpenAIRE guidelines and from entity registries (i.e. data sources offering authoritative lists of entities, like OpenDOAR, re3data, DOAJ, and funder databases). + +The OpenAIRE aggregator collects metadata records in the majority of cases via [OAI-PMH](https://www.openarchives.org/pmh/), but also supports other standard exchange protocols like FTP(S), SFTP, and some RESTful API. +The whole list of available and used collectors could be found in the [RedMine Wiki - API Protocols](https://support.openaire.eu/projects/openaire/wiki/API_protocols) + +For additional details about the aggregation workflows, please refer to [2]. + + +## References + +[1] Manghi, P., Artini, M., Atzori, C., Bardi, A., Mannocci, A., La Bruzzo, S., Candela, L., Castelli, D. and Pagano, P. (2014), “The D-NET software toolkit: A framework for the realization, maintenance, and operation of aggregative infrastructures”, Program: electronic library and information systems, Vol. 48 No. 4, pp. 322-354. [doi:10.1108/prog-08-2013-0045](http://doi.org/10.1108/prog-08-2013-0045) + +[2] Atzori, C., Bardi, A., Manghi, P., & Mannocci, A. (2017, January). "The OpenAIRE workflows for data management". In Italian Research Conference on Digital Libraries (pp. 95-107). Springer, Cham. 
[doi:10.1007/978-3-319-68130-6_8](https://doi.org/10.1007/978-3-319-68130-6_8) \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/compatible-sources.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/compatible-sources.md new file mode 100644 index 0000000..48d831e --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/compatible-sources.md @@ -0,0 +1,11 @@ +--- +sidebar_position: 1 +--- + +# OpenAIRE compatible sources + +The OpenAIRE aggregator collects metadata records from content providers compliant to the OpenAIRE guidelines. + +The OpenAIRE Guidelines help repository managers expose publications, datasets and CRIS metadata via the OAI-PMH protocol in order to integrate with OpenAIRE infrastructure. + +You can find more information in https://guidelines.openaire.eu/en/latest/ \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/datacite.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/datacite.md new file mode 100644 index 0000000..e1fd166 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/datacite.md @@ -0,0 +1,77 @@ +# Datacite +This section describes the aggregation workflow used to gather the bibliographic material from Datacite and the relative mapping. + +## Datacite datasource +[Datacite](https://datacite.org/index.html) is a leading global non-profit organisation that provides persistent identifiers (DOIs) for research data and other research outputs. + +## Datacite API +The [DataCite REST API](https://support.datacite.org/docs/api) allows users to retrieve, query, and browse Datacite metadata records. In particular, it exposes a method for harvesting new records incrementally. 
+ +``` +https://api.datacite.org/dois?page[cursor]=$CURSOR&page[size]=$NUMBER_OF_ITEM_PER_PAGE&query=updated:[$FROM_DATE_TIMESAMP TO $TO_DATE_TIMESAMP] +``` + +In this API request, we introduce some variables: +- **CURSOR**: The value of the cursor to iterate the pages; the cursor is extracted from each API response and used in the next request. +- **NUMBER_OF_ITEM_PER_PAGE**: (max 1000) defines how many records must be returned within each API response. +- **FROM_DATE_TIMESAMP, TO_DATE_TIMESAMP**: the bounds of the time interval in which the returned records were last updated. + +Each record contains two pieces of information needed for incremental harvesting: +- **isActive**: tells if the record is deleted (`isActive:false`) +- **updated**: timestamp of last update + +## Collection Workflow + +The collection workflow is responsible for aggregating new records. Each record is stored locally in a table with the following schema: +- **DOI**: The DOI of the Datacite record (it is the primary key) +- **update_timestamp**: the last update date timestamp +- **json**: the native record JSON + +The metadata collection process identifies the most recent record date available locally and uses that date to request records from the Datacite API, populating the **FROM_DATE_TIMESAMP** variable. The records in the API response are included in the local storage in upsert mode. + +## Datacite Mapping + +### Entity Mapping + +The table below describes the mapping from the Datacite JSON records to the OpenAIRE Graph dump format. 
+ +| OpenAIRE Result field path | Datacite record JSON path | # Notes | +|--------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | +| | | Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. | +| `type` | | Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities: | +| `pid` | `\attributes\doi` | `scheme = doi` | +| `originalid` | `\attributes\doi` | | +| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | +| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. 
**If the record has no Creator it will be skipped** | +| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | +| `author.rank` | | Incremental index starting from 1 | +| `author.name` | `\attributes\creators\givenName` | | +| `author.surname` | `\attributes\creators\familyName` | | +| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | +| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | +| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | +| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | +| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | +| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | +| `publicationdate` | `\attributes\dates` | where `dateType` is **issued** | +| `publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | +| `embargoenddate` | `\attributes\dates` | where `dateType` is **available** | +| `subjects` | `\attributes\subject` | `scheme=keywords` | +| `description` | `\attributes\descriptions` | | +| `publisher` | `\attributes\publisher` | | +| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` | +| `publisher` | `\attributes\publisher` | | +| `instance.license` | `\attributes\rightsList` | if the rights value starts with http and matches a particular regex | +| `instance.accessright` | `\attributes\rightsList` | | + +### Relation Mapping + +| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Target 
type | #Notes | +|----------------------------------------|---------------------------------------|---------------------|------------------------------------------------------------------------------------------------------------| +| `isProducedBy/produces` | `attributes\fundingReferences` | `result/project` | only when the fundingReferences matches the pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)` | +| `IsProvidedBy/provides` | | `result/datasource` | Datasource is always set to `Datacite` | +| `isHostedBy/host` | `\attributes\relationships\client\id` | `result/datasource` | we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | +| `isRelatedTo` | `\attributes\relatedIdentifiers` | `result/result` | we create relationships whenever the pid of the target is resolved on the Research Graph | + + diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/doiboost.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/doiboost.md new file mode 100644 index 0000000..e79e384 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/doiboost.md @@ -0,0 +1,253 @@ +# DOIBoost: Crossref, Unpaywall, Microsoft Academic Graph, ORCID + +DOIBoost is a dataset that combines research outputs and links among them from a selection of data sources. +It enriches the records available on Crossref with what's available on Unpaywall, Microsoft Academic Graph, ORCID intersecting all those datasets by DOI. +As a consequence, DOIBoost does not contain any record from MAG, Unpaywall, or ORCID that doesn't provide a DOI available in Crossref. 
+ +Each Crossref record is enriched with: +* ORCID identifiers of authors from ORCID +* Open Access instance (with OA color/route and license) from Unpaywall +* the following information from MAG: + * abstracts + * MAG identifiers of authors + * affiliation (result - organization) relationships + * subjects (MAG FieldsOfStudy) + * conference or journal information + +The Open Access status is also set by intersecting the journal information of a record with the journal lists available from DOAJ and the Gold ISSN list. + +## Inputs + +* *Crossref*: dump available to Crossref subscribers via MetadataPlus service, updated once a month. +* *Microsoft Academic Graph*: version downloaded on 2021-02-15. We plan to take the latest version in Dec 2021 before MAG is retired. +* *ORCID*: baseline dump obtained on 2020-10-13, regularly updated every week from the [ORCID public API](https://info.orcid.org/documentation/features/public-api). +* *Unpaywall*: public database snapshot downloaded in March 2021. Unpaywall updates it twice a year (https://unpaywall.org/products/snapshot) + +The construction of the DOIBoost dataset consists of the following phases: + +## Process + +The following section describes the processing steps needed to build DOIBoost starting from the input data. 
+ +### Crossref filtering + +Records in Crossref are ruled out according to the following criteria + +* have blank title, examples: + * `10.1093/rheumatology/41.7.837` + * `10.1093/qjmed/95.7.430` + * `10.1371/journal.pone.0171434.g005` +* have one of the following publishers: `"Test accounts"`, `"CrossRef Test Account"` + * Examples from https://api.crossref.org/works?query.publisher-name=%22Test%20accounts%22 + * `10.1007/bf00344543` + * `10.1007/bf00186154` + * `10.1306/64ed947a-1724-11d7-8645000102c1865d` +* have no authors with valid names, where valid means: not blank and different from all strings in this list: `List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")` + * Examples for blank authors: + * `10.1108/00070709810247807` + * `10.1016/s1074-9098(02)00346-5` + * `10.1136/heart.88.1.6` + * Examples for `"none"` author from https://api.crossref.org/works?query.author=%22none%22 + * `10.4007/annals.2016.184.3.11` + * `10.4007/annals.2012.176.1.6` + * `10.2172/6393585` + * Examples for `"test"` author from https://api.crossref.org/works?query.author=%22test%22 + * `10.5116/ijme.54ca.a5ae` + * `10.5755/j01.ss.71.2.544` + * `10.5755/j01.ee.22.2.319` +* have `"Addie Jackson"` as author and `"Elsevier BV"` as publisher (empirically we say they are test records) + * Examples from https://api.crossref.org/works?query.author=Addie+Jackson&query.publisher-name=%22Elsevier%20BV%22 + * `10.2139/ssrn.2082156` + * `10.2139/ssrn.2202300` + * `10.2139/ssrn.2255657` +* have not one of the following values in the field `type` : `"book-section"`, `"book"`, `"book-chapter"`, `"book-part"`, `"book-series"`, `"book-set"`, `"book-track"`, `"edited-book"`, `"reference-book"`, `"monograph"`, `"journal-article"`, `"dissertation"`, `"other"`, `"peer-review"`, `"proceedings"`, `"proceedings-article"`, `"reference-entry"`, `"report"`, `"report-series"`, `"standard"`, `"standard-series"`, `"posted-content"`, `"dataset"`, + * 
Example: + * `10.1371/journal.pone.0171434.g005` + * `10.7554/elife.21052.049` + * `10.1371/journal.pcbi.1005379.s006` + +Records with `type=dataset` are mapped into OpenAIRE results of type dataset. All others are mapped as OpenAIRE results of type publication. + +### Mapping Crossref properties into the OpenAIRE Graph + +Properties in OpenAIRE results are set based on the logic described in the following table: + +| OpenAIRE Result field path | Crossref path(s) | Notes | +|----------------------------------------|--------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `doi` | id in the form `doi_________::md5(doi)` | +| `dateofcollection` | `indexed.datetime` | | +| `lastupdatetimestamp` | `indexed.timestamp` | | +| `type` | `type` | `dataset` if the Crossref type is dataset, `publication` otherwise (based on the filtering logics described above) | +| `originalId` | `doi, clinical-trial-number, alternative-id` | | +| `pid` | | The scheme tells the type of PID, the value contains the actual value | +| `pid.scheme` | | Default value: doi | +| `pid.value` | `doi` | The doi is normalised and lower-cased | +| `maintitle` | `title` | | +| `subtitle` | `subtitle` | | +| `author` | `author` | if available the sequence is mapped to rank and the ORCID is also mapped | +| `author.name` | `author.given` | | +| `author.surname` | 
`author.family` | | +| `author.fullname` | `author.given author.family` | | +| `author.rank` | | based on the order, starts from 1 | +| `author.pid` | | only if the ORCID is available | +| `author.pid.id.scheme` | | Default `'pending_orcid'` (meaning that it is not an id confirmed by ORCID) | +| `author.pid.id.value` | `author.ORCID` | | +| `author.pid.provenance.provenance` | | Default 'Harvested' | +| `author.pid.provenance.trust` | | Default '0.9' | +| `description` | `abstract` | | +| `subject` | `subject` | with `classid='keywords'`, i.e. no controlled vocabularies for Crossref subjects | +| `publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | +| `publisher` | `publisher` | | +| `source` | `source` | only if the record is not of type `book` | +| `source` | concatenation of `container-title.head` + `"ISBN: "` + `ISBN.head` | only if the record is of type `book` | +| `container` | | It is set only for publications with information about the journal it was published in. | +| `container.name` | `container-title.head` | | +| `container.issnOnline` | `issn-type.value` | if `issn-type.type='electronic'` | +| `container.issnPrinted` | `issn-type.value` | if `issn-type.type='print'` | +| `container.vol` | `volume` | | +| `container.sp` | `page` | before `'-'` | +| `container.ep` | `page` | after `'-'` | +| `instance` | | One instance is created with the DOI URL | +| `instance.accessright` | | Values in `instance.accessright.code` and `instance.accessright.label` are set based on license and dateofacceptance:
- `UNKNOWN`: if the license is blank
- `OPEN ACCESS`: if the license is a CC license or an ACS license or an APA license (considered OPEN also by Unpaywall, see [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44002063718-what-is-an-oa-license-) for details) or if OUP license, but only after 12 months from the publication date
- `EMBARGO`: OUP license, before 12 months from the publication date
- `CLOSED`: if there is a license not covered by the previous cases | +| `instance.accessright.code` | | Code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.label` | | One of: `OPEN`, `RESTRICTED`, `CLOSED`, `EMBARGO` | +| `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.openAccessRoute` | | only if `instance.accessright.value = 'OPEN ACCESS'`. Default is `hybrid`. The route is fixed in subsequent phases of DOIBoost, namely when intersecting with Unpaywall and patching the hostedby via DOAJ and the Gold-ISSN list. | +| `instance.license` | `license.URL ` | If there is a `license.content-version='vor'`, then this is used. Otherwise the first license entry is used. | +| `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | +| `instance.pid.scheme` | | Default value: `doi` | +| `instance.pid.value` | `doi` | The doi is normalised and lower-cased | +| `instance.publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | +| `instance.refereed` | | set to `peerReviewed` only if `relation.has-review.id` is not empty, `UNKNOWN` otherwise. | +| `instance.type` | `subtype` | mapped using the [OpenAIRE vocabulary for result typologies](https://api.openaire.eu/vocabularies/dnet:result_typologies) | +| `instance.url` | `doi` | Full URL of the DOI | + +All other fields of the Json schema not mentioned in the table contain empty values. + +All the records from Crossref are related to the datasource with `name=Crossref` and `id=openaire____::081b82f96300b6a6e3d282bad31cb6e2` + +Possible improvements: +* map `clinical-trial-number` and `alternative-id` in `alternateIdentifiers`? 
+* Verify if Crossref has a property for `language`, `country`, `container.issnLinking`, `container.iss`, `container.edition`, `container.conferenceplace` and `container.conferencedate` +* Different approach to set the `refereed` field and improve its coverage? + +### Map Crossref links to projects/funders + +Links to funding available in Crossref are mapped as funding relationships (`result -- isProducedBy -- project`) applying the following mapping: + +| Funder | Grant code | Link to | +|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| DOI: `{10.13039/100010663, 10.13039/100010661, 10.13039/501100007601, 10.13039/501100000780, 10.13039/100010665}` or name: `'European Union’s Horizon 2020 research and innovation program'` | series of `4-9` digits in `award` | Link to H2020 project | +| DOI: `{10.13039/100011199, 10.13039/100004431, 10.13039/501100004963, 10.13039/501100000780}` | series of `4-9` digits in `award` | Link to FP7 project | +| DOI: `10.13039/501100000781` OR name: `'European Union's'` | series of `4-9` digits in `award` | Link to FP7 or H2020 project | +| DOI: `10.13039/100000001` | `award` | Link to NSF project | +| DOI: `10.13039/501100001665` OR name: `{'The French National Research Agency (ANR)', 'The French National Research Agency'}` | `award` | Link to ANR project | +| DOI: `10.13039/501100002341` | `award` | Link to Academy of Finland project | +| DOI: `10.13039/501100001602` | `award`, removing the initial 'SFI' if present | Link to SFI project | +| DOI: `10.13039/501100000923` | `award` | Link to ARC project | +| DOI: `10.13039/501100000038` | `award` ignore: we 
cannot map the project codes in Crossref to project codes in OpenAIRE | Link to NSERC (`unidentified` project) | +| DOI: `10.13039/501100000155` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to SSHRC (`unidentified` project) | +| DOI: `10.13039/501100000024` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to CIHR (`unidentified` project) | +| DOI: `10.13039/501100002848` OR name :`'CONICYT, Programa de Formación de Capital Humano Avanzado'` | `award` | Link to CONICYT project | +| DOI: `10.13039/501100003448` | series of `4-9` digits in award | Link to GSRT project | +| DOI: `10.13039/501100010198` | `award` | Link to SGOV project | +| DOI: `10.13039/501100004564` | series of `4-9` digits in award | Link to MESTD project | +| DOI: `10.13039/501100003407` | `award` | Link to MIUR project. Since OpenAIRE has a small subset of MIUR projects, a link to the MIUR funder (`unidentified`
project) is also generated | +| DOI: `{10.13039/501100006588, 10.13039/501100004488}` | `award`, removing `'Project No'` and `'HRZZ'` prefix, if present | Link to HRZZ or MZOS project | +| DOI: `10.13039/501100006769` | `award` | Link to Russian Science Foundation project | +| DOI: `10.13039/501100001711` | `award` after `'_'` and before `'/'` | Link to SNSF project | +| DOI: `10.13039/501100004410` | `award` | Link to TUBITAK project | +| DOI: `10.13039/100004440` or name: `Wellcome Trust Masters Fellowship` | `award` | Link to Wellcome Trust specific project and to the `unidentified` project. | + +### Intersect Crossref with UnpayWall by DOI + +The fields we consider from UnpayWall are: +* `is_oa` +* `best_oa_location` +* `oa_status` + +The results of Crossref that intersect by DOI with UnpayWall records are enriched with one additional `instance` with the following properties: + +| OpenAIRE Result field path | Unpaywall field path | Notes | +|----------------------------------------|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `instance` | | created only if `is_oa` and a `best_oa_location` is available | +| `instance.accessright` | | default value `Open Access`: we do not add instances if UnpayWall says there is no open version | +| `instance.accessright.code` | | Open Access code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.label` | | Always `OPEN` | +| `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. 
the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.openAccessRoute` | `oa_status` | | +| `instance.url` | `best_oa_location` | | +| `instance.license` | `best_oa_location.license` | | +| `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | +| `instance.pid.scheme` | | Default value: `doi` | +| `instance.pid.value` | `doi` | The doi is normalised and lower-cased | + +For the definition of UnpayWall's `oa_status` refer to the [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44001777288-what-do-the-types-of-oa-status-green-gold-hybrid-and-bronze-mean-) + +The record will also feature a relation to the UnpayWall data source: `name="UnpayWall"`, `id=openaire____::8ac8380272269217cb09a928c8caa993`. + +### Intersect with ORCID + +The fields we consider from ORCID are: +* `doi` +* `authors`, a list of authors, each with optional `name`, `surname`, `creditName`, `oid` + +| OpenAIRE field path | ORCID path | Notes | +|------------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------| +| `pid` | `doi` | | +| `author.name` | `capitalize(name)` | only mapped if not blank | +| `author.surname` | `capitalize(surname)` | only mapped if not blank | +| `author.fullname` | | if name and surname are not blank, they are concatenated (`capitalize(name) capitalize(surname)`), otherwise we use the `creditName` | +| `author.pid` | | only if the `ORCID` is available | +| `author.pid.id.scheme` | | Default `orcid` (meaning that it is confirmed by ORCID, in contrast to the `orcid_pending` set from Crossref and Unpaywall) | +| `author.pid.id.value` | `oid` | | +| `author.pid.provenance.provenance` | | Default `Harvested` | +| `author.pid.provenance.trust` | | Default `0.9` | + +The records are enriched with 
the ORCID identifiers of their authors. + +[//]: # (TODO: Update with the new approach implemented by Miriam.) + +The current approach is: +* if the number of authors from Crossref equals the number of authors from ORCID, then we pick the list of authors with more PIDs and try to enrich it with the PIDs from the other list, based on Jaro-Winkler distance on authors' names, surnames, or fullnames, depending on which properties are available; +* if the numbers of authors differ, then we take the longest list and try to enrich it with the PIDs from the other author list, based on Jaro-Winkler distance on authors' names, surnames, or fullnames, depending on which properties are available + +Miriam will modify the process to ensure that: +* the list of authors from Crossref always "wins" +* the identifiers from ORCID "win" + +### Intersect with Microsoft Academic Graph + +*Important Notes* +* Only papers with DOI are considered +* Since for the same DOI we have multiple versions of an item with different MAG PaperId, we only take one per DOI (the last one we process). We call this dataset `Papers_distinct` + +When mapping MAG records to the OpenAIRE Graph, we consider the following MAG tables: +* `PaperAbstractsInvertedIndex`: for the paper abstracts +* `Authors`: for the authors. The MAG data is pre-processed by grouping authors by PaperId +* `Affiliations` and `PaperAuthorAffiliations`: to generate links between publications and organisations +* `Journals` and `ConferenceInstances`: joined with `Papers_distinct` to have the information about the venues where the paper was published +* TO BE REMOVED `PaperUrls`: to create one instance for the OpenAIRE publication +* `FieldsOfStudy`: to add subjects + +The records are enriched with: +* abstracts +* MAG identifiers of authors +* affiliation relationships +* subjects (MAG FieldsOfStudy) +* conference or journal information (in the `journal` field) TODO: or `container`, in case of the dump? 
+* [TO BE REMOVED] instances with URL from MAG + +### Enrich DOIBoost3 with hosting data sources (`hostedby`) and access right information + +In this phase, we intersect DOIBoost3 with a dataset composed of journals from OpenAIRE, Crossref, and the ISSN gold list. Each journal comes with its International Standard Serial Numbers (`issn`, `eissn`, `lissn`) and, when available, a flag that tells if the journal is open access. The intersection is done on the basis of the International Standard Serial Numbers. The records with a `journal.[l|e]issn` that match are enriched as follows: +* Each instance gains the `hostedby` information corresponding to the journal +* If the journal is open access, the access rights of the instances are also set to `Open Access` with `gold` route (because by construction, the journals we know are open are from DOAJ or Gold ISSN list) + +The hostedby of records that do not match is set to the `Unknown Repository`. + +## References + +The idea behind DOIBoost and its origin can be found in the paper (and related resources) at: + +* La Bruzzo S., Manghi P., Mannocci A. (2019) OpenAIRE's DOIBoost - Boosting CrossRef for Research. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, doi:10.1007/978-3-030-11226-4_11 . 
Open Access version available at: [10.5281/zenodo.1441071](https://doi.org/10.5281/zenodo.1441071) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/ebi.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/ebi.md new file mode 100644 index 0000000..f5abf7a --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/ebi.md @@ -0,0 +1,94 @@ +# EMBL-EBIs Protein Data Bank in Europe + +This section describes the mapping implemented for [EMBL-EBIs Protein Data Bank in Europe](https://www.ebi.ac.uk/). + +The Europe PMC RESTful Web Service gives the [datalinks API](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API) to retrieve data-literature links in Scholix format. + +## How the data is collected + +Starting from the Pubmed collection, the API below is used to obtain the bioentities related to publications for each PubMed identifier. + +Example: + +```commandline +curl -s "https://www.ebi.ac.uk/europepmc/webservices/rest/MED/33024307/datalinks?format=json" | jq '.' 
+{ + "version": "6.8", + "hitCount": 9, + "request": { + "id": "33024307", + "source": "MED" + }, + "dataLinkList": { + "Category": [ + { + "Name": "Nucleotide Sequences", + "CategoryLinkCount": 5, + "Section": [ + { + "ObtainedBy": "tm_accession", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 5, + "Linklist": { + "Link": [ + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "AY278488", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:AY278488" + }, + "Title": "AY278488", + "Publisher": { + "Name": "Europe PMC" + } + }, + [...] +``` + +## Mapping +The table below describes the mapping from the EBI links records to the OpenAIRE Graph dump format. 
+We filter all the target links with pid type **ena**, **pdb** or **uniprot** +For each target we construct a Bioentity with the following mapping + + +| OpenAIRE Result field path | EBI record field xpath | Notes | +|-----------------------------|----------------------------------------------------------|---------------------------------------------------------------| +| `id` | `target/identifier/ID` and `target/identifier/IDScheme` | id in the form `SCHEMA_________::md5(pid)` | +| `pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| `publicationdate` | `target/PublicationDate` | clean and normalize the format of the date to be `YYYY-mm-dd` | +| `maintitle` | `target/Title` | | +| **Instance Mapping** | | | +| `instance.type` | | `Bioentity` | +| `type` | | `Dataset` | +| `instance.pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| `instance.url` | `target/identifier/IDURL` | Copy the value as it is | +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | + + +### Relation Mapping +| OpenAIRE Relation Semantic and inverse | Source/Target type | Notes | +|----------------------------------------|---------------------|--------------------------------------------------------------------------| +| `IsRelatedTo` | `result/result` | we create relationships between the BioEntity and the pubmed publication | diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/pubmed.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/pubmed.md new file mode 100644 index 0000000..a6df81d --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/pubmed.md @@ -0,0 +1,44 @@ +# PubMed + +This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/). 
+ +## Input + +The native data is collected from the [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) site. +It contains XML records compliant with the schema available at [www.nlm.nih.gov](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html). + +## Incremental harvesting +Pubmed exposes an entry point FTP with all the updates for each one. [ftp baseline update](https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/). We collect the new file and generate the new dataset by upserting the existing item. + +## Entity Mapping + +The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. + +| OpenAIRE Result field path | PubMed record field xpath | Notes | +|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Publication Mapping** | | | +| `id` | `//PMID` | id in the form `pmid_________::md5(pmid)` | +| `pid` | `//PMID` | `classid = classname = pmid` | +| `publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | +| `maintitle` | `//Title` | | +| `description` | `//AbstractText` | | +| `language` | `//Language` | cleaning vocabulary -> dnet:languages | +| `subjects` | `//DescriptorName` | classId, className = keyword | +| **Author Mapping** | | | +| `author.surname` | `//Author/LastName` | | +| `author.name` | `//Author/ForeName` | | +| `author.fullname` | `//Author/FullName` | Concatenation of forename + lastName if exist | +| `author.rank` | FOR ALL AUTHORS | sequential number starting from 1 | +| **Journal Mapping** | | | +| `container.conferencedate` | `//Journal/PubDate` | map the date of 
the Journal | +| `container.name` | `//Journal/Title` | name of the journal | +| `container.vol` | `//Journal/Volume` | journal volume | +| `container.issPrinted` | `//Journal/ISSN` | the journal issn | +| `container.iss` | `//Journal/Issue` | The journal issue | +| **Instance Mapping** | | | +| `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article` then we apply this type else We have to find a terms that match the vocabulary otherwise we discard it | +| `type` | | Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities: | +| `instance.pid` | `//PMID` | map the pmid in the pid in the instance | +| `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | +| `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/uniprot.md b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/uniprot.md new file mode 100644 index 0000000..47fc7bc --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/aggregation/non-compatible-sources/uniprot.md @@ -0,0 +1,31 @@ +# UniProtKB/Swiss-Prot + +This section describes the mapping implemented to integrate metadata and links from [UniProtKB/Swiss-Prot](https://www.uniprot.org/). +The complete data dump "Reviewed (Swiss-Prot)" can be downloaded from [here](https://www.uniprot.org/help/downloads). + +From this dataset, only the protein records linked to a PubMed publication are extracted. + +## Entity Mapping + +The table below describes the mapping from the TEXT metadata format to the OpenAIRE Graph dump format. 
+You can check an example of the text metadata [here](https://rest.uniprot.org/uniprotkb/A0A0C5B5G6.txt) + +| OpenAIRE Result field path | FASTA record field xpath | Notes | +|------------------------------|--------------------------------------------------------------------------|------------------------------------------------------------------------------------------| +| **BIOEntity Mapping** | | | +| `id` | `LINE Starts with AC` | id in the form `uniprot_____::md5(id)` | +| `pid` | `LINE Starts with AC` | example `AC A0A0C5B5G6;` classid=classname=`uniprot` the value is the text after `AC` | +| `publicationdate` | `LINE START WITH DT containing text integrated into UniProtKB/Swiss-Prot` | clean and normalize the format of the date to be `YYYY-mm-dd` | +| `maintitle` | `LINE START WITH GN` | main title | +| **Instance Mapping** | | | +| `instance.type` | | `Bioentity` | +| `type` | | `Dataset` | +| `instance.pid` | `LINE Starts with AC` | `classid = classname = uniprot` | +| `instance.url` | `pid` | prepend to the value `https://www.uniprot.org/uniprot/` | +| `instance.publicationdate` | `LINE START WITH DT containing text integrated into UniProtKB/Swiss-Prot` | clean and normalize the format of the date to be YYYY-mm-dd | + + +### Relation Mapping +| OpenAIRE Relation Semantic and inverse | Source/Target type | Notes | +|----------------------------------------|----------------------|--------------------------------------------------------------------------------------------------------------------------| +| `IsRelatedTo` | `LINE START WITH RX` | the mapping creates relationships between the BioEntity and the PubMed or DOI generating an unresolved target identifier | \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/cleaning.md b/versioned_docs/version-6.0.0/graph-production-workflow/cleaning.md new file mode 100644 index 0000000..e920026 --- /dev/null +++ 
b/versioned_docs/version-6.0.0/graph-production-workflow/cleaning.md @@ -0,0 +1,37 @@ +# Cleaning + + + + +The aggregation processes run independently one from another and continuously. Each aggregation process, depending on the characteristics of the records exposed by the data source, makes use of one or more vocabularies to harmonise the values available in a given field. +In this page, we describe the *vocabulary-based cleaning* operation performed to harmonise the data of the different data sources. +A vocabulary is a data structure that defines a list of terms, and for each term defines a list of synonyms: + +```xml + + + + + + + [...] + + + + + + + + + + [...] +``` + +Each vocabulary is typically used to control and harmonise the values available in a specific field characterising the bibliographic records. The example above provides a preview of the vocabulary used to clean the [result's instance typology](/data-model/entities/result#instance). + +The content of the vocabularies can be accessed on [api.openaire.eu/vocabularies](https://api.openaire.eu/vocabularies/). + +Given a value provided in the original records, the cleaning process looks for a synonym and, when found, resolves the corresponding term which is used in turn to build the cleaned record. +Each aggregation process applies vocabularies according to their definitions in a given moment of time, however, it could be the case that a vocabulary changes after the aggregation of one data source has finished, thus the aggregated content does not reflect the current status of the controlled vocabularies. + +In addition, the integration of ScholeXplorer and DOIBoost and some enrichment processes applied on the raw and on the de-duplicated graph may introduce values that do not comply with the current status of the OpenAIRE controlled vocabularies. For these reasons, we included a final step of cleansing at the end of the workflow materialisation. 
\ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduction-and-propagation/bulk-tagging.md b/versioned_docs/version-6.0.0/graph-production-workflow/deduction-and-propagation/bulk-tagging.md new file mode 100644 index 0000000..f52188e --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduction-and-propagation/bulk-tagging.md @@ -0,0 +1,37 @@ +# Deduction + +The Deduction process (also known as “bulk tagging”) enriches each record with new information that can be derived from the existing property values. + +This process is used to associate results to community/research initiatives that are part of OpenAIRE. +As of November 2022, three procedures are in place to relate a research product to a research initiative, infrastructure (RI) or community (RC) based on: + +* subjects: it is possible to specify a list of subjects that are relevant for the RC/RI. Every time one of the subjects is found among the subjects of a result, the result is linked to the RC/RI. + +

+ Bulktagging Subject +

+ + +* data sources: it is possible to list a set of data sources relevant for the RC/RI. All the results collected from these data sources will be linked to the RC/RI +

+ Bulktagging Data source +

+ + When only some results collected from a datasource are relevant for the RC/RI, it is possible to specify a set of selection constraints (SC) that have to be verified before linking the result to the +community. The selection constraint has the form SC = S1 or S2 or ... or Sn. The generic Si has the form Si = si1 and si2 and ... and sin, and each sij is a condition on a specific field of the result. The set of fields that can be specified is F={title, author, contributor, description, orcid}, +while the condition can be chosen among V={contains, equals, not_contains, not_equals, contains_ignorecase, equals_ignorecase, not_contains_ignorecase, not_equal_ignorecase}, and the value is free text. +A possible selection criterion is: “All the products whose contributor contains DARIAH” + +

+ Bulktagging Data source +

+ +* Zenodo community: it is possible to list a set of Zenodo communities relevant for the RC/RI. All the products collected from the listed Zenodo communities are linked to the RC/RI + + +

+ Bulktagging Zenodo Community +

+ + +The list of subjects, Zenodo communities and data sources used to enrich the products is defined by the managers of the community gateway or infrastructure monitoring dashboard associated with the RC/RI. diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduction-and-propagation/propagation.md b/versioned_docs/version-6.0.0/graph-production-workflow/deduction-and-propagation/propagation.md new file mode 100644 index 0000000..79f0902 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduction-and-propagation/propagation.md @@ -0,0 +1,55 @@ +# Propagation + +This process enriches the graph by adding new links and/or new properties. The new information is added by exploiting existing semantic +relationships and values between the involved entities. + +As of November 2022, the following procedures are in place: + +* Country propagation: updates the property “country” of a result. This happens when the result is collected from an institutional datasource or when the datasource hosting the result is inserted in a whitelist. For all the results whose hosting datasource verifies one of the conditions above, the country of the organization providing the datasource is added to the country of the result: e.g. a publication collected from an institutional repository maintained by an Italian university will be enriched with the property “country = IT”. +

+ Country Propagation +

+ +* Project propagation: adds a "isProducedBy" relationship (and its inverse) between a Project P and Result R1, if R1 has a strong semantic relationship with another Result R2 and P produces R2: e.g. publication linked to project P “is supplemented by” a dataset D. Dataset D will get the link to project P. The relationships considered for this procedure are “isSupplementedBy” and “isSupplementTo”. +

+ Project Propagation +

+* Result to RC/RI through organization propagation. The manager of the RC/RI can specify a set of organizations whose products are relevant for the +community. +Each result having an affiliation relation with at least one organization relevant for the RC/RI will be linked to it. +

+ Result to community through organization propagation +

+ +* Result to RC/RI through semantic relation: extends the set of products linked to a RC/RI by exploiting strong semantic relationships between the results; +e.g. if a result R1 is associated to the community C and is supplemented by a result R2 then the result R2 will be linked to the community. The relationships considered for this procedure are “isSupplementedBy” and “supplements”. +

+ Result to community through semantic relation propagation +

+* ORCID identifiers to result through semantic relation. This propagation enriches the results by adding ORCID identifiers to authors. The added ORCID will be marked as "potential" since they have been inserted through propagation. +The process considers the set of overlapping authors between results (R1 and R2) linked with a strong semantic relationship (IsSupplementedBy, IsSupplementTo). +For each author A in the overlapping set, if R1 provides the ORCID value for A and R2 does not, then the author A in R2 will be enriched with the information of the ORCID found in R1. + +

+ Orcid propagation through semantic relation +

+ +* affiliation to organization through institutional repository. This propagation adds one "hasAuthorInstitution" relationship (and its inverse) +between a Result R and Organization O, +if R was collected from a datasource D with type institutional repository, and D was provided by O. +

+ Affiliation propagation through institutional repository +

+ +* affiliation to organization through semantic relation. This propagation adds one "hasAuthorInstitution" relationship (and its inverse) between a +Result R and an Organization O, +if R has an affiliation relation with an organization O1 that is in relation "isChildOf" with O. + +

+ Affiliation propagation through semantic relation +

+ The algorithm exploits only the leaf organizations that are in an "IsChildOf" relation with another organization. So far, only a single step is performed. +

+ propagation strategy +

\ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/_category_.json b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/_category_.json new file mode 100644 index 0000000..c80249b --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Deduplication", + "position": 2, + "link": { + "type": "doc", + "id": "deduplication" + } +} \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/clustering-functions.md b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/clustering-functions.md new file mode 100644 index 0000000..ded6c57 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/clustering-functions.md @@ -0,0 +1,93 @@ +--- +sidebar_position: 3 +--- +# Clustering functions + +## Ngrams + +It creates ngrams from the input field.
+``` +Example: +Input string: “Search for the Standard Model Higgs Boson” +Parameters: ngram length = 3, maximum number = 4 +List of ngrams: “sea”, “sta”, “mod”, “hig” +``` + +## NgramPairs + +It produces a list of concatenations of a pair of ngrams generated from different words.
+``` +Example: +Input string: “Search for the Standard Model Higgs Boson” +Parameters: ngram length = 3 +Ngram pairs: “seasta”, “stamod”, “modhig” +``` + +## SuffixPrefix + +It produces ngram pairs in a particular way: it concatenates the suffix of a word with the prefix of the next word in the input string. A specialization of this function is available as SortedSuffixPrefix. It returns a sorted list.
+``` +Example: +Input string: “Search for the Standard Model Higgs Boson” +Parameters: suffix and prefix length = 3, maximum number = 2 +Output list: “ardmod” (suffix of the word “Standard” + prefix of the word “Model”), “rchsta” (suffix of the word “Search” + prefix of the word “Standard”) +``` + +## Acronyms + +It creates a number of acronyms out of the words in the input field.
+``` +Example: +Input string: “Search for the Standard Model Higgs Boson” +Output: "ssmhb" +``` + +## KeywordsClustering + +It creates keys by extracting keywords, out of a customizable list, from the input field.
+``` +Example: +Input string: “University of Pisa” +Output: "key::001" (code that identifies the keyword "University" in the customizable list) +``` + +## LowercaseClustering + +It creates keys by lowercasing the input field.
+``` +Example: +Input string: “10.001/ABCD” +Output: "10.001/abcd" +``` + +## RandomClusteringFunction + +It creates random keys from the input field.
+ +## SpaceTrimmingFieldValue + +It creates keys by trimming spaces in the input field.
+``` +Example: +Input string: “Search for the Standard Model Higgs Boson” +Output: "searchstandardmodelhiggsboson" +``` + +## UrlClustering + +It creates keys for a URL field by extracting the domain.
+``` +Example: +Input string: “http://www.google.it/page” +Output: "www.google.it" +``` + +## WordsStatsSuffixPrefixChain + +It creates keys containing concatenated statistics of the field, i.e. number of words, number of letters and a chain of suffixes and prefixes of the words.
+``` +Example: +Input string: “Search for the Standard Model Higgs Boson” +Parameters: mod = 10 +Output list: "5-3-seaardmod" (number of words + number of letters % 10 + prefix of the word "Search" + suffix of the word "Standard" + prefix of the word "Model"), "5-3-rchstadel" (number of words + number of letters % 10 + suffix of the word "Search" + prefix of the word "Standard" + suffix of the word "Model") +``` \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/deduplication.md b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/deduplication.md new file mode 100644 index 0000000..d4fb7d6 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/deduplication.md @@ -0,0 +1,47 @@ +# Deduplication + +Metadata records about the same scholarly work can be collected from different providers. Each metadata record can possibly carry different information because, for example, some providers are not aware of links to projects, keywords or other details. Another common case is when OpenAIRE collects one metadata record from a repository about a pre-print and another record from a journal about the published article. For the provision of statistics, OpenAIRE must identify those cases and “merge” the two metadata records, so that the scholarly work is counted only once in the statistics OpenAIRE produces. + +## Methodology overview + +The deduplication process can be divided into five different phases: +* Collection import +* Candidate identification (clustering) +* Duplicates identification (pair-wise comparisons) +* Duplicates grouping (transitive closure) +* Relation redistribution + +

+ Deduplication Workflow +

+ +[//]: # (Link to the image: https://docs.google.com/drawings/d/1lLLSU3wsWighmxGQMNMZbgP3mg3BfDVAGVLwt4_OFA8/edit?usp=sharing) + +### Collection import + +The nodes in the graph represent entities of different types. This phase is responsible for identifying all the nodes with a given type and making them available to the subsequent phases by representing them in the deduplication record model. + +### Candidate identification (clustering) + +Clustering is a common heuristic used to overcome the N x N complexity required to match all pairs of objects to identify the equivalent ones. The challenge is to identify a [clustering function](./clustering-functions) that maximizes the chance of comparing only records that may lead to a match, while minimizing the number of records that will not be matched while being equivalent. Since the equivalence function is to some level tolerant to minimal errors (e.g. switching of characters in the title, or minimal difference in letters), we need the clustering function to be not too precise (e.g. a hash of the title), but also not too flexible (e.g. random ngrams of the title). On the other hand, reality tells us that in some cases equality of two records can only be determined by their PIDs (e.g. DOI) as the metadata properties are very different across different versions and no [clustering function](./clustering-functions) will ever bring them into the same cluster. + +### Duplicates identification (pair-wise comparisons) + +Pair-wise comparisons are conducted over records in the same cluster following the strategy defined in the decision tree. A different decision tree is adopted depending on the type of the entity being processed. + +To further limit the number of comparisons, a sliding window mechanism is used: (i) records in the same cluster are lexicographically sorted by their title, (ii) a window of K records slides over the cluster, and (iii) records ending up in the same window are pair-wise compared. 
Each comparison produces a similarity relation when the pair of records matches. Such relations are subsequently used as input for the duplicates grouping stage. + +### Duplicates grouping (transitive closure) + +Once the similarity relations between pairs of records are drawn, the groups of equivalent records are obtained (transitive closure, i.e. “mesh”). From such sets a new representative object is obtained, which inherits all properties from the merged records and keeps track of their provenance. + +### Relation redistribution + +Relations involving nodes identified as duplicates are eventually marked as virtually deleted and used as templates for creating new relations pointing to the new representative record. +Note that nodes and relationships marked as virtually deleted are not exported. + +

+ Deduplication Workflow +

+ +[//]: # (Link to the image: https://docs.google.com/drawings/d/1cDEuVhWnSO8lUZs_Nd748vKfIPxg10jbwKSVZlv33Mg/edit?usp=sharing) \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/organizations.md b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/organizations.md new file mode 100644 index 0000000..c2c57e1 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/organizations.md @@ -0,0 +1,70 @@ +--- +sidebar_position: 2 +--- + +# Organizations + +The organizations in OpenAIRE are aggregated from different registries (e.g. CORDA, OpenDOAR, Re3data, ROR). In some cases, a registry provides organizations as entities with their own persistent identifier. In other cases, those organizations are extracted from other main entities provided by the registry (e.g. datasources, projects, etc.). + +The deduplication of organizations is enhanced by [OpenOrgs](https://orgs.openaire.eu), a tool that combines an automated approach for identifying duplicated instances +of the same organization record with a "humans in the loop" approach, in which the equivalences produced by a duplicate identification algorithm are suggested to data curators, who are in charge of validating them. +The data curation activity is twofold: on the one hand it pivots around the disambiguation task; on the other hand it aims to improve the metadata describing the organization records +(e.g. including the translated name, or a different PID) as well as to define the hierarchical structure of existing large organizations (i.e. universities comprising their departments, or large research centers with all their sub-units or sub-institutes). 
+ +Duplicates among organizations are therefore managed through three different stages: + * *Creation of Suggestions*: executes an automatic workflow that performs the deduplication and prepares new suggestions to be processed by the curators; + * *Curation*: manual editing of the organization records performed by the data curators; + * *Creation of Representative Organizations*: executes an automatic workflow that creates curated organizations and exposes them on the OpenAIRE Graph by using the curators' feedback from the OpenOrgs underlying database. + +The next sections describe the above mentioned stages. + +### Creation of Suggestions + +This stage executes an automatic workflow that carries out the *candidate identification* and the *duplicates identification* stages of the deduplication to provide suggestions for the curators in OpenOrgs. + +#### Candidate identification (clustering) + +To match the requirements of limiting the number of comparisons, OpenAIRE clustering for organizations aims at grouping records that would more likely be comparable. +It works with four functions: +* *URL-based function*: the function generates the URL domain when this is provided as part of the record properties from the organization's `websiteurl` field; +* *Title-based functions*: + * generate strings dependent on the keywords in the `legalname` field; + * generate strings obtained as an alternation of the function prefix(3) and suffix(3) (and vice versa) on the first 3 words of the `legalname` field; + * generate strings obtained as a concatenation of ngrams of the `legalname` field. + +#### Duplicates identification (pair-wise comparisons) + +For each pair of organizations in a cluster the following strategy (depicted in the figure below) is applied. +The comparison goes through the following decision tree: +1. *grid id check*: comparison of the grid ids. If the grid id is equivalent, then the similarity relation is drawn. 
If the grid id is not available, the comparison proceeds to the next stage; +2. *early exits*: comparison of the numbers extracted from the `legalname`, the `country` and the `website` url. No similarity relation is drawn in this stage; the comparison proceeds only if the compared fields satisfy the conditions of equivalence; +3. *city check*: comparison of the city names in the `legalname`. The comparison proceeds only if the legalnames share at least 10% of cities; +4. *keyword check*: comparison of the keywords in the `legalname`. The comparison proceeds only if the legalnames share at least 70% of keywords; +5. *legalname check*: comparison of the normalized `legalnames` with the `Jaro-Winkler` distance to determine if it is higher than `0.9`. If so, a similarity relation is drawn. Otherwise, no similarity relation is drawn. + +

+ Organization Decision Tree +

+ +[//]: # (Link to the image: https://docs.google.com/drawings/d/1YKInGGtHu09QG4pT2gRLEum4LxU82d4nKkvGNvRQmrg/edit?usp=sharing) + +### Data Curation + +All the similarity relations drawn by the algorithm involving the decision tree are exposed in OpenOrgs, where they are made available to the data curators to give feedback and to improve the organizations' metadata. +A data curator can: + * *edit organization metadata*: legalname, pid, country, url, parent relations, etc.; + * *approve suggested duplicates*: establish if an equivalence relation is valid; + * *discard suggested duplicates*: establish if an equivalence relation is wrong; + * *create similarity relations*: add a new equivalence relation not drawn by the algorithm. + +Note that if a curator does not provide feedback on a similarity relation suggested by the algorithm, then such a relation is considered as valid. + +### Creation of Representative Organizations + +This stage executes an automatic workflow that carries out the *duplicates grouping* stage to create representative organizations and to update them on the OpenAIRE Graph. Such organizations are obtained via transitive closure and the relations used come from the curators' feedback gathered on the OpenOrgs underlying Database. + +#### Duplicates grouping (transitive closure) + +Once the similarity relations between pairs of organizations have been gathered, the groups of equivalent organizations are obtained (transitive closure, i.e. “mesh”). From such sets a new representative organization is obtained, which inherits all properties from the merged records and keeps track of their provenance. + +The IDs of the representative organizations are obtained from the OpenOrgs Database that creates a unique ``openorgs`` ID for each approved organization. In case an organization is not approved by the curators, the ID is obtained by prepending the prefix ``pending_org`` to the MD5 of the first ID (given their lexicographical ordering). 
\ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/research-products.md b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/research-products.md new file mode 100644 index 0000000..4d68c25 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/deduplication/research-products.md @@ -0,0 +1,69 @@ +--- +sidebar_position: 1 +--- + +# Research results + +Duplicates among research results are identified among results of the same type (publications, datasets, software, other research products). If two duplicate results are aggregated one as a dataset and one as a software, for example, they will never be compared and they will never be identified as duplicates. +OpenAIRE supports different deduplication strategies based on the type of results. + +The next sections describe how each stage of the deduplication workflow is carried out for research results. + +### Candidate identification (clustering) + +To match the requirements of limiting the number of comparisons, OpenAIRE clustering for research products works with two functions: +* *DOI-based function*: the function generates the DOI when this is provided as part of the record properties; +* *Title-based function*: the function generates a key that depends on (i) the number of significant words in the title (normalized, stemming, etc.), (ii) modulo 10 of the number of characters of such words, and (iii) a string obtained as an alternation of the function prefix(3) and suffix(3) (and vice versa) on the first 3 words (2 words if the title only has 2). For example, the title ``Search for the Standard Model Higgs Boson`` becomes ``search standard model higgs boson`` with two keys ``5-3-seaardmod`` and ``5-3-rchstadel``. + +To give an idea, this configuration generates around 77Mi blocks, which we limited to 200 records each (only 15K blocks are affected by the cut), and entails 260Bi matches. 
+ +### Duplicates identification (pair-wise comparisons) + +Comparisons in a block are performed using a *sliding window* set to 50 records. The records are sorted lexicographically on a normalized version of their titles. The 1st record is compared against all the 50 following ones using the decision tree, then the second, etc. for an NlogN complexity. +A different decision tree is adopted depending on the type of the entity being processed. +Similarity relations drawn in this stage are subsequently used to perform the duplicates grouping. + +#### Publications + +For each pair of publications in a cluster the following strategy (depicted in the figure below) is applied. +The comparison goes through different stages: +1. *trusted pids check*: comparison of the trusted pid lists (in the `pid` field of the record). If at least 1 pid is equivalent, records match and the similarity relation is drawn. +2. *instance type check*: comparison of the instance types (indicating the subtype of the record, e.g. presentation, conference object, etc.). If the instance types are not compatible then the records do not match. Otherwise, the comparison proceeds to the next stage. +3. *untrusted pids check*: comparison of all the available pids (in the `pid` and the `alternateid` fields of the record). In every case, no similarity relation is drawn in this stage. If at least one pid is equivalent, the next stage will be a *soft check*, otherwise the next stage is a *strong check*. +4. *soft check*: comparison of the record titles with the Levenshtein distance. If the distance measure is above 0.9 then the similarity relation is drawn. +5. 
*strong check*: comparison composed by three substages involving the (i) comparison of the author list sizes and the version of the record to determine if they are coherent, (ii) comparison of the record titles with the Levenshtein distance to determine if it is higher than 0.99, (iii) "smart" comparison of the author lists to check if common authors are more than 60%. + +

+ Publications Decision Tree +

+ +[//]: # (Link to the image: https://docs.google.com/drawings/d/19SIilTp1vukw6STMZuPMdc0pv0ODYCiOxP7OU3iPWK8/edit?usp=sharing) + +#### Software +For each pair of software in a cluster the following strategy (depicted in the figure below) is applied. +The comparison goes through different stages: +1. *pids check*: comparison of the pids in the records. No similarity relation is drawn in this stage, it is only used to establish the final threshold to be used to compare record titles. If there is at least one common pid, then the next stage is a *soft check*. Otherwise, the next stage is a *strong check* +2. *soft check*: comparison of the record titles with Levenshtein distance. If the measure is above 0.9, then the similarity relation is drawn +3. *strong check*: comparison of the record titles with Levenshtein distance. If the measure is above 0.99, then the similarity relation is drawn + +

+ Software Decision Tree +

+ +[//]: # (Link to the image: https://docs.google.com/drawings/d/19gd1-GTOEEo6awMObGRkYFhpAlO_38mfbDFFX0HAkuo/edit?usp=sharing) + +#### Datasets and Other types of research products +For each pair of datasets or other types of research products in a cluster the strategy depicted in the figure below is applied. +The decision tree is almost identical to the publication decision tree, with the only exception of the *instance type check* stage. Since such type of record does not have a relatable instance type, the check is not performed and the decision tree node is skipped. + +

+ Dataset and Other types of research products Decision Tree +

+ +[//]: # (Link to the image: https://docs.google.com/drawings/d/1uBa7Bw2KwBRDUYIfyRr_Keol7UOeyvMNN7MPXYLg4qw/edit?usp=sharing) + +### Duplicates grouping (transitive closure) + +The general concept is that the field coming from the record with the higher "trust" value is used as reference for the field of the representative record. + +The IDs of the representative records are obtained by prepending the prefix ``dedup_`` to the MD5 of the first ID (given their lexicographical ordering). If the group of merged records contains a trusted ID (i.e. the DOI), also the ``doi`` keyword is added to the prefix. \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/_category_.json b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/_category_.json new file mode 100644 index 0000000..9ecbe8a --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Enrichment", + "position": 3, + "link": { + "type": "doc", + "id": "enrichment" + } +} \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/acks.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/acks.md new file mode 100644 index 0000000..eed8cb1 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/acks.md @@ -0,0 +1,30 @@ +--- +sidebar_position: 3 +--- + +# Extraction of acknowledged concepts + +***Short description:*** Scans the plaintexts of publications for acknowledged concepts, including grant identifiers (projects) of funders, accession numbers of bioentities, EPO patent mentions, as well as custom concepts that can link research objects to specific research communities and initiatives in OpenAIRE. 
+ +***Algorithmic details:*** +The algorithm processes the publication's fulltext and extracts references to acknowledged concepts. It applies pattern matching and string join between the fulltext and a target database which contains the title, the acronym and the identifier of the searched concept. + +***Parameters:*** +Concept titles, acronyms, and identifiers, publication's identifiers and fulltexts + +***Limitations:*** - + +***Environment:*** +Python, [madIS](https://github.com/madgik/madis), [APSW](https://github.com/rogerbinns/apsw) + +***References:*** +* Foufoulas, Y., Zacharia, E., Dimitropoulos, H., Manola, N., Ioannidis, Y. (2022). DETEXA: Declarative Extensible Text Exploration and Analysis. In: , et al. Linking Theory and Practice of Digital Libraries. TPDL 2022. Lecture Notes in Computer Science, vol 13541. Springer, Cham. [doi:10.1007/978-3-031-16802-4_9](https://doi.org/10.1007/978-3-031-16802-4_9) + +***Authority:*** ATHENA RC • ***License:*** CC-BY/CC-0 • ***Code:*** [iis/referenceextraction](https://github.com/openaire/iis/tree/master/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction) + + + + + + + diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/affiliation_matching.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/affiliation_matching.md new file mode 100644 index 0000000..539e51b --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/affiliation_matching.md @@ -0,0 +1,57 @@ +--- +sidebar_position: 1 +--- + +# Affiliation matching + +***Short description:*** The goal of the affiliation matching module is to match affiliations extracted from the pdf and xml documents with organizations from the OpenAIRE organization database. 
+ +***Algorithmic details:*** + +*The buckets concept* + +In order to get the best possible results, the algorithm should compare every affiliation with every organization. However, this approach would be very inefficient and slow, because it would involve the processing of the cartesian product (all possible pairs) of millions of affiliations and thousands of organizations. To avoid this, IIS has introduced the concept of buckets. A bucket is a smaller group of affiliations and organizations that have been selected to be matched with one another. The matching algorithm compares only these affiliations and organizations that belong to the same bucket. + +*Affiliation matching process* + +Every affiliation in a given *bucket* is compared with every organization in the same bucket multiple times, each time by using a different algorithm (*voter*). Each *voter* is assigned a number (match strength) that describes the estimated correctness of the result of its comparison. All the affiliation-organization pairs that have been matched by at least one *voter*, will be assigned the match strength > 0 (the actual number depends on the voters, its calculation method will be shown later). + +It is very important for the algorithm to group the affiliations and organizations properly i.e. the ones that have a chance to match should be in the same *bucket*. To guarantee this, the affiliation matching module allows to create different methods of dividing the affiliations and organizations into *buckets*, and to use all of these methods in a single matching process. The specific method of grouping the affiliations and organizations into *bucket* and then joining them into pairs is carried out by the service called *Joiner*. + +Every *joiner* can be linked with many different *voters* that will tell if the affiliation-organization pairs joined match or not. 
By providing new *joiners* and *voters* one can extend the matching algorithm with countless new methods for matching affiliations with organizations, thus adjusting the algorithm to his or her needs. + +All the affiliations and organizations are sequentially computed by all the *matchers*. In every *matcher* they are grouped by some *joiner* in pairs, and then these pairs are processed by all the *voters* in the *matcher*. Every affiliation-organization pair that has been matched at least once is assigned the match strength that depends on the match strengths of the *voters* that indicated that the given pair is a match. + +**NOTE:** There can be many organizations matched with a given affiliation, each of them matched with a different match strength. The user of the module can set a match strength threshold which will limit the results to only those matches that have the match strength greater than the specified threshold. + +*Calculation of the match strength of the affiliation-organization pair matched by multiple matchers* + +It often happens that the given affiliation-organization pair is returned as a match by more than one matcher, each time with a different match strength. In such a case **the match with the highest match strength will be selected**. + +*Calculation of the match strength of the affiliation-organization pair within a single matcher* + +Every voter has a match strength that is in the range (0, 1]. **The voter match strength says what the quotient of correct matches to all matches guessed by this voter is, and is based on real data and hundreds of matches prepared by hand.** + +The match strength of the given affiliation-organization pair is based on the match strengths of all the voters in the matcher that have indicated that the pair is a match. It will always be less than or equal to 1 and greater than the match strength of each single voter that matched the given pair.
+ +The total match strength is calculated in such a way that each consecutive voter reduces (by its match strength) the gap of uncertainty about the correctness of the given match. + +***Parameters:*** + +* input + * input_document_metadata: [ExtractedDocumentMetadata](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/metadataextraction/ExtractedDocumentMetadata.avdl) avro datastore location. Document metadata is the source of affiliations. + * input_organizations: [Organization](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/importer/Organization.avdl) avro datastore location. + * input_document_to_project: [DocumentToProject](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/importer/DocumentToProject.avdl) avro datastore location with **imported** document-to-project relations. These relations (alongside with inferred document-project and project-organization relations) are used to generate document-organization pairs which are used as a hint for matching affiliations. + * input_inferred_document_to_project: [DocumentToProject](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/referenceextraction/project/DocumentToProject.avdl) avro datastore location with **inferred** document-to-project relations. + * input_project_to_organization: [ProjectToOrganization](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/importer/ProjectToOrganization.avdl) avro datastore location. 
These relations (alongside with inferred and imported document-project relations) are used to generate document-organization pairs which are used as a hint for matching affiliations +* output + * [MatchedOrganization](https://github.com/openaire/iis/blob/master/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/model/MatchedOrganization.avdl) avro datastore location with matched publications with organizations. + +***Limitations:*** - + +***Environment:*** +Java, Spark + +***References:*** - + +***Authority:*** ICM • ***License:*** AGPL-3.0 • ***Code:*** [CoAnSys/affiliation-organization-matching](https://github.com/CeON/CoAnSys/tree/master/affiliation-organization-matching) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/citation_matching.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/citation_matching.md new file mode 100644 index 0000000..01fcf37 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/citation_matching.md @@ -0,0 +1,41 @@ +# Citation matching + +***Short description:*** During a citation matching task, bibliographic entries are linked to the documents that they reference. The citation matching module - one of the modules of the Information Inference Service (IIS) - receives as an input a list of documents accompanied by their metadata and bibliography. Among them, it discovers links described above and returns them as a list. In this document we shall evaluate if the module has been properly integrated with the whole +system and assess the accuracy of the algorithm used. It is worth mentioning that the implemented algorithm has been described in detail in arXiv:1303.6906 [cs.IR]. However, in the referenced paper the algorithm was tested on small datasets, but here we will focus on larger datasets, which are expected to be analysed by the system in the production environment.
+ +***Algorithmic details:*** + +*General description* + +The algorithm used in the citation matching task consists of two phases. In the first one, for each citation string a set of potentially matching documents is retrieved using a heuristic. In the second one, the metadata of these documents is analysed in order to assess which of them is the most similar to the given citation. We assume that citations are parsed, i.e. fragments containing meaningful pieces of metadata information are marked in a special way. Note that in the IIS system, the citation parsing step is executed by another module. The following metadata fields are used by the described solution: + +* an author, +* a title, +* a journal name, +* pages, +* a year of publication. + +*Heuristic matching* + +The heuristic is based on indexing of document metadata by their author names. For each citation we extract author names and try to find documents in the index which have the same author entries. As spelling errors and inaccuracies commonly occur in citations, we have implemented an approximate index which enables retrieval of entities with edit distance less than or equal to 1. + +*Strict matching* + +In this step, all the potentially matching pairs obtained in the heuristic step are evaluated and only the most probable ones are returned as the final result. As citations tend to contain spelling errors and differ in style, there is a need to introduce fuzzy similarity measures fitted to the specifics of various metadata fields. Most of them compute a fraction of tokens or trigrams that occur in both fields being compared. When comparing journal +names, we have taken the longest common subsequence (LCS) of two strings into consideration. This can be seen as an instance of the assignment problem with some refinements added. The overall similarity of two citation strings is obtained by applying a linear Support Vector Machine (SVM) using field similarities as features.
+ +***Parameters:*** + +* input: + * input_metadata: [ExtractedDocumentMetadataMergedWithOriginal](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/transformers/metadatamerger/ExtractedDocumentMetadataMergedWithOriginal.avdl) avro datastore location with the metadata of both publications and bibliographic references to be matched + * input_matched_citations: [Citation](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/common/citations/Citation.avdl) avro datastore location with citations which were already matched and should be excluded from fuzzy matching +* output: [Citation](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/common/citations/Citation.avdl) avro datastore location with matched publications + +***Limitations:*** - + +***Environment:*** +Java, Spark + +***References:*** - + +***Authority:*** ICM • ***License:*** AGPL-3.0 • ***Code:*** [CoAnSys/citation-matching](https://github.com/CeON/CoAnSys/tree/master/citation-matching) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/cites.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/cites.md new file mode 100644 index 0000000..f7d8158 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/cites.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 4 +--- + +# Extraction of cited concepts + +***Short description:*** Scans the plaintexts of publications for cited concepts, currently for references to datasets and software URIs. + +***Algorithmic details:*** +The algorithm extracts citations to specific datasets and software. It extracts the citation section of a publication's fulltext and applies string matching against a target database which includes an inverted index with dataset/software titles, urls and other metadata.
+ +***Parameters:*** +Title, URL, creator names, publisher names and publication year for each concept to create the target database. Identifier and publication's fulltext to extract the cited concepts + +***Limitations:*** - + +***Environment:*** +Python, [madIS](https://github.com/madgik/madis), [APSW](https://github.com/rogerbinns/apsw) + +***References:*** +* Foufoulas Y., Stamatogiannakis L., Dimitropoulos H., Ioannidis Y. (2017) “High-Pass Text Filtering for Citation Matching”. In: Kamps J., Tsakonas G., Manolopoulos Y., Iliadis L., Karydis I. (eds) Research and Advanced Technology for Digital Libraries. TPDL 2017. Lecture Notes in Computer Science, vol 10450. Springer, Cham. [doi:10.1007/978-3-319-67008-9_28](https://doi.org/10.1007/978-3-319-67008-9_28) + +***Authority:*** ATHENA RC • ***License:*** CC-BY/CC-0 • ***Code:*** [iis/referenceextraction](https://github.com/openaire/iis/tree/master/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/classifies.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/classifies.md new file mode 100644 index 0000000..ed00155 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/classifies.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 5 +--- + +# Classifiers + +***Short description:*** A document classification algorithm that employs analysis of free text stemming from the abstracts of the publications. The purpose of applying a document classification module is to assign a scientific text to one or more predefined content classes. + +***Algorithmic details:*** +The algorithm classifies publication's fulltexts using a Bayesian classifier and weighted terms according to an offline training phase. 
The training has been done using the following taxonomies: arXiv, MeSH (Medical Subject Headings), ACM, and DDC (Dewey Decimal Classification, or Dewey Decimal System). + +***Parameters:*** Publication's identifier and fulltext + +***Limitations:*** - + +***Environment:*** +Python, [madIS](https://github.com/madgik/madis), [APSW](https://github.com/rogerbinns/apsw) + +***References:*** +* Giannakopoulos, T., Stamatogiannakis, E., Foufoulas, I., Dimitropoulos, H., Manola, N., Ioannidis, Y. (2014). Content Visualization of Scientific Corpora Using an Extensible Relational Database Implementation. In: Bolikowski, Ł., Casarosa, V., Goodale, P., Houssos, N., Manghi, P., Schirrwagen, J. (eds) Theory and Practice of Digital Libraries -- TPDL 2013 Selected Workshops. TPDL 2013. Communications in Computer and Information Science, vol 416. Springer, Cham. [doi:10.1007/978-3-319-08425-1_10](https://doi.org/10.1007/978-3-319-08425-1_10) + +***Authority:*** ATHENA RC • ***License:*** CC-BY/CC-0 • ***Code:*** [iis/referenceextraction](https://github.com/openaire/iis/tree/master/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/documents_similarity.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/documents_similarity.md new file mode 100644 index 0000000..1e02b95 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/documents_similarity.md @@ -0,0 +1,48 @@ +# Documents similarity + +***Short description:*** Document similarity module is responsible for finding similar documents among the ones available in the OpenAIRE Information Space. It produces "similarity" links between the documents stored in the OpenAIRE Information Space. 
Each link has a similarity score from [0,1] range assigned; it is expected that the higher the score, the more similar are the documents with respect to their content. + +***Algorithmic details:*** +The similarity between two documents is expressed as the similarity between weights of their common terms (i.e., words being reduced to their root form) within a context of all terms from the first and the second document. In this approach, the computation can be divided into three consecutive steps: + +1. selection of proper terms, +2. calculation of weights of terms for each document, +3. calculation of a given similarity function on weights of terms corresponding to each pair of documents. +   +The document similarity module uses the term frequency inverse-document frequency (TFIDF) measure and the cosine similarity to produce weights for terms and calculate their similarity respectively. + +*Steps of execution* + +Computation of similarity between documents is executed in the following steps. + +1. First, we create a text representation of each document. The text is a concatenation of 3 attributes of document object coming from Information Space: title, abstract, and keywords. +2. Text representation of each document is split into words. Next, stop words or words which occur in more than the N percent of documents (say 99%) or these occurring in less than M documents (say 5) are discarded as we assume that they carry no important information. +3. Next, the words are stemmed (reduced to their root form) and thus converted to terms. The importance of each term in each document is calculated using TFIDF measure (resulting in a vector of weights of terms for each document). Only the top P (say 20) important terms per documents remain for the further computations. +4. In order to calculate the cosine similarity value for the documents, we execute the following steps. + a. 
Triples [document id, term, term weight] are grouped by a common term and for each pair of triples from the group, term importance is recalculated as the multiplication of terms weights, producing quads [document id 1, document id 2, term, multiplied term weight]. + b. Quads are grouped by [document id 1, document id 2] and the values of the multiplied term weight are summed up, resulting in the creation of triples [document id 1, document id 2, total common weight]. + c. Finally, triples are normalized using product of the norm of the term weights' vectors. The normalized value is the final similarity measure with value between 0 and 1. +5. For a given document, only the top R (say 20) links to similar documents are returned. The links that are thrown away are assumed to be uninteresting for the end-user and thus storing them would only needlessly take disk space. + +***Parameters:*** +* input: + * input_document: [DocumentMetadata](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/documentssimilarity/DocumentMetadata.avdl) avro datastore location + * parallel: sets parameter parallel for Pig actions (default=80) + * mapredChildJavaOpts: mapreduce's map and reduce child java opts set to all PIG actions (default=Xmx12g) + * tfidfTopnTermPerDocument: number of the most important terms taken into account (default=20) + * similarityTopnDocumentPerDocument: maximum number of similar documents for each publication (default=20) + * removal_rate: removal rate (default=0.99) + * removal_least_used: removal of the least used terms (default=20) + * threshold_num_of_vector_elems_length: vector elements length threshold, when set to less than 2 all documents will be included in similarity matching (default=2) +* output: [DocumentSimilarity](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/documentssimilarity/DocumentSimilarity.avdl) avro datastore location + +***Limitations:*** - + +***Environment:*** +Pig, 
Java + +***References:*** + +* P. J. Dendek, A. Czeczko, M. Fedoryszak, A. Kawa, and L. Bolikowski, "Content Analysis of Scientific Articles in Apache Hadoop Ecosystem", Stud. Comp.Intelligence, vol. 541, 2014. + +***Authority:*** ICM • ***License:*** AGPL-3.0 • ***Code:*** [CoAnSys/document-similarity](https://github.com/CeON/CoAnSys/tree/master/document-similarity) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md new file mode 100644 index 0000000..1561699 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md @@ -0,0 +1,18 @@ +import DocCardList from '@theme/DocCardList'; + + +# Enrichment by mining + +**OpenAIRE** collects the full-texts of the publications, in order to apply TDM (Text and Data Mining) algorithms on them and enrich the Graph with inference links. + +The collection of the full-texts is handled by the internal **PDF Aggregation Service**. This service uses the publications' urls, from the OpenAIRE Graph and state-of-the-art algorithms, to crawl the web and try to locate and download the full-texts of the open access publications, while focusing on the most recent ones. It respects the servers of the repositories and publishers and avoids overloading them. + +The service is orchestrating a distributed execution system, on the cloud, with multiple microservices running in parallel, in order to efficiently process and download a large number of publications. The microservices store the generated report records for the publications, in a database, and the full-texts in an S3 Object Store. + +On the publication-page level, it applies text-mining algorithms to analyze the structure of the page, extract the full-text url and download the file. 
Additionally, it tracks various performance indicators to optimize the crawling speed, during execution. + +The PDF Aggregation Service is also capable of bulk-importing full-texts from compatible data sources, which increases the collection speed of full-texts. + +The different Text and Data Mining (TDM) algorithms used in the graph-enrichment process are grouped in the following categories. + + \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/img.png b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/img.png new file mode 100644 index 0000000..d77d197 Binary files /dev/null and b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/img.png differ diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/metadata_extraction.md b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/metadata_extraction.md new file mode 100644 index 0000000..4ade667 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/enrichment-by-mining/metadata_extraction.md @@ -0,0 +1,36 @@ +# Metadata extraction + +***Short description:*** Metadata Extraction algorithm is responsible for plaintext and metadata extraction out of the PDF documents. It is based on the [CERMINE](http://cermine.ceon.pl/about.html) project. + +CERMINE is a comprehensive open source system for extracting metadata and content from scientific articles in born-digital form. The system is able to process documents in PDF format and extracts: + +* document's metadata, including title, authors, affiliations, abstract, keywords, journal name, volume and issue, +* parsed bibliographic references +* the structure of document's sections, section titles and paragraphs + +CERMINE is based on a modular workflow, whose architecture ensures that individual workflow steps can be maintained separately.
As a result it is easy to evaluate, train, improve or replace the implementation of one step without changing other parts of the workflow. Most step implementations utilize supervised and unsupervised machine-learning techniques, which increases the maintainability of the system, as well as its ability to adapt to new document layouts. + +***Algorithmic details:*** +CERMINE workflow is composed of four main parts: + +* Basic structure extraction takes a PDF file on the input and produces a geometric hierarchical structure representing the document. The structure is composed of pages, zones, lines, words and characters. The reading order of all elements is determined. Every zone is labelled with one of four general categories: METADATA, REFERENCES, BODY and OTHER. +* Metadata extraction part analyses parts of the geometric hierarchical structure labelled as METADATA and extracts a rich set of document's metadata from it. +* References extraction part analyses parts of the geometric hierarchical structure labelled as REFERENCES and the result is a list of document's parsed bibliographic references. +* Text extraction part analyses parts of the geometric hierarchical structure labelled as BODY and extracts document's body structure composed of sections, subsections and paragraphs. + +CERMINE uses supervised and unsupervised machine-learning techniques, such as Support Vector Machines, K-means clustering and Conditional Random Fields. Content classifiers are trained on [GROTOAP2 dataset](http://cermine.ceon.pl/grotoap2/). More information about CERMINE can be found in the [presentation](http://cermine.ceon.pl/static/docs/slides.pdf).
+ +***Parameters:*** +* input: [DocumentText](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/metadataextraction/DocumentText.avdl) avro datastore location +* output: [ExtractedDocumentMetadata](https://github.com/openaire/iis/blob/master/iis-schemas/src/main/avro/eu/dnetlib/iis/metadataextraction/ExtractedDocumentMetadata.avdl) avro datastore location + +***Limitations:*** +Only born-digital PDF documents are supported. Large PDF documents may require more than 4g of assigned memory (set by default). + +***Environment:*** +Java, Hadoop + +***References:*** +* Dominika Tkaczyk, Pawel Szostek, Mateusz Fedoryszak, Piotr Jan Dendek and Lukasz Bolikowski. CERMINE: automatic extraction of structured metadata from scientific literature. In International Journal on Document Analysis and Recognition, 2015, vol. 18, no. 4, pp. 317-335, doi: 10.1007/s10032-015-0249-8. + +***Authority:*** ICM • ***License:*** AGPL-3.0 • ***Code:*** [CERMINE](https://github.com/CeON/CERMINE) diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/finalisation.md b/versioned_docs/version-6.0.0/graph-production-workflow/finalisation.md new file mode 100644 index 0000000..06548dc --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/finalisation.md @@ -0,0 +1,18 @@ +# Finalisation + +At the very end of the graph production workflow, a step is dedicated to perform certain finalisation operations, that we describe in this page, +aiming to improve the overall quality of the data. +The output of this final step is the final version of the OpenAIRE Graph. + +## Filtering + +Bibliographic records that do not meet minimal requirements for being part of the OpenAIRE Graph are eliminated during this phase. +Currently, the only criterion applied horizontally to the entire graph aims at excluding scientific results whose title is not meaningful for citation purposes.
+Then, different criteria are applied in the pre-processing of specific sub-collections: + +* [Crossref filtering](/graph-production-workflow/aggregation/non-compatible-sources/doiboost#crossref-filtering) + +## Country cleaning + +This phase is responsible for removing the country information from result records that match specific criteria. The need for this phase is driven by the fact that some datasources, although regarded as being of national pertinence, contain material that is not always related to the given country. + diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/graph-production-workflow.md b/versioned_docs/version-6.0.0/graph-production-workflow/graph-production-workflow.md new file mode 100644 index 0000000..503611a --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/graph-production-workflow.md @@ -0,0 +1,8 @@ +# Graph production workflow + +OpenAIRE collects metadata records from more than 70K scholarly communication sources from all over the world, including Open Access institutional repositories, data archives, journals. All the metadata records (i.e. descriptions of research products) are put together in a data lake, together with records from Crossref, Unpaywall, ORCID, ROR, and information about projects provided by national and international funders. Dedicated inference algorithms applied to metadata and to the full-texts of Open Access publications enrich the content of the data lake with links between research results and projects, author affiliations, subject classification, links to entries from domain-specific databases. Duplicated organisations and results are identified and merged together to obtain an open, trusted, public resource enabling explorations of the scholarly communication landscape like never before. + +

+ Data provision +

+ diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/indexing.md b/versioned_docs/version-6.0.0/graph-production-workflow/indexing.md new file mode 100644 index 0000000..a2eac4a --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/indexing.md @@ -0,0 +1,17 @@ +# Indexing + +The final version of the OpenAIRE Graph is indexed on a Solr server that is used by the OpenAIRE portals ([EXPLORE](https://explore.openaire.eu), [CONNECT](https://connect.openaire.eu), [PROVIDE](https://provide.openaire.eu)) and APIs, the latter adopted by several third-party applications and organizations, such as: + +* The OpenAIRE Graph APIs and Portals will offer to the EOSC (European Open Science Cloud) an Open Science Resource Catalogue, keeping an up to date map of all research results (publications, datasets, software), services, organizations, projects, funders in Europe and beyond. + +* DSpace & EPrints repositories can install the OpenAIRE plugin to expose OpenAIRE compliant metadata records via their OAI-PMH endpoint and offer to researchers the possibility to link their depositions to the funding project, by selecting it from the list of project provided by OpenAIRE. + +* EC participant portal (Sygma - System for Grant Management) uses the OpenAIRE API in the “Continuous Reporting” section. Sygma automatically fetches from the OpenAIRE Search API the list of publications and datasets in the OpenAIRE Graph that are linked to the project. The user can select the research products from the list and easily compile the continuous reporting data of the project. + +* ScholExplorer is used by different players of the scholarly communication ecosystem. For example, [Elsevier](https://www.elsevier.com/authors/tools-and-resources/research-data/data-base-linking) uses its API to make the links between +publications and datasets automatically appear on ScienceDirect. 
+ScholExplorer indexes the links among the four major types of research products (API v3) available in the OpenAIRE Graph and makes them available through an HTTP API that allows +to search them by the following criteria: + * Links whose source object has a given PID or PID type; + * Links whose source object has been published by a given data source ("data source as publisher"); + * Links that were collected from a given data source ("data source as provider"). diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/impact-indicators.md b/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/impact-indicators.md new file mode 100644 index 0000000..a1bfdc3 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/impact-indicators.md @@ -0,0 +1,170 @@ +# Impact indicators + +This page summarises all calculated impact indicators, provided by [BIP!](https://bip.imsi.athenarc.gr/), which are included in the [bipIndicators](/data-model/entities/other#bipindicators) property (found under the [indicators](/data-model/entities/result#indicators) property of the result). + +It should be noted that the impact indicators are being calculated on the level of the research output. +Below we explain their main intuition, the way they are calculated, and their most important limitations, in an attempt to help avoid common pitfalls and misuses. + + +## Citation Count (CC) • influence_alt + +***Short description:*** +This is the most widely used scientific impact indicator, which sums all citations received by each article. +Citation count can be viewed as a measure of a publication's overall impact, since it conveys the number of other works that directly +drew on it.
+ +***Algorithmic details:*** +The citation count of a +publication $i$ corresponds to the in-degree of the corresponding node in the underlying citation network: $s_i = \sum_{j} A_{i,j}$, +where $A$ is the adjacency matrix of the network (i.e., $A_{i,j}=1$ when paper $j$ cites paper $i$, while $A_{i,j}=0$ otherwise). + +***Parameters:*** - + +***Limitations:*** +OpenAIRE collects data from specific data sources which means that part of the existing literature may not be considered when computing this indicator. +Also, since some indicators require the publication year for their calculation, we consider only research products for which we can gather this information from at least one data source. + +***Environment:*** PySpark + +***References:*** - + +***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) + + +## "Incubation" Citation Count (iCC) • impulse + +***Short description:*** +This measure is essentially a time-restricted version of the citation count, where the time window is distinct for each paper, i.e., +only citations $y$ years after its publication are counted. + +***Algorithmic details:*** +The "incubation" citation count of a paper $i$ is +calculated as: $s_i = \sum_{j,t_j \leq t_i+y} A_{i,j}$, where $A$ is the adjacency matrix and $t_j, t_i$ are the citing and cited paper's +publication years, respectively. $t_i$ is cited paper $i$'s publication year. iCC can be seen as an indicator of a paper's initial momentum +(impulse) directly after its publication. + +***Parameters:*** +$y=3$ + +***Limitations:*** +OpenAIRE collects data from specific data sources which means that part of the existing literature may not be considered when computing this indicator. +Also, since some indicators require the publication year for their calculation, we consider only research products for which we can gather this information from at least one data source. 
+ +***Environment:*** PySpark + +***References:*** +* Vergoulis, T., Kanellos, I., Atzori, C., Mannocci, A., Chatzopoulos, S., Bruzzo, S. L., Manola, N., & Manghi, P. (2021, April). Bip! db: A dataset of impact measures for scientific publications. In Companion Proceedings of the Web Conference 2021 (pp. 456-460). + +***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) + + +## PageRank (PR) • influence + +***Short description:*** +Originally developed to rank Web pages, PageRank has been also widely used to rank publications in citation +networks. In this latter context, a publication's PageRank +score also serves as a measure of its influence. + +***Algorithmic details:*** +The PageRank score of a publication is calculated +as its probability of being read by a researcher that either randomly selects publications to read or selects +publications based on the references of her latest read. Formally, the score of a publication $i$ is given by: + +$$ +s_i = \alpha \cdot \sum_{j} P_{i,j} \cdot s_j + (1-\alpha) \cdot \frac{1}{N} +$$ + +where $P$ is the stochastic transition matrix, which corresponds to the column normalised version of adjacency +matrix $A$, $\alpha \in [0,1]$, and $N$ is the number of publications in the citation network. The first addend +of the equation corresponds to the selection (with probability $\alpha$) of following a reference, while the +second one to the selection of randomly choosing any publication in the network. It should be noted that the +score of each publication relies on the score of publications citing it (the algorithm is executed iteratively +until all scores converge). As a result, PageRank differentiates citations based on the importance of citing +articles, thus alleviating the corresponding issue of the Citation Count.
+ +***Parameters:*** +$\alpha = 0.5, convergence\_error = 10^{-12}$ + +***Limitations:*** +OpenAIRE collects data from specific data sources which means that part of the existing literature may not be considered when computing this indicator. +Also, since some indicators require the publication year for their calculation, we consider only research products for which we can gather this information from at least one data source. + +***Environment:*** PySpark + +***References:*** +* Page, L., Brin, S., Motwani, R., & Winograd, T. (1999). The PageRank citation ranking: Bringing order to the web. Stanford InfoLab. + +***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) + + +## RAM • popularity_alt + +***Short description:*** +RAM is essentially a modified Citation Count, where recent citations are considered of higher importance compared to older ones. +Hence, it better captures the popularity of publications. This "time-awareness" of citations +alleviates the bias of methods like Citation Count and PageRank against recently published articles, which have +not had "enough" time to gather as many citations. + +***Algorithmic details:*** +The RAM score of each paper $i$ is calculated as follows: + +$$ +s_i = \sum_j{R_{i,j}} +$$ + +where $R$ is the so-called Retained Adjacency Matrix (RAM) and $R_{i,j}=\gamma^{t_c-t_j}$ when publication $j$ cites publication +$i$, and $R_{i,j}=0$ otherwise. Parameter $\gamma \in (0,1)$, $t_c$ corresponds to the current year and $t_j$ corresponds to the +publication year of citing article $j$. + +***Parameters:*** +$\gamma = 0.6$ + +***Limitations:*** +OpenAIRE collects data from specific data sources which means that part of the existing literature may not be considered when computing this indicator. 
+Also, since some indicators require the publication year for their calculation, we consider only research products for which we can gather this information from at least one data source. + +***Environment:*** PySpark + +***References:*** +* Ghosh, R., Kuo, T. T., Hsu, C. N., Lin, S. D., & Lerman, K. (2011, December). Time-aware ranking in dynamic citation networks. In 2011 IEEE 11th International Conference on Data Mining Workshops (pp. 373-380). IEEE. + +***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) + + +## AttRank • popularity + +***Short description:*** +AttRank is a PageRank variant that alleviates its bias against recent publications (i.e., it is tailored to capture popularity). +AttRank achieves this by modifying PageRank's probability of randomly selecting a publication. Instead of using a uniform probability, +AttRank defines it based on a combination of the publication's age and the citations it received in recent years. + +***Algorithmic details:*** +The AttRank score +of each publication $i$ is calculated based on: + +$$ +s_i = \alpha \cdot \sum_{j} P_{i,j} \cdot s_j + + \beta \cdot Att(i)+ \gamma \cdot c \cdot e^{-\rho \cdot (t_c-t_i)} +$$ + +where $\alpha + \beta + \gamma =1$ and $\alpha,\beta,\gamma \in [0,1]$. $Att(i)$ denotes a recent attention-based score for publication $i$, +which reflects its share of citations in the $y$ most recent years, $t_i$ is the publication year of article $i$, $t_c$ denotes the current +year, and $c$ is a normalisation constant. Finally, $P$ is the stochastic transition matrix. + +***Parameters:*** +$\alpha = 0.2, \beta = 0.5, \gamma = 0.3, \rho = -0.16, convergence\_error = 10^{-12}$ + +Note that recent attention is based on the 3 most recent years (including the current one). 
+ +***Limitations:*** +OpenAIRE collects data from specific data sources which means that part of the existing literature may not be considered when computing this indicator. +Also, since some indicators require the publication year for their calculation, we consider only research products for which we can gather this information from at least one data source. + +***Environment:*** PySpark + +***References:*** +* Kanellos, I., Vergoulis, T., Sacharidis, D., Dalamagas, T., & Vassiliou, Y. (2021, April). Ranking papers by their short-term scientific impact. In 2021 IEEE 37th International Conference on Data Engineering (ICDE) (pp. 1997-2002). IEEE. + +***Authority:*** ATHENA RC • ***License:*** GPL-2.0 • ***Code:*** [BIP! Ranker](https://github.com/athenarc/Bip-Ranker) + + \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/indicators-ingestion.md b/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/indicators-ingestion.md new file mode 100644 index 0000000..4402287 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/indicators-ingestion.md @@ -0,0 +1,8 @@ +import DocCardList from '@theme/DocCardList'; + +# Indicators ingestion + +In this step, results are enriched with Impact and Usage Statistics indicators. +The former are provided by [BIP!](https://bip.imsi.athenarc.gr/) while the latter are computed by OpenAIRE's [UsageCounts service](https://usagecounts.openaire.eu/). 
+ + \ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/usage-counts.md b/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/usage-counts.md new file mode 100644 index 0000000..b1a86bd --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/indicators-ingestion/usage-counts.md @@ -0,0 +1,7 @@ +# Usage Statistics indicators + +Usage Statistics indicators for research products, like publications, datasets, etc., are an important complement to other (traditional and alternative) bibliometric indicators: they provide a comprehensive and recent view of the impact of such resources, as well as of their authors, institutions and the platforms themselves. They take into account different levels of information: the usage of data sources, the usage of individual items in the context of their resource type and the usage of individual web resources or files. + +Usage Statistics indicators are built by OpenAIRE's UsageCounts service. The service collects usage data and consolidated usage statistics reports from its distributed network of data providers (repositories, e-journals, CRIS) by utilizing open standards and protocols, and delivers reliable, consolidated and comparable usage metrics, like counts of item downloads and metadata views, conformant to the COUNTER Code of Practice. + +You can find more information about the UsageCounts service [here](https://usagecounts.openaire.eu/). 
\ No newline at end of file diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/merge-by-id.md b/versioned_docs/version-6.0.0/graph-production-workflow/merge-by-id.md new file mode 100644 index 0000000..72c7ebd --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/merge-by-id.md @@ -0,0 +1,28 @@ +# Merge by id + +In the metadata aggregation system it is common to find the same record provided by +different datasources and, sometimes, even inside the same datasource (especially in +case of aggregators). As the harmonisation processes are performed per datasource +contents, the relative records are the output of different mapping implementations. +This approach has the advantage of being deeply customisable to catch datasource-specific +aspects, but it leaves room for inconsistencies when evaluating the different mappings +across the various datasources. + +This phase is therefore responsible for compensating for such inconsistencies and performs +a global grouping of every record available in the graph: + +- entities are grouped by [`id`](../data-model/entities/result#id) +- relations are grouped by [`source`, `target`, `reltype`](../data-model/relationships/relationship-object) + +This ensures that the same record, possibly assigned to different types by different +mappings, appears only once in the graph and under a single typing. In case of clashing +identifiers, the properties are merged (including the provenance information), considering +the following precedence order for the result typing: + +``` +publication > dataset > software > other +``` + +The same holds for relationships: as the same (e.g.) DOI-to-DOI citation relation could +be aggregated from multiple sources, this grouping phase collapses all the different +duplicates onto a single relation that nevertheless includes all the individual provenances. 
diff --git a/versioned_docs/version-6.0.0/graph-production-workflow/stats.md b/versioned_docs/version-6.0.0/graph-production-workflow/stats.md new file mode 100644 index 0000000..9d0de86 --- /dev/null +++ b/versioned_docs/version-6.0.0/graph-production-workflow/stats.md @@ -0,0 +1,12 @@ +# Stats analysis + +The OpenAIRE Graph is also processed by a pipeline for extracting the statistics +and producing the charts for funders, research initiatives, research infrastructures, +and policymakers available on [MONITOR](https://monitor.openaire.eu). + +Based on the information available in the graph, OpenAIRE provides a set of +indicators for monitoring the funding and research impact and the uptake of +Open Science publishing practices, such as Open Access publishing of publications +and datasets, availability of interlinks between research products, availability +of post-print versions in institutional or thematic Open Access repositories, etc. + diff --git a/versioned_docs/version-6.0.0/intro.md b/versioned_docs/version-6.0.0/intro.md new file mode 100644 index 0000000..5bbf407 --- /dev/null +++ b/versioned_docs/version-6.0.0/intro.md @@ -0,0 +1,34 @@ +--- +slug: / +id: intro +sidebar_position: 1 +--- + +# Overview + +The [OpenAIRE Graph](https://graph.openaire.eu/) (formerly known as the OpenAIRE Research Graph) is one of the largest open scholarly record collections worldwide, key in fostering Open Science and establishing its practices in daily research activities. +Conceived as a public and transparent good, populated out of data sources trusted by scientists, the Graph aims at bringing discovery, monitoring, and assessment of science back in the hands of the scientific community. + +Imagine a vast collection of research products all linked together, contextualised and openly available. For the past years OpenAIRE has been working to gather this valuable record. 
It is a massive collection of metadata and links between scientific products such as articles, datasets, software, and other research products, entities like organisations, funders, funding streams, projects, communities, and data sources. + +The OpenAIRE Graph aggregates millions of metadata records collected from trusted data sources, including: + +* Open Access journals registered in DOAJ +* Crossref +* Unpaywall +* ORCID +* Microsoft Academic Graph +* Datacite + +And repositories registered in OpenDOAR, re3data.org, FAIRSharing.org, and the EOSC Service Catalogue. Among these are prominent repositories such as: + +* UKPubMed +* ArXiv +* HAL +* Zenodo +* Figshare +* Dryad +* Repec + +After cleaning, deduplication, enrichment and full-text mining processes, the graph is analysed to produce statistics for the [OpenAIRE MONITOR](https://monitor.openaire.eu) and the [Open Science Observatory](https://osobservatory.openaire.eu), made discoverable via [OpenAIRE EXPLORE](https://explore.openaire.eu), and made programmatically accessible via the [OpenAIRE Public APIs](https://develop.openaire.eu). +Last but not least, the Graph data are openly available and can be used by third parties to create added-value services. diff --git a/versioned_docs/version-6.0.0/license.md b/versioned_docs/version-6.0.0/license.md new file mode 100644 index 0000000..b55436d --- /dev/null +++ b/versioned_docs/version-6.0.0/license.md @@ -0,0 +1,10 @@ +--- +sidebar_position: 11 +--- + +# License + +The OpenAIRE Graph is available for download and re-use as CC-BY (due to some input sources whose license is CC-BY). Parts of the graph can be re-used as CC-0. + +If you are using data from the OpenAIRE Graph, please find the appropriate way to acknowledge this [here](downloads/full-graph#how-to-acknowledge-this-work). 
+ diff --git a/versioned_docs/version-6.0.0/publications.md b/versioned_docs/version-6.0.0/publications.md new file mode 100644 index 0000000..c35ac13 --- /dev/null +++ b/versioned_docs/version-6.0.0/publications.md @@ -0,0 +1,80 @@ +--- +sidebar_position: 7 +--- + +# Relevant publications + +Open Science services are open and transparent and survive thanks to your active support and to the visibility and reward they gather. If you use one of the [OpenAIRE Graph dumps](https://doi.org/10.5281/zenodo.3516917) for your research, please provide a proper citation following the recommendation that you find on the dump's Zenodo page or as provided below. + +:::note How to cite + +Manghi P., Atzori C., Bardi A., Baglioni M., Schirrwagen J., Dimitropoulos H., La Bruzzo S., Foufoulas I., Mannocci A., Horst M., Czerniak A., Iatropoulou K., Kokogiannaki A., De Bonis M., Artini M., Lempesis A., Ioannidis A., Manola N., Principe P., Vergoulis T., Chatzopoulos S., Pierrakos D. (2022). "OpenAIRE Research Graph Dump", *Dataset*, Zenodo. [doi:10.5281/zenodo.3516917](https://doi.org/10.5281/zenodo.3516917) ([BibTex](/bibtex/OpenAIRE_Research_Graph_dump.bib)) +::: + +## Other relevant research products + +Please also consider citing the related research products listed below. + +### Aggregation system + +Manghi P., Artini M., Atzori C., Bardi A., Mannocci A., La Bruzzo S., Candela L., Castelli D., Pagano P. (2014). "The D-NET software toolkit: A framework for the realization, maintenance, and operation of aggregative infrastructures", Program: electronic library and information systems, Vol. 48 No. 4, pp. 322-354. [doi:10.1108/prog-08-2013-0045](http://doi.org/10.1108/prog-08-2013-0045) + +Atzori C., Bardi A., Manghi P., Mannocci A. (2017). "The OpenAIRE workflows for data management", In Italian Research Conference on Digital Libraries (IRCDL), pp. 95-107, Springer, Cham. 
[doi:10.1007/978-3-319-68130-6_8](https://doi.org/10.1007/978-3-319-68130-6_8) + +Artini M., Atzori C., Bardi A., La Bruzzo S., Manghi P., Mannocci A. (2016). "The D-NET software toolkit: dnet-basic-aggregator (Version 1.3.0)". *Software*, Zenodo. [doi:10.5281/zenodo.168356](https://doi.org/10.5281/zenodo.168356) + +Mannocci A., Manghi P. (2016). "DataQ: a data flow quality monitoring system for aggregative data infrastructures", International Conference on Theory and Practice of Digital Libraries (TPDL), pp. 357-369, Springer, Cham. [doi:10.1007/978-3-319-43997-6_28](https://doi.org/10.1007/978-3-319-43997-6_28) + +### Deduplication + +Vichos K., De Bonis M., Kanellos I., Chatzopoulos S., Atzori C., Manola N., Manghi P., Vergoulis T. (2022). "A preliminary assessment of the article deduplication algorithm used for the OpenAIRE Research Graph", In Italian Research Conference on Digital Libraries (IRCDL), Padua, Italy, CEUR-WS Proceedings. [http://ceur-ws.org/Vol-3160](http://ceur-ws.org/Vol-3160/) + +De Bonis M., Manghi P., Atzori C. (2022). "FDup: a framework for general-purpose and efficient entity deduplication of record collections", PeerJ Computer Science, 8, e1058. [https://peerj.com/articles/cs-1058](https://peerj.com/articles/cs-1058) + +Manghi P., Atzori C., De Bonis M., Bardi, A. (2020). "Entity deduplication in big data graphs for scholarly communication", Data Technologies and Applications. [doi:10.1108/dta-09-2019-0163](https://doi.org/10.1108/dta-09-2019-0163) + + +Atzori C., Manghi P., Bardi, A. (2018). "GDup: de-duplication of scholarly communication big graphs", In 2018 IEEE/ACM 5th International Conference on Big Data Computing Applications and Technologies (BDCAT) (pp. 142-151). IEEE. [doi:10.1109/bdcat.2018.00025](https://doi.org/10.1109/bdcat.2018.00025) + +Atzori C., & Paolo Manghi. (2017). "GDup: a big graph entity deduplication system" (Version 4.0.5), *Software*, Zenodo. 
[doi:10.5281/zenodo.292980](https://doi.org/10.5281/zenodo.292980) + +Atzori C. (2016). "GDup: an Integrated, Scalable Big Graph Deduplication System". [doi:10.5281/zenodo.1454879](https://doi.org/10.5281/zenodo.1454879) + +Manghi P., Mikulicic M., Atzori C. (2012). "De-duplication of aggregation authority files." International Journal of Metadata, Semantics and Ontologies 7.2: 114-130. [doi:10.1504/ijmso.2012.050014](https://doi.org/10.1504/ijmso.2012.050014) + +Manghi P., Mikulicic M. (2011). "PACE: A general-purpose tool for authority control", In Research Conference on Metadata and Semantic Research, pp. 80-92, Springer, Berlin, Heidelberg. [doi:10.1007/978-3-642-24731-6_8](https://doi.org/10.1007/978-3-642-24731-6_8) + +### Mining + +Giannakopoulos T., Foufoulas Y., Dimitropoulos H., Manola N. (2019). "Interactive Text Analysis and Information Extraction", In Italian Research Conference on Digital Libraries (IRCDL), vol 988. Springer, Cham. [doi:10.1007/978-3-030-11226-4_27](https://doi.org/10.1007/978-3-030-11226-4_27) + +Foufoulas Y., Stamatogiannakis L., Dimitropoulos H., Ioannidis Y. (2017). "High-Pass Text Filtering for Citation Matching", In International Conference on Theory and Practice of Digital Libraries (TPDL). Springer, Cham. [doi:10.1007/978-3-319-67008-9_28](https://doi.org/10.1007/978-3-319-67008-9_28) + +Chronis Y., Foufoulas Y., Nikolopoulos V., Papadopoulos A., Stamatogiannakis L., Svingos C., Ioannidis Y. E. (2016). "A Relational Approach to Complex Dataflows", In Workshop Proceedings of the EDBT/ICDT 2016 (MEDAL 2016) Joint Conference on CEUR-WS.org (ISSN 1613-0073) [http://ceur-ws.org/Vol-1558/paper45.pdf](http://ceur-ws.org/Vol-1558/paper45.pdf) + +Giannakopoulos T., Foufoulas I., Stamatogiannakis E., Dimitropoulos H., Manola N., Ioannidis Y. (2015). 
"Visual-Based Classification of Figures from Scientific Literature", In Proceedings of the 24th International Conference on World Wide Web (WWW), Association for Computing Machinery, New York, NY, USA, 1059–1060. [doi:10.1145/2740908.2742024](https://doi.org/10.1145/2740908.2742024) + +Giannakopoulos T., Foufoulas I., Stamatogiannakis E., Dimitropoulos H., Manola N., Ioannidis Y. (2014). "Discovering and Visualizing Interdisciplinary Content Classes in Scientific Publications". D-Lib Mag., Volume 20, Number 11/12. [doi:10.1045/november14-giannakopoulos](https://doi.org/10.1045/november14-giannakopoulos) + +Giannakopoulos T., Stamatogiannakis E., Foufoulas I., Dimitropoulos H., Manola N., Ioannidis Y. (2014). "Content Visualization of Scientific Corpora Using an Extensible Relational Database Implementation", International Conference on Theory and Practice of Digital Libraries (TPDL), Springer, Cham. [doi:10.1007/978-3-319-08425-1_10](https://doi.org/10.1007/978-3-319-08425-1_10) + +Giannakopoulos T., Dimitropoulos H., Metaxas O., Manola N., Ioannidis Y. (2013). "Supervised Content Visualization of Scientific Publications: A Case Study on the ArXiv Dataset", Intelligent Information Systems Symposium (IIS) vol 7912, Springer, Berlin, Heidelberg. [doi:10.1007/978-3-642-38634-3_23](https://doi.org/10.1007/978-3-642-38634-3_23) + +Tkaczyk, D., Szostek, P., Fedoryszak, M., Jan Dendek P., Bolikowski Ł. (2015). "CERMINE: automatic extraction of structured metadata from scientific literature", International Journal on Document Analysis and Recognition (IJDAR), 317–335. [doi:10.1007/s10032-015-0249-8](https://doi.org/10.1007/s10032-015-0249-8) + +Kobos M., Bolikowski Ł., Horst M., Manghi P., Μanola N., Schirrwagen J. (2014). "Information inference in scholarly communication infrastructures: the OpenAIREplus project experience", Procedia Computer Science 38, 92-99. 
[doi:10.1016/j.procs.2014.10.016](https://doi.org/10.1016/j.procs.2014.10.016) + +### Portals + +Baglioni M., Bardi A., Kokogiannaki A., Manghi P., Iatropoulou K., Principe P., Vieira A., Nielsen L. H., Dimitropoulos H., Foufoulas I., Manola N., Atzori C., La Bruzzo S., Lazzeri E., Artini M., De Bonis M., Dell’Amico A. (2019). "The OpenAIRE Research Community Dashboard: On Blending Scientific Workflows and Scientific Publishing", +International Conference on Theory and Practice of Digital Libraries (TPDL). Lecture Notes in Computer Science, vol 11799. Springer, Cham. [doi:10.1007/978-3-030-30760-8_5](https://doi.org/10.1007/978-3-030-30760-8_5) + +### Broker Service + +Manghi P., Atzori C., Bardi A., La Bruzzo S., Artini M. (2016). "Realizing a Scalable and History-Aware Literature Broker Service for OpenAIRE", Italian Research Conference on Digital Libraries (IRCDL), pp. 92-103, Springer, Cham. [doi:10.1007/978-3-319-56300-8_9](https://doi.org/10.1007/978-3-319-56300-8_9) + +Artini M., Atzori C., Bardi A., La Bruzzo S., Manghi P., Mannocci A. (2015). "The OpenAIRE literature broker service for institutional repositories", D-Lib Magazine, 21(11/12), 1. [doi:10.1045/november2015-artini](https://doi.org/10.1045/november2015-artini) + + + + diff --git a/versioned_sidebars/version-6.0.0-sidebars.json b/versioned_sidebars/version-6.0.0-sidebars.json new file mode 100644 index 0000000..75204f7 --- /dev/null +++ b/versioned_sidebars/version-6.0.0-sidebars.json @@ -0,0 +1,288 @@ +{ + "mySidebar": [ + { + "type": "doc", + "id": "intro" + }, + { + "type": "category", + "label": "Data model", + "link": { + "type": "doc", + "id": "data-model/data-model" + }, + "items": [ + { + "type": "category", + "label": "Entities", + "link": { + "type": "generated-index", + "description": "The main entities of the OpenAIRE Graph are listed below." 
+ }, + "items": [ + { + "type": "doc", + "id": "data-model/entities/result" + }, + { + "type": "doc", + "id": "data-model/entities/data-source" + }, + { + "type": "doc", + "id": "data-model/entities/organization" + }, + { + "type": "doc", + "id": "data-model/entities/project" + }, + { + "type": "doc", + "id": "data-model/entities/community" + } + ] + }, + { + "type": "category", + "label": "Relationships", + "link": { + "type": "generated-index", + "description": "This section describes the relationships between entities in the OpenAIRE Graph: the way they are modelled as well as the different relationship types currently supported." + }, + "items": [ + { + "type": "doc", + "id": "data-model/relationships/relationship-object" + }, + { + "type": "doc", + "id": "data-model/relationships/relationship-types" + } + ] + }, + { + "type": "doc", + "id": "data-model/pids-and-identifiers" + } + ] + }, + { + "type": "link", + "label": "Public API", + "href": "https://graph.openaire.eu/develop/overview.html" + }, + { + "type": "category", + "label": "Downloads", + "link": { + "type": "generated-index", + "description": "All resources, available for download, are listed below. For the versions available in Zenodo, please refer to the Changelog section." 
+ }, + "items": [ + { + "type": "doc", + "id": "downloads/full-graph" + }, + { + "type": "doc", + "id": "downloads/beginners-kit" + }, + { + "type": "doc", + "id": "downloads/subgraphs" + }, + { + "type": "doc", + "id": "downloads/related-datasets" + } + ] + }, + { + "type": "category", + "label": "Graph production workflow", + "link": { + "type": "doc", + "id": "graph-production-workflow/graph-production-workflow" + }, + "items": [ + { + "type": "category", + "label": "Aggregation", + "link": { + "type": "doc", + "id": "graph-production-workflow/aggregation/aggregation" + }, + "items": [ + { + "type": "doc", + "label": "OpenAIRE compatible sources", + "id": "graph-production-workflow/aggregation/compatible-sources" + }, + { + "type": "category", + "label": "Non-compatible sources", + "link": { + "type": "generated-index" + }, + "items": [ + { + "type": "doc", + "id": "graph-production-workflow/aggregation/non-compatible-sources/doiboost", + "label": "DOIBoost" + }, + { + "type": "doc", + "id": "graph-production-workflow/aggregation/non-compatible-sources/pubmed" + }, + { + "type": "doc", + "id": "graph-production-workflow/aggregation/non-compatible-sources/datacite" + }, + { + "type": "doc", + "id": "graph-production-workflow/aggregation/non-compatible-sources/ebi", + "label": "EMBL-EBI" + }, + { + "type": "doc", + "id": "graph-production-workflow/aggregation/non-compatible-sources/uniprot", + "label": "UniProtKB/Swiss-Prot" + } + ] + } + ] + }, + { + "type": "doc", + "id": "graph-production-workflow/merge-by-id" + }, + { + "type": "category", + "label": "Enrichment by mining", + "link": { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/enrichment-by-mining" + }, + "items": [ + { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/affiliation_matching" + }, + { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/citation_matching" + }, + { + "type": "doc", + "id": 
"graph-production-workflow/enrichment-by-mining/classifies" + }, + { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/documents_similarity" + }, + { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/acks" + }, + { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/cites" + }, + { + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/metadata_extraction" + } + ] + }, + { + "type": "doc", + "id": "graph-production-workflow/cleaning" + }, + { + "type": "category", + "label": "Deduplication", + "link": { + "type": "doc", + "id": "graph-production-workflow/deduplication/deduplication" + }, + "items": [ + { + "type": "doc", + "id": "graph-production-workflow/deduplication/research-products" + }, + { + "type": "doc", + "id": "graph-production-workflow/deduplication/organizations" + } + ] + }, + { + "type": "category", + "label": "Deduction & propagation", + "link": { + "type": "generated-index", + "description": "The OpenAIRE Graph is further enriched by the deduction and propagation processes described in this section." 
+ }, + "items": [ + { + "type": "doc", + "id": "graph-production-workflow/deduction-and-propagation/bulk-tagging" + }, + { + "type": "doc", + "id": "graph-production-workflow/deduction-and-propagation/propagation" + } + ] + }, + { + "type": "category", + "label": "Indicators ingestion", + "link": { + "type": "doc", + "id": "graph-production-workflow/indicators-ingestion/indicators-ingestion" + }, + "items": [ + { + "type": "doc", + "id": "graph-production-workflow/indicators-ingestion/impact-indicators" + }, + { + "type": "doc", + "id": "graph-production-workflow/indicators-ingestion/usage-counts" + } + ] + }, + { + "type": "doc", + "id": "graph-production-workflow/finalisation" + }, + { + "type": "doc", + "id": "graph-production-workflow/indexing" + }, + { + "type": "doc", + "id": "graph-production-workflow/stats" + } + ] + }, + { + "type": "doc", + "id": "publications", + "label": "Relevant publications" + }, + { + "type": "doc", + "id": "license" + }, + { + "type": "doc", + "id": "changelog" + }, + { + "type": "link", + "label": "Helpdesk", + "href": "https://graph.openaire.eu/support" + } + ] +} diff --git a/versions.json b/versions.json index 62da762..3f7194c 100644 --- a/versions.json +++ b/versions.json @@ -1,4 +1,5 @@ [ + "6.0.0", "5.2.0", "5.1.3", "5.1.2",