From 616f6482f1d8a81fca9f99915d45f67b993a52dd Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Oct 2022 14:00:54 +0200 Subject: [PATCH 01/25] more git ignores --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b2d6de3..e005ae6 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ npm-debug.log* yarn-debug.log* yarn-error.log* + +.idea/ \ No newline at end of file From 797f7c21f7d93fc1ed8500f622b6c41a4d428ef1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 4 Oct 2022 15:48:01 +0200 Subject: [PATCH 02/25] WIP: aggregation section --- docs/data-provision/aggregation.md | 16 ----- .../data-provision/aggregation/aggregation.md | 58 +++++++++++++++++++ .../aggregation/authoritative-datasources.md | 5 ++ sidebars.js | 9 ++- 4 files changed, 71 insertions(+), 17 deletions(-) delete mode 100644 docs/data-provision/aggregation.md create mode 100644 docs/data-provision/aggregation/aggregation.md create mode 100644 docs/data-provision/aggregation/authoritative-datasources.md diff --git a/docs/data-provision/aggregation.md b/docs/data-provision/aggregation.md deleted file mode 100644 index 6b865d6..0000000 --- a/docs/data-provision/aggregation.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -sidebar_position: 1 ---- - -# Aggregation - -OpenAIRE collects metadata records from a variety of content providers as described in the [aggregation and content provision workflows](https://www.openaire.eu/aggregation-and-content-provision-workflows). - -OpenAIRE aggregates metadata records describing objects of the research life-cycle from content providers compliant to the [OpenAIRE guidelines](https://guidelines.openaire.eu/) and from entity registries (i.e. data sources offering authoritative lists of entities, like OpenDOAR, re3data, DOAJ, and funder databases). 
After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph that you can access from the OpenAIRE portal and the APIs. - -The transformation process includes the application of cleaning functions whose goal is to ensure that values are harmonised according to a common format (e.g. dates as YYYY-MM-dd) and, whenever applicable, to a common controlled vocabulary. The controlled vocabularies used for cleansing are accessible at http://api.openaire.eu/vocabularies. Each vocabulary features a set of controlled terms, each with one code, one label, and a set of synonyms. If a synonym is found as field value, the value is updated with the corresponding term. Also, the OpenAIRE Research Graph is extended with other relevant scholarly communication sources that are too big to be integrated via the “normal” aggregation mechanism: DOIBoost (which merges Crossref, ORCID, Microsoft Academic Graph, and Unpaywall), and ScholeXplorer, one of the Scholix hubs offering a large set of links between research literature and data. - - -

- Aggregation -

\ No newline at end of file diff --git a/docs/data-provision/aggregation/aggregation.md b/docs/data-provision/aggregation/aggregation.md new file mode 100644 index 0000000..078a205 --- /dev/null +++ b/docs/data-provision/aggregation/aggregation.md @@ -0,0 +1,58 @@ +--- +sidebar_position: 1 +--- + +# Aggregation + +OpenAIRE materializes an open, participatory research graph (the OpenAIRE Research graph) where products of the research life-cycle (e.g. scientific literature, research data, project, software) are semantically linked to each other and carry information about their access rights (i.e. if they are Open Access, Restricted, Embargoed, or Closed) and the sources from which they have been collected and where they are hosted. The OpenAIRE research graph is materialised via a set of autonomic, orchestrated workflows operating in a regimen of continuous data aggregation and integration. [1] + +## What does OpenAIRE collect? + +OpenAIRE aggregates metadata records describing objects of the research life-cycle from content providers compliant to the [OpenAIRE guidelines](https://guidelines.openaire.eu/) and from entity registries (i.e. data sources offering authoritative lists of entities, like OpenDOAR, re3data, DOAJ, and funder databases). After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph that you can access from the OpenAIRE portal and the APIs. + +The transformation process includes the application of cleaning functions whose goal is to ensure that values are harmonised according to a common format (e.g. dates as YYYY-MM-dd) and, whenever applicable, to a common controlled vocabulary. The controlled vocabularies used for cleansing are accessible at http://api.openaire.eu/vocabularies. Each vocabulary features a set of controlled terms, each with one code, one label, and a set of synonyms. 
If a synonym is found as field value, the value is updated with the corresponding term. +Also, the OpenAIRE Research Graph is extended with other relevant scholarly communication sources that do not follow the OpenAIRE Guidelines and/or are too large to be integrated via the “normal” aggregation mechanism: DOIBoost (which merges Crossref, ORCID, Microsoft Academic Graph, and Unpaywall). + +

+ Aggregation +

+ +The OpenAIRE aggregation system collects information about objects of the research life-cycle compliant to the [OpenAIRE acquisition policy](https://www.openaire.eu/content-aquisition-policy1) from [different types of data sources](https://explore.openaire.eu/search/find/dataproviders): + +1. Scientific literature metadata and full-texts from institutional and thematic repositories, CRIS (Current Research Information Systems), Open Access journals and publishers; +2. Dataset metadata from data repositories and data journals; +3. Scientific literature, data and software metadata from Zenodo; +4. Metadata about data sources, organizations, projects, and funding programs from entity registries, i.e. authoritative sources such as CORDA and other funder databases for projects, OpenDOAR for publication repositories, re3data for data repositories, DOAJ for Open Access journals; +5. Metadata of open source research software from software repositories and Software Heritage; +6. Metadata about other types of research products, like workflows, protocols, methods, research packages. + +Relationships between objects are collected from the data sources, but also automatically detected by [inference algorithms](https://www.openaire.eu/blogs/text-mining-services-in-openaire-1) and added by authenticated users, who can insert links between literature, datasets, software and projects via [the “Link” procedure available from the OpenAIRE explore portal](https://explore.openaire.eu/participate/claim). + +## What kind of data sources are in OpenAIRE? + +Objects and relationships in the OpenAIRE Research Graph are extracted from information packages, i.e. metadata records, collected from data sources of the following kinds: + +- *Institutional or thematic repositories*: Information systems where scientists upload the bibliographic metadata and full-texts of their articles, due to obligations from their organization or due to community practices (e.g. 
ArXiv, Europe PMC); +- *Open Access Publishers and journals*: Information systems of open access publishers or related journals, which offer bibliographic metadata and PDFs of their published articles; +- *Data archives*: Information systems where scientists deposit descriptive metadata and files about their research data (also known as scientific data, datasets, etc.); +- *Hybrid repositories/archives*: Information systems where scientists deposit metadata and files of any kind of scientific product, including scientific literature, research data and research software (e.g. Zenodo); +- *Aggregator services*: Information systems that collect descriptive metadata about publications or datasets from multiple sources in order to enable cross-data source discovery of given research products. Examples are DataCite, BASE, DOAJ; +- *Entity Registries*: Information systems created with the intent of maintaining authoritative registries of given entities in the scholarly communication, such as OpenDOAR for the institutional repositories, re3data for the data repositories, CORDA and other funder databases for projects and funding information; +- *CRIS*: Information systems adopted by research and academic organizations to keep track of their research administration records and related results; examples of CRIS content are articles or datasets funded by projects, their principal investigators, facilities acquired thanks to funding, etc.; +- *Research Graphs*: Services that maintain an information space of (possibly interlinked) scholarly communication objects. Examples are CrossRef, ScholeXplorer and OpenAIRE itself. + +## How does OpenAIRE collect metadata records? + +OpenAIRE collects metadata records describing objects of the research life-cycle from content providers compliant to the OpenAIRE guidelines and from entity registries (i.e. data sources offering authoritative lists of entities, like OpenDOAR, re3data, DOAJ, and funder databases). 
+ +The OpenAIRE aggregator collects metadata records in the majority of cases via OAI-PMH, but also supports other standard exchange protocols like FTP(S), SFTP, and RESTful API. + +After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph that you can access from the OpenAIRE portal and the APIs. + +For additional details about the aggregation workflows, please refer to [2]. + +## References + +[1] Manghi P. et al. (2014) "The D-NET software toolkit: A framework for the realization, maintenance, and operation of aggregative infrastructures", Program, Vol. 48 Issue: 4, pp.322-354, [10.1108/PROG-08-2013-0045](https://doi.org/10.1108/PROG-08-2013-0045) + +[2] Atzori, Claudio, Bardi, Alessia, Manghi, Paolo, & Mannocci, Andrea. (2017). The OpenAIRE workflows for data management. Zenodo. [10.5281/zenodo.996006](http://doi.org/10.5281/zenodo.996006) diff --git a/docs/data-provision/aggregation/authoritative-datasources.md b/docs/data-provision/aggregation/authoritative-datasources.md new file mode 100644 index 0000000..04f4fac --- /dev/null +++ b/docs/data-provision/aggregation/authoritative-datasources.md @@ -0,0 +1,5 @@ +--- +sidebar_position: 1 +--- + +# Authoritative data sources diff --git a/sidebars.js b/sidebars.js index 0d09d07..a01c2b7 100644 --- a/sidebars.js +++ b/sidebars.js @@ -58,7 +58,14 @@ const sidebars = { label: "Data provision", link: {type: 'doc', id: 'data-provision/data-provision'}, items: [ - { type: 'doc', id: 'data-provision/aggregation' }, + { + type: 'category', + label: "Aggregation", + link: {type: 'doc', id: 'data-provision/aggregation/aggregation'}, + items: [ + { type: 'doc', id: 'data-provision/aggregation/authoritative-datasources' } + ] + }, { type: 'category', label: "Deduplication", From ec3d23ce887aa97c256c0d9eb062b27ee7e3c65f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 5 Oct 2022 15:06:21 +0200 Subject: [PATCH 03/25] WIP: 
authoritative data sourcesag section --- .../data-provision/aggregation/aggregation.md | 6 +- .../aggregation/authoritative-datasources.md | 275 ++++++++++++++++++ 2 files changed, 277 insertions(+), 4 deletions(-) diff --git a/docs/data-provision/aggregation/aggregation.md b/docs/data-provision/aggregation/aggregation.md index 078a205..4e159e9 100644 --- a/docs/data-provision/aggregation/aggregation.md +++ b/docs/data-provision/aggregation/aggregation.md @@ -8,7 +8,7 @@ OpenAIRE materializes an open, participatory research graph (the OpenAIRE Resear ## What does OpenAIRE collect? -OpenAIRE aggregates metadata records describing objects of the research life-cycle from content providers compliant to the [OpenAIRE guidelines](https://guidelines.openaire.eu/) and from entity registries (i.e. data sources offering authoritative lists of entities, like OpenDOAR, re3data, DOAJ, and funder databases). After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph that you can access from the OpenAIRE portal and the APIs. +OpenAIRE aggregates metadata records describing objects of the research life-cycle from content providers compliant to the [OpenAIRE guidelines](https://guidelines.openaire.eu/) and from entity registries (i.e. data sources offering authoritative lists of entities, like [OpenDOAR](https://v2.sherpa.ac.uk/opendoar/), [re3data](https://www.re3data.org/), [DOAJ](https://doaj.org/), and various funder databases). After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph, accessible from the [OpenAIRE EXPLORE portal](https://explore.openaire.eu) and the [APIs](https://graph.openaire.eu/develop/). The transformation process includes the application of cleaning functions whose goal is to ensure that values are harmonised according to a common format (e.g. 
dates as YYYY-MM-dd) and, whenever applicable, to a common controlled vocabulary. The controlled vocabularies used for cleansing are accessible at http://api.openaire.eu/vocabularies. Each vocabulary features a set of controlled terms, each with one code, one label, and a set of synonyms. If a synonym is found as field value, the value is updated with the corresponding term. Also, the OpenAIRE Research Graph is extended with other relevant scholarly communication sources that do not follow the OpenAIRE Guidelines and/or are too large to be integrated via the “normal” aggregation mechanism: DOIBoost (which merges Crossref, ORCID, Microsoft Academic Graph, and Unpaywall). @@ -45,9 +45,7 @@ Objects and relationships in the OpenAIRE Research Graph are extracted from info OpenAIRE collects metadata records describing objects of the research life-cycle from content providers compliant to the OpenAIRE guidelines and from entity registries (i.e. data sources offering authoritative lists of entities, like OpenDOAR, re3data, DOAJ, and funder databases). -The OpenAIRE aggregator collects metadata records in the majority of cases via OAI-PMH, but also supports other standard exchange protocols like FTP(S), SFTP, and RESTful API. - -After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph that you can access from the OpenAIRE portal and the APIs. +The OpenAIRE aggregator collects metadata records in the majority of cases via [OAI-PMH](https://www.openarchives.org/pmh/), but also supports other standard exchange protocols like FTP(S), SFTP, and some RESTful API. For additional details about the aggregation workflows, please refer to [2]. 
diff --git a/docs/data-provision/aggregation/authoritative-datasources.md b/docs/data-provision/aggregation/authoritative-datasources.md index 04f4fac..1c16786 100644 --- a/docs/data-provision/aggregation/authoritative-datasources.md +++ b/docs/data-provision/aggregation/authoritative-datasources.md @@ -3,3 +3,278 @@ sidebar_position: 1 --- # Authoritative data sources + +One of the challenges towards the stability of the contents in the OpenAIRE Research Graph consists of making its identifiers and records stable over time. The barriers to this scenario are many, as the Graph keeps a map of data sources that is subject to constant variations: records in repositories vary in content, original IDs, and PIDs, may disappear or reappear, and the same holds for the repository or the metadata collection it exposes. Not only, but the mappings applied to the original contents may also change and improve over time to catch up with the changes in the input records. + +One of the fronts regards the attribution of the identity to the objects populating the graph. The basic idea is to build the identifiers of the objects in the graph from the PIDs available in some authoritative sources while considering all the other sources as by definition “unstable”. Examples of authoritative sources are Crossref and DataCite. Examples of non-authoritative ones are institutional repositories, aggregators, etc. PIDs from the authoritative sources would form the stable OpenAIRE ID skeleton of the Graph, precisely because they are immutable by construction. 
+ +Such a policy defines a list of data sources that are considered authoritative for a specific type of PID they provide, whose effect is twofold: +* OpenAIRE IDs depend on persistent IDs when they are provided by the authority responsible for creating them; +* PIDs are included in the graph according to a tight criterion: the PID Types declared in the table below are considered to be mapped as PIDs only when they are collected from the relative PID authority data source. + +| *PID Type* | *Authority* | +|---|------------------------------------------------------------------------------------| +| doi | [Crossref](https://www.crossref.org), [Datacite](https://datacite.org) | +| pmc, pmid | [Europe PubMed Central](https://europepmc.org/), [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc) | +| arXiv | [arXiv.org e-Print Archive](https://arxiv.org/) | + +There is an exception though: Handle(s) are minted by several repositories; as listing them all would not be a viable option, to avoid losing them as PIDs, Handles bypass the PID authority filtering rule. +In all other cases, PIDs are included in the graph as alternate Identifiers. + +When a record is aggregated from multiple sources considered authoritative for minting specific PIDs, different mappings could be applied to them and, depending on the case, +this could result in inconsistencies in the attribution of the field values. +To overcome the issue, the intuition is to include such records only once in the graph. To do so, the concept of "delegated authorities" defines a list of datasources that +assign PIDs to their scientific products from a given PID minter. + +This "selection" can be performed when the entities in the graph sharing the same identifier are grouped together. 
The list of the delegated authorities currently includes + +| *Datasource delegated* | *Datasource delegating* | *Pid Type* | +|------------------------------|---------------------------|-----| +| [Zenodo](https://zenodo.org) | [Datacite](https://datacite.org) | doi | +| [RoHub](https://reliance.rohub.org/) | [W3ID](https://w3id.org/) | w3id | + +## DOIBoost: Crossref, Unpaywall, Microsoft Academic Graph, ORCID + +DOIBoost is a dataset that combines research outputs and links among them from a selection of data sources. It enriches the records available on Crossref with what's available on Unpaywall, Microsoft Academic Graph, ORCID intersecting all those datasets by DOI. As consequence, DOIBoost does not contain any record from MAG, Unpaywall, or ORCID that doesn't provide a DOI available in Crossref. + +The idea behind DOIBoost and its origin can be found in the paper (and related resources) at: + +* La Bruzzo S., Manghi P., Mannocci A. (2019) OpenAIRE's DOIBoost - Boosting CrossRef for Research. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, doi:10.1007/978-3-030-11226-4_11 . Open Access version available at: [10.5281/zenodo.1441071](https://doi.org/10.5281/zenodo.1441071) + +Each Crossref record is enriched with: +* ORCID identifiers of authors from ORCID +* Open Access instance (with OA color/route and license) from Unpaywall +* the following information from MAG: + * abstracts + * MAG identifiers of authors + * affiliation (result - organization) relationships + * subjects (MAG FieldsOfStudy) + * conference or journal information + +The Open Access status is also set by intersecting the journal information of a record with the journal lists available from DOAJ and the Gold ISSN list. + +### Inputs + +* *Crossref*: dump available to Crossref subscribers via MetadataPlus service, updated once a month. 
+* *Microsoft Academic Graph*: downloaded version on 2021-02-15. We plan to take the latest version in Dec 2021 before MAG will be retired. +* *ORCID*: baseline dump obtained in 2020-10-13, regularly updated every week from the [ORCID public API](https://info.orcid.org/documentation/features/public-api). +* *Unpaywall*: public database snapshot downloaded in March 2021. Unpaywall updates it twice a year (https://unpaywall.org/products/snapshot) + +The construction of the DOIBoost dataset consists of the following phases: + +### 1 Filtering + +Records in Crossref are ruled out according to the following criteria + +* have blank title, examples: + * `10.1093/rheumatology/41.7.837` + * `10.1093/qjmed/95.7.430` + * `10.1371/journal.pone.0171434.g005` +* have one of the following publishers: `"Test accounts"`, `"CrossRef Test Account"` + * Examples from https://api.crossref.org/works?query.publisher-name=%22Test%20accounts%22 + * `10.1007/bf00344543` + * `10.1007/bf00186154` + * `10.1306/64ed947a-1724-11d7-8645000102c1865d` +* have no authors with valid names, where valid means: not blank and different from all strings in this list: `List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")` + * Examples for blank authors: + * `10.1108/00070709810247807` + * `10.1016/s1074-9098(02)00346-5` + * `10.1136/heart.88.1.6` + * Examples for `"none"` author from https://api.crossref.org/works?query.author=%22none%22 + * `10.4007/annals.2016.184.3.11` + * `10.4007/annals.2012.176.1.6` + * `10.2172/6393585` + * Examples for `"test"` author from https://api.crossref.org/works?query.author=%22test%22 + * `10.5116/ijme.54ca.a5ae` + * `10.5755/j01.ss.71.2.544` + * `10.5755/j01.ee.22.2.319` +* have `"Addie Jackson"` as author and `"Elsevier BV"` as publisher (empirically we say they are test records) + * Examples from https://api.crossref.org/works?query.author=Addie+Jackson&query.publisher-name=%22Elsevier%20BV%22 + * 
`10.2139/ssrn.2082156` + * `10.2139/ssrn.2202300` + * `10.2139/ssrn.2255657` +* have not one of the following values in the field `type` : `"book-section"`, `"book"`, `"book-chapter"`, `"book-part"`, `"book-series"`, `"book-set"`, `"book-track"`, `"edited-book"`, `"reference-book"`, `"monograph"`, `"journal-article"`, `"dissertation"`, `"other"`, `"peer-review"`, `"proceedings"`, `"proceedings-article"`, `"reference-entry"`, `"report"`, `"report-series"`, `"standard"`, `"standard-series"`, `"posted-content"`, `"dataset"`, + * Example: + * `10.1371/journal.pone.0171434.g005` + * `10.7554/elife.21052.049` + * `10.1371/journal.pcbi.1005379.s006` + +Records with `type=dataset` are mapped into OpenAIRE results of type dataset. All others are mapped as OpenAIRE results of type publication. + +### 2 Mapping Crossref properties into the OpenAIRE Research Graph + +Properties in OpenAIRE results are set based on the logic described in the following table: + +| OpenAIRE Result field path | Crossref path(s) | Notes | +|----------------------------------------|--------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `doi` | id in the form `doi_________::md5(doi)` | +| `dateofcollection` | `indexed.datetime` | | +| `lastupdatetimestamp` | `indexed.timestamp` | | +| `type` | `type` | `dataset` if the Crossref type is dataset, `publication` 
otherwise (based on the filtering logic described above) | +| `originalId` | `doi, clinical-trial-number, alternative-id` | | +| `pid` | | The scheme tells the type of PID, the value contains the actual value | +| `pid.scheme` | | Default value: `doi` | +| `pid.value` | `doi` | The doi is normalised and lower-cased | +| `maintitle` | `title` | | +| `subtitle` | `subtitle` | | +| `author` | `author` | if available the sequence is mapped to rank and the ORCID is also mapped | +| `author.name` | `author.given` | | +| `author.surname` | `author.family` | | +| `author.fullname` | `author.given author.family` | | +| `author.rank` | | based on the order, starts from 1 | +| `author.pid` | | only if the ORCID is available | +| `author.pid.id.scheme` | | Default `'orcid_pending'` (meaning that it is not an id confirmed by ORCID) | +| `author.pid.id.value` | `author.ORCID` | | +| `author.pid.provenance.provenance` | | Default 'Harvested' | +| `author.pid.provenance.trust` | | Default '0.9' | +| `description` | `abstract` | | +| `subject` | `subject` | with `classid='keywords'`, i.e. no controlled vocabularies for Crossref subjects | +| `publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | +| `publisher` | `publisher` | | +| `source` | `source` | only if the record is not of type `book` | +| `source` | concatenation of `container-title.head` + `"ISBN: "` + `ISBN.head` | only if the record is of type `book` | +| `container` | | It is set only for publications with information about the journal it was published in. 
| +| `container.name` | `container-title.head` | | +| `container.issnOnline` | `issn-type.value` | if `issn-type.type='electronic'` | +| `container.issnPrinted` | `issn-type.value` | if `issn-type.type='print'` | +| `container.vol` | `volume` | | +| `container.sp` | `page` | before `'-'` | +| `container.ep` | `page` | after `'-'` | +| `instance` | | One instance is created with the DOI URL | +| `instance.accessright` | | Values in `instance.accessright.code` and `instance.accessright.label` are set based on license and dateofacceptance:
- `UNKNOWN`: if the license is blank
- `OPEN ACCESS`: if the license is a CC license or an ACS license or an APA license (considered OPEN also by Unpaywall, see [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44002063718-what-is-an-oa-license-) for details) or if OUP license, but only after 12 months from the publication date
- `EMBARGO`: OUP license, before 12 months from the publication date
- `CLOSED`: if there is a license not covered by the previous cases | +| `instance.accessright.code` | | Code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | + | `instance.accessright.label` | | One of: `OPEN`, `RESTRICTED`, `CLOSED`, `EMBARGO` | + | `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | + | `instance.accessright.openAccessRoute` | | only if `instance.accessright.value = 'OPEN ACCESS'`. Default is `hybrid`. The route is fixed in subsequent phases of DOIBoost, namely when intersecting with Unpaywall and patching the hostedby via DOAJ and the Gold-ISSN list. | + | `instance.license` | `license.URL ` | If there is a `license.content-version='vor'`, then this is used. Otherwise the first license entry is used. | + | `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | + | `instance.pid.scheme` | | Default value: `doi` | + | `instance.pid.value` | `doi` | The doi is normalised and lower-cased | + | `instance.publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | + | `instance.refereed` | | set to `peerReviewed` only if `relation.has-review.id` is not empty, `UNKNOWN` otherwise. | + | `instance.type` | `subtype` | mapped using the [OpenAIRE vocabulary for result typologies](https://api.openaire.eu/vocabularies/dnet:result_typologies) | + | `instance.url` | `doi` | Full URL of the DOI | + +All other fields of the Json schema not mentioned in the table contain empty values. + +All the records from Crossref are related to the datasource with `name=Crossref` and `id=openaire____::081b82f96300b6a6e3d282bad31cb6e2` + +Possible improvements: +* map `clinical-trial-number` and `alternative-id` in `alternateIdentifiers`? 
+* Verify if Crossref has a property for `language`, `country`, `container.issnLinking`, `container.iss`, `container.edition`, `container.conferenceplace` and `container.conferencedate` +* Different approach to set the `refereed` field and improve its coverage? + +### 2 Map Crossref links to projects/funders + +Links to funding available in Crossref are mapped as funding relationships (`result -- isProducedBy --> project`) applying the following mapping: + +| *funder* | *grant code* | *Link to* | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|---| +| DOI: `{10.13039/100010663, 10.13039/100010661, 10.13039/501100007601, 10.13039/501100000780, 10.13039/100010665}` or name: `'European Union’s Horizon 2020 research and innovation program'` | series of `4-9` digits in `award` | Link to H2020 project | +| DOI: `{10.13039/100011199, 10.13039/100004431, 10.13039/501100004963, 10.13039/501100000780}` | series of `4-9` digits in `award` | Link to FP7 project | +| DOI: `10.13039/501100000781` OR name: `'European Union's'` | series of `4-9` digits in `award` | Link to FP7 or H2020 project | +| DOI: `10.13039/100000001` | `award` | Link to NSF project | +| DOI: `10.13039/501100001665` OR name: `{'The French National Research Agency (ANR)', 'The French National Research Agency'}` | `award` | Link to ANR project | +| DOI: `10.13039/501100002341` | `award` | Link to Academy of Finland project | +| DOI: `10.13039/501100001602` | `award`, removing the initial 'SFI' if present | Link to SFI project | +| DOI: `10.13039/501100000923` | `award` | Link to ARC project | +| DOI: `10.13039/501100000038` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to NSERC (`unidentified` project) | +| DOI: `10.13039/501100000155` | `award` ignore: we cannot map the project codes in Crossref to project codes in 
OpenAIRE | Link to SSHRC (`unidentified` project) | +| DOI: `10.13039/501100000024` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to CIHR (`unidentified` project) | +| DOI: `10.13039/501100002848` OR name: `'CONICYT, Programa de Formación de Capital Humano Avanzado'` | `award` | Link to CONICYT project | +| DOI: `10.13039/501100003448` | series of `4-9` digits in `award` | Link to GSRT project | +| DOI: `10.13039/501100010198` | `award` | Link to SGOV project | +| DOI: `10.13039/501100004564` | series of `4-9` digits in `award` | Link to MESTD project | +| DOI: `10.13039/501100003407` | `award` | Link to MIUR project. Since OpenAIRE has a small subset of MIUR projects, a link to the MIUR funder (`unidentified` project) is also generated | +| DOI: `{10.13039/501100006588, 10.13039/501100004488}` | `award`, removing `'Project No'` and `'HRZZ'` prefix, if present | Link to HRZZ or MZOS project | +| DOI: `10.13039/501100006769` | `award` | Link to Russian Science Foundation project | +| DOI: `10.13039/501100001711` | `award` after `'_'` and before `'/'` | Link to SNSF project | +| DOI: `10.13039/501100004410` | `award` | Link to TUBITAK project | +| DOI: `10.13039/100004440` or name: `Wellcome Trust Masters Fellowship` | `award` | Link to Wellcome Trust specific project and to the `unidentified` project.| + +### 3 Intersect Crossref with UnpayWall by DOI (DOIBoost1) + +The fields we consider from UnpayWall are: +* `is_oa` +* `best_oa_location` +* `oa_status` + +The results of Crossref that intersect by DOI with UnpayWall records are enriched with one additional `instance` with the following properties: + +| *OpenAIRE Result field path* | *Unpaywall field path* | *Notes* | +|---|---|------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `instance` | | created only if `is_oa` and a `best_oa_location` is 
available | +| `instance.accessright` | | default value `Open Access`: we do not add instances if UnpayWall says there is no open version | +| `instance.accessright.code` | | Open Access code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.label` | | Always `OPEN` | +| `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.openAccessRoute` | `oa_status` | | +| `instance.url` | `best_oa_location` | | +| `instance.license` | `best_oa_location.license` | | +| `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | +| `instance.pid.scheme` | | Default value: `doi` | +| `instance.pid.value` | `doi` | The doi is normalised and lower-cased | + +For the definition of UnpayWall's @oa_status@ refer to the [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44001777288-what-do-the-types-of-oa-status-green-gold-hybrid-and-bronze-mean-) + +The record will also feature a relation to the UnpayWall data source: `name="UnpayWall"`, `id=openaire____::8ac8380272269217cb09a928c8caa993`. 
+ +### 4 Intersect DOIBoost1 with ORCID (DOIBoost2) + +The fields we consider from ORCID are: +* `doi` +* `authors`, a list of authors, each with optional `name`, `surname`, `creditName`, `oid` + +| *OpenAIRE field path* | *ORCID path* | *Notes* | +|-------------------------------------|---|-------------------------------------------------------------------------------------------------------------------------------------| +| `pid` | `doi` | | +| `author.name` | `capitalize(name)` | only mapped if not blank | +| `author.surname` | `capitalize(surname)` | only mapped if not blank | +| `author.fullname` | | if name and surname are not blank, they are concatenated (`capitalize(name) capitalize(surname)`), otherwise we use the `creditName` | +| `author.pid` | | only if the `ORCID` is available | +| `author.pid.id.scheme` | | Default `orcid` (meaning that it is confirmed by ORCID, (in contrast to the `orcid_pending` set from Crossref and Unpaywall) | +| `author.pid.id.value` | `oid` | | +| `author.pid.provenance.provenance` | | Default `Harvested` | +| `author.pid.provenance.trust` | | Default `0.9` | + +The records are enriched with the ORCID identifiers of their authors. + +[//]: # (TODO: Update with the new approach implemented by Miriam.) 
+ +The current approach is: +* if the number of authors from Crossref equals the size of authors from ORCID, then we pick the list of authors with more PIDs and try to enrich it with the PIDs from the other list, based on JaroWrinkler distance on authors' names, surnames, or fullnames, depending on which properties are available; +* if the number of authors are different, then we take the longest and try to enrich it with the PIDs from the other author list, based on JaroWrinkler distance on authors' names, surnames, or fullnames, depending on which properties are available + +Miriam will modify the process to ensure that: +* the list of authors from Crossred always "win" +* the identifiers from ORCID "win" + +### 5 Intersect DOIBoost2 with Microsoft Academic Graph (DOIBoost3) + +*Important Notes* +* Only papers with DOI are considered +* Since for the same DOI we have multiple version of item with different MAG PaperId, we only take one per DOI (the last one we process). We call this dataset @Papers_distinct@ + +When mapping MAG records to the OpenAIRE Research Graph, we consider the following MAG tables: +* `PaperAbstractsInvertedIndex`: for the paper abstracts +* `Authors`: for the authors. The MAG data is pre-processed by grouping authors by PaperId +* `Affiliations` and `PaperAuthorAffiliations`: to generate links between publications and organisations +* `Journals` and `ConferenceInstances`: joined with @Papers_distinct@ to have the information about the venues where the paper was published +* TO BE REMOVED `PaperUrls`: to create one instance for the OpenAIRE publication +* `FieldsOfStudy`: to add subjects + +The records are enriched with: +* abstracts +* MAG identifiers of authors +* affiliation relationships +* subjects (MAG FieldsOfStudy) +* conference or journal information (in the @journal@ field) TODO: or @container@, in case of the dump? 
+* [TO BE REMOVED] instances with URL from MAG + +### 6 Enrich DOIBoost3 with hosting data sources (`hostedby`) and access right information + +In this phase, we intersect DOIBoost3 with a dataset composed of journals from OpenAIRE, Crossref, and the ISSN gold list. Each journal comes with its International Standard Serial Numbers (`issn`, `eissn`, `lissn`) and, when available, a flag that tells if the journal is open access. The intersection is done on the basis of the International Standard Serial Numbers. The records with a `journal.[l|e]issn` that match are enriched as follows: +* Each instance gain the `hostedby` information corresponding to the journal +* If the journal is open access, the access rights of the instances are also set to `Open Access` with `gold` route (because by construction, the journals we know are open are from DOAJ or Gold ISSN list) + +The hostedby of records that do not match are set to the `Unknown Repository`. From a5b100520c985b51402c60be50ab1afd82892668 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Oct 2022 11:36:44 +0200 Subject: [PATCH 04/25] WIP: authoritative data sources section --- docs/data-provision/aggregation/datacite.md | 0 docs/data-provision/aggregation/doiboost.md | 0 docs/data-provision/aggregation/pubmed.md | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/data-provision/aggregation/datacite.md create mode 100644 docs/data-provision/aggregation/doiboost.md create mode 100644 docs/data-provision/aggregation/pubmed.md diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/data-provision/aggregation/doiboost.md b/docs/data-provision/aggregation/doiboost.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md new file mode 100644 index 0000000..e69de29 From 
2bd8fa9956bda67bb935f1f60cace185e1b8fb22 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Oct 2022 11:37:33 +0200 Subject: [PATCH 05/25] WIP: authoritative data sources section --- docs/data-provision/aggregation/datacite.md | 1 + docs/data-provision/aggregation/doiboost.md | 247 ++++++++++++++++++++ docs/data-provision/aggregation/pubmed.md | 61 +++++ 3 files changed, 309 insertions(+) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index e69de29..f6cd37e 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -0,0 +1 @@ +# Datacite \ No newline at end of file diff --git a/docs/data-provision/aggregation/doiboost.md b/docs/data-provision/aggregation/doiboost.md index e69de29..9a039b2 100644 --- a/docs/data-provision/aggregation/doiboost.md +++ b/docs/data-provision/aggregation/doiboost.md @@ -0,0 +1,247 @@ +# DOIBoost: Crossref, Unpaywall, Microsoft Academic Graph, ORCID + +DOIBoost is a dataset that combines research outputs and links among them from a selection of data sources. +It enriches the records available on Crossref with what's available on Unpaywall, Microsoft Academic Graph, and ORCID, intersecting all those datasets by DOI. +As a consequence, DOIBoost does not contain any record from MAG, Unpaywall, or ORCID that doesn't provide a DOI available in Crossref. + +The idea behind DOIBoost and its origin can be found in the paper (and related resources) at: + +* La Bruzzo S., Manghi P., Mannocci A. (2019) OpenAIRE's DOIBoost - Boosting CrossRef for Research. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, doi:10.1007/978-3-030-11226-4_11 . 
Open Access version available at: [10.5281/zenodo.1441071](https://doi.org/10.5281/zenodo.1441071) + +Each Crossref record is enriched with: +* ORCID identifiers of authors from ORCID +* Open Access instance (with OA color/route and license) from Unpaywall +* the following information from MAG: + * abstracts + * MAG identifiers of authors + * affiliation (result - organization) relationships + * subjects (MAG FieldsOfStudy) + * conference or journal information + +The Open Access status is also set by intersecting the journal information of a record with the journal lists available from DOAJ and the Gold ISSN list. + +## Inputs + +* *Crossref*: dump available to Crossref subscribers via MetadataPlus service, updated once a month. +* *Microsoft Academic Graph*: downloaded version on 2021-02-15. We plan to take the latest version in Dec 2021 before MAG will be retired. +* *ORCID*: baseline dump obtained in 2020-10-13, regularly updated every week from the [ORCID public API](https://info.orcid.org/documentation/features/public-api). +* *Unpaywall*: public database snapshot downloaded in March 2021. Unpaywall updates it twice a year (https://unpaywall.org/products/snapshot) + +The construction of the DOIBoost dataset consists of the following phases: + +## 1. 
Crossref filtering + +Records in Crossref are ruled out according to the following criteria + +* have blank title, examples: + * `10.1093/rheumatology/41.7.837` + * `10.1093/qjmed/95.7.430` + * `10.1371/journal.pone.0171434.g005` +* have one of the following publishers: `"Test accounts"`, `"CrossRef Test Account"` + * Examples from https://api.crossref.org/works?query.publisher-name=%22Test%20accounts%22 + * `10.1007/bf00344543` + * `10.1007/bf00186154` + * `10.1306/64ed947a-1724-11d7-8645000102c1865d` +* have no authors with valid names, where valid means: not blank and different from all strings in this list: `List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")` + * Examples for blank authors: + * `10.1108/00070709810247807` + * `10.1016/s1074-9098(02)00346-5` + * `10.1136/heart.88.1.6` + * Examples for `"none"` author from https://api.crossref.org/works?query.author=%22none%22 + * `10.4007/annals.2016.184.3.11` + * `10.4007/annals.2012.176.1.6` + * `10.2172/6393585` + * Examples for `"test"` author from https://api.crossref.org/works?query.author=%22test%22 + * `10.5116/ijme.54ca.a5ae` + * `10.5755/j01.ss.71.2.544` + * `10.5755/j01.ee.22.2.319` +* have `"Addie Jackson"` as author and `"Elsevier BV"` as publisher (empirically we say they are test records) + * Examples from https://api.crossref.org/works?query.author=Addie+Jackson&query.publisher-name=%22Elsevier%20BV%22 + * `10.2139/ssrn.2082156` + * `10.2139/ssrn.2202300` + * `10.2139/ssrn.2255657` +* have not one of the following values in the field `type` : `"book-section"`, `"book"`, `"book-chapter"`, `"book-part"`, `"book-series"`, `"book-set"`, `"book-track"`, `"edited-book"`, `"reference-book"`, `"monograph"`, `"journal-article"`, `"dissertation"`, `"other"`, `"peer-review"`, `"proceedings"`, `"proceedings-article"`, `"reference-entry"`, `"report"`, `"report-series"`, `"standard"`, `"standard-series"`, `"posted-content"`, `"dataset"`, + * 
Example: + * `10.1371/journal.pone.0171434.g005` + * `10.7554/elife.21052.049` + * `10.1371/journal.pcbi.1005379.s006` + +Records with `type=dataset` are mapped into OpenAIRE results of type dataset. All others are mapped as OpenAIRE results of type publication. + +## 2. Mapping Crossref properties into the OpenAIRE Research Graph + +Properties in OpenAIRE results are set based on the logic described in the following table: + +| OpenAIRE Result field path | Crossref path(s) | Notes | +|----------------------------------------|--------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `doi` | id in the form `doi_________::md5(doi)` | +| `dateofcollection` | `indexed.datetime` | | +| `lastupdatetimestamp` | `indexed.timestamp` | | +| `type` | `type` | `dataset` if the Crossref type is dataset, `publication` otherwise (based on the filtering logics described above) | +| `originalId` | `doi, clinical-trial-number, alternative-id` | | +| `pid` | | The scheme tells the type of PID, the value contains the actual value | +| `pid.scheme` | | Default value: doi | +| `pid.value` | `doi` | The doi is normalised and lower-cased | +| `maintitle` | `title` | | +| `subtitle` | `subtitle` | | +| `author` | `author` | if available the sequence is mapped to rank and the ORCID is also mapped | +| `author.name` | `author.given` | | +| `author.surname` 
| `author.family` | | +| `author.fullname` | `author.given author.family` | | +| `author.rank` | | based on the order, starts from 1 | +| `author.pid` | | only if the ORCID is available | +| `author.pid.id.scheme` | | Default `'pending_orcid'` (meaning that it is not an id confirmed by ORCID) | +| `author.pid.id.value` | `author.ORCID` | | +| `author.pid.provenance.provenance` | | Default 'Harvested' | +| `author.pid.provenance.trust` | | Default '0.9' | +| `description` | `abstract` | | +| `subject` | `subject` | with `classid='keywords'`, i.e. no controlled vocabularies for Crossref subjects | +| `publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | +| `publisher` | `publisher` | | +| `source` | `source` | only if the record is not of type `book` | +| `source` | concatenation of `container-title.head` + `"ISBN: "` + `ISBN.head` | only if the record is of type `book` | +| `container` | | It is set only for publications with information about the journal it was published in. | +| `container.name` | `container-title.head` | | +| `container.issnOnline` | `issn-type.value` | if `issn-type.type='electronic'` | +| `container.issnPrinted` | `issn-type.value` | if `issn-type.type='print'` | +| `container.vol` | `volume` | | +| `container.sp` | `page` | before `'-'` | +| `container.ep` | `page` | after `'-'` | +| `instance` | | One instance is created with the DOI URL | +| `instance.accessright` | | Values in `instance.accessright.code` and `instance.accessright.label` are set based on license and dateofacceptance:
- `UNKNOWN`: if the license is blank
- `OPEN ACCESS`: if the license is a CC license or an ACS license or an APA license (considered OPEN also by Unpaywall, see [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44002063718-what-is-an-oa-license-) for details) or if OUP license, but only after 12 months from the publication date
- `EMBARGO`: OUP license, before 12 months from the publication date
- `CLOSED`: if there is a license not covered by the previous cases | +| `instance.accessright.code` | | Code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.label` | | One of: `OPEN`, `RESTRICTED`, `CLOSED`, `EMBARGO` | +| `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.openAccessRoute` | | only if `instance.accessright.value = 'OPEN ACCESS'`. Default is `hybrid`. The route is fixed in subsequent phases of DOIBoost, namely when intersecting with Unpaywall and patching the hostedby via DOAJ and the Gold-ISSN list. | +| `instance.license` | `license.URL ` | If there is a `license.content-version='vor'`, then this is used. Otherwise the first license entry is used. | +| `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | +| `instance.pid.scheme` | | Default value: `doi` | +| `instance.pid.value` | `doi` | The doi is normalised and lower-cased | +| `instance.publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | +| `instance.refereed` | | set to `peerReviewed` only if `relation.has-review.id` is not empty, `UNKNOWN` otherwise. | +| `instance.type` | `subtype` | mapped using the [OpenAIRE vocabulary for result typologies](https://api.openaire.eu/vocabularies/dnet:result_typologies) | +| `instance.url` | `doi` | Full URL of the DOI | + +All other fields of the Json schema not mentioned in the table contain empty values. + +All the records from Crossref are related to the datasource with `name=Crossref` and `id=openaire____::081b82f96300b6a6e3d282bad31cb6e2` + +Possible improvements: +* map `clinical-trial-number` and `alternative-id` in `alternateIdentifiers`? 
+* Verify if Crossref has a property for `language`, `country`, `container.issnLinking`, `container.iss`, `container.edition`, `container.conferenceplace` and `container.conferencedate` +* Different approach to set the `refereed` field and improve its coverage? + +### Map Crossref links to projects/funders + +Links to funding available in Crossref are mapped as funding relationships (`result -- isProducedBy --> project`) applying the following mapping: + +| *funder* | *grant code* | *Link to* | +|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| DOI: `{10.13039/100010663, 10.13039/100010661, 10.13039/501100007601, 10.13039/501100000780, 10.13039/100010665}` or name: `'European Union’s Horizon 2020 research and innovation program'` | series of `4-9` digits in `award` | Link to H2020 project | +| DOI: `{10.13039/100011199, 10.13039/100004431, 10.13039/501100004963, 10.13039/501100000780}` | series of `4-9` digits in `award` | Link to FP7 project | +| DOI: `10.13039/501100000781` OR name: `'European Union's'` | series of `4-9` digits in `award` | Link to FP7 or H2020 project | +| DOI: `10.13039/100000001` | `award` | Link to NSF project | +| DOI: `10.13039/501100001665` OR name: `{'The French National Research Agency (ANR)', 'The French National Research Agency'}` | `award` | Link to ANR project | +| DOI: `10.13039/501100002341` | `award` | Link to Academy of Finland project | +| DOI: `10.13039/501100001602` | `award`, removing the initial 'SFI' if present | Link to SFI project | +| DOI: `10.13039/501100000923` | `award` | Link to ARC project | +| DOI: `10.13039/501100000038` | `award` 
ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to NSERC (`unidentified` project) | +| DOI: `10.13039/501100000155` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to SSHRC (`unidentified` project) | +| DOI: `10.13039/501100000024` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to CIHR (`unidentified` project) | +| DOI: `10.13039/501100002848` OR name :`'CONICYT, Programa de Formación de Capital Humano Avanzado'` | `award` | Link to CONICYT project | +| DOI: `10.13039/501100003448` | series of `4-9` digits in award | Link to GSRT project | +| DOI: `10.13039/501100010198` | `award` | Link to SGOV project | +| DOI: `10.13039/501100004564` | series of `4-9` digits in award | Link to MESTD project | +| DOI: `10.13039/501100003407` | `award` | Link to MIUR project. Since OpenAIRE has a small subset of MIUR projects, a link to the MIUR funder (`unidentified`
project) is also generated | +| DOI: `{10.13039/501100006588, 10.13039/501100004488}` | `award`, removing `'Project No'` and `'HRZZ'` prefix, if present | Link to HRZZ or MZOS project | +| DOI: `10.13039/501100006769` | `award` | Link to Russian Science Foundation project | +| DOI: `10.13039/501100001711` | `award` after `'_'` and before `'/'` | Link to SNSF project | +| DOI: `10.13039/501100004410` | `award` | Link to TUBITAK project | +| DOI: `10.13039/100004440` or name: `Wellcome Trust Masters Fellowship` | `award` | Link to Wellcome Trust specific project and to the `unidentified` project. | + +## 3. Intersect Crossref with UnpayWall by DOI + +The fields we consider from UnpayWall are: +* `is_oa` +* `best_oa_location` +* `oa_status` + +The results of Crossref that intersect by DOI with UnpayWall records are enriched with one additional `instance` with the following properties: + +| *OpenAIRE Result field path* | *Unpaywall field path* | *Notes* | +|----------------------------------------|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `instance` | | created only if `is_oa` and a `best_oa_location` is available | +| `instance.accessright` | | default value `Open Access`: we do not add instances if UnpayWall says there is no open version | +| `instance.accessright.code` | | Open Access code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.label` | | Always `OPEN` | +| `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. 
the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | +| `instance.accessright.openAccessRoute` | `oa_status` | | +| `instance.url` | `best_oa_location` | | +| `instance.license` | `best_oa_location.license` | | +| `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | +| `instance.pid.scheme` | | Default value: `doi` | +| `instance.pid.value` | `doi` | The doi is normalised and lower-cased | + +For the definition of UnpayWall's `oa_status` refer to the [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44001777288-what-do-the-types-of-oa-status-green-gold-hybrid-and-bronze-mean-) + +The record will also feature a relation to the UnpayWall data source: `name="UnpayWall"`, `id=openaire____::8ac8380272269217cb09a928c8caa993`. + +## 4. Intersect with ORCID + +The fields we consider from ORCID are: +* `doi` +* `authors`, a list of authors, each with optional `name`, `surname`, `creditName`, `oid` + +| *OpenAIRE field path* | *ORCID path* | *Notes* | +|-------------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------| +| `pid` | `doi` | | +| `author.name` | `capitalize(name)` | only mapped if not blank | +| `author.surname` | `capitalize(surname)` | only mapped if not blank | +| `author.fullname` | | if name and surname are not blank, they are concatenated (`capitalize(name) capitalize(surname)`), otherwise we use the `creditName` | +| `author.pid` | | only if the `ORCID` is available | +| `author.pid.id.scheme` | | Default `orcid` (meaning that it is confirmed by ORCID, in contrast to the `orcid_pending` set from Crossref and Unpaywall) | +| `author.pid.id.value` | `oid` | | +| `author.pid.provenance.provenance` | | Default `Harvested` | +| `author.pid.provenance.trust` | | Default `0.9` | + +The records are 
enriched with the ORCID identifiers of their authors. + +[//]: # (TODO: Update with the new approach implemented by Miriam.) + +The current approach is: +* if the number of authors from Crossref equals the number of authors from ORCID, then we pick the list of authors with more PIDs and try to enrich it with the PIDs from the other list, based on Jaro-Winkler distance on authors' names, surnames, or fullnames, depending on which properties are available; +* if the number of authors is different, then we take the longest list and try to enrich it with the PIDs from the other author list, based on Jaro-Winkler distance on authors' names, surnames, or fullnames, depending on which properties are available + +Miriam will modify the process to ensure that: +* the list of authors from Crossref always "win" +* the identifiers from ORCID "win" + +## 5. Intersect with Microsoft Academic Graph + +*Important Notes* +* Only papers with DOI are considered +* Since for the same DOI we have multiple versions of an item with different MAG PaperId, we only take one per DOI (the last one we process). We call this dataset `Papers_distinct` + +When mapping MAG records to the OpenAIRE Research Graph, we consider the following MAG tables: +* `PaperAbstractsInvertedIndex`: for the paper abstracts +* `Authors`: for the authors. The MAG data is pre-processed by grouping authors by PaperId +* `Affiliations` and `PaperAuthorAffiliations`: to generate links between publications and organisations +* `Journals` and `ConferenceInstances`: joined with `Papers_distinct` to have the information about the venues where the paper was published +* TO BE REMOVED `PaperUrls`: to create one instance for the OpenAIRE publication +* `FieldsOfStudy`: to add subjects + +The records are enriched with: +* abstracts +* MAG identifiers of authors +* affiliation relationships +* subjects (MAG FieldsOfStudy) +* conference or journal information (in the `journal` field) TODO: or `container`, in case of the dump? 
+* [TO BE REMOVED] instances with URL from MAG + +## 6. Enrich DOIBoost3 with hosting data sources (`hostedby`) and access right information + +In this phase, we intersect DOIBoost3 with a dataset composed of journals from OpenAIRE, Crossref, and the ISSN gold list. Each journal comes with its International Standard Serial Numbers (`issn`, `eissn`, `lissn`) and, when available, a flag that tells if the journal is open access. The intersection is done on the basis of the International Standard Serial Numbers. The records with a `journal.[l|e]issn` that match are enriched as follows: +* Each instance gains the `hostedby` information corresponding to the journal +* If the journal is open access, the access rights of the instances are also set to `Open Access` with `gold` route (because by construction, the journals we know are open are from DOAJ or Gold ISSN list) + +The hostedby of records that do not match is set to the `Unknown Repository`. diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md index e69de29..c0c6ac6 100644 --- a/docs/data-provision/aggregation/pubmed.md +++ b/docs/data-provision/aggregation/pubmed.md @@ -0,0 +1,61 @@ +# PubMed + +This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/). + +## Input + +The native data is collected from the [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) site. +It contains XML records compliant with the schema available at https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html. + +## Mapping + +The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. 
+ + +| *OpenAIRE Result field path* | PubMed record field xpath | Notes | +|--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Publication Mapping** | | | +| `id` | ?? | ?? | +| `pid` | `//PMID` | `classid = classname = pmid` | +| `publicationdate` | `//PubmedPubDate` | apply the function GraphCleaningFunctions.cleanDate before assign it | +| `maintitle` | `//Title` | | +| `description` | `//AbstractText` | | +| `language` | `//Language` | cleaning vocabulary -> dnet:languages | +| `subjects` | `//DescriptorName` | classId, className = keyword | +| **Author Mapping** | | | +| `author.surname` | `//Author/LastName` | | +| `author.name` | `//Author/ForeName` | | +| `author.fullname` | `//Author/FullName` | Concatenation of forename + lastName if exist | +| `author.rank` | FOR ALL AUTHORS | sequential number starting from 1 | +| **Journal Mapping** | | | +| `container.conferencedate` | `//Journal/PubDate` | map the date of the Journal | +| `container.name` | `//Journal/Title` | name of the journal | +| `container.vol` | `//Journal/Volume` | journal volume | +| `container.issPrinted` | `//Journal/ISSN` | ?? 
| +| `container.iss` | `//Journal/Issue` | The journal issue | +| **Instance Mapping** | | | +| `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article`, then we apply this type; otherwise we look for a term that matches the vocabulary and, if none is found, we discard it | +| `instance.pid` | `//PMID` | map the pmid in the pid in the instance | +| `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | +| `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | +| `instance.publicationdate` | `//PubmedPubDate` | | + + +| *OpenAIRE Relation field path* | PubMed record field xpath | Notes | +|--------------------------------|---------------------------|-------| +| | | | + +#TODO + +Missing item mapped + + + + + + + + + + + From 629fe0d51ab17d24e36ee71328345175c2a110d0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Oct 2022 12:10:53 +0200 Subject: [PATCH 06/25] WIP: pids and identifiers --- .../data-model/entities/entity-identifiers.md | 39 --- docs/data-model/entities/result.md | 2 +- docs/data-model/pids-and-identifiers.md | 74 +++++ .../aggregation/authoritative-datasources.md | 280 ------------------ sidebars.js | 5 +- 5 files changed, 79 insertions(+), 321 deletions(-) delete mode 100644 docs/data-model/entities/entity-identifiers.md create mode 100644 docs/data-model/pids-and-identifiers.md delete mode 100644 docs/data-provision/aggregation/authoritative-datasources.md diff --git a/docs/data-model/entities/entity-identifiers.md b/docs/data-model/entities/entity-identifiers.md deleted file mode 100644 index f6b86fa..0000000 --- a/docs/data-model/entities/entity-identifiers.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -sidebar_position: 8 ---- - -# OpenAIRE entity identifier and PID mapping policy - -OpenAIRE assigns internal identifiers for each object it collects. 
-By default, the internal identifier is generated as `sourcePrefix::md5(localId)` where: - -* `sourcePrefix` is a namespace prefix of 12 chars assigned to the data source at registration time -* `localid` is the identifier assigned to the object by the data source - -After years of operation, we can say that: - -* `localId` are unstable -* objects can disappear from sources -* PIDs provided by sources that are not PID agencies (authoritative sources for a specific type of PID) are often wrong (e.g. pre-print with the DOI of the published version, DOIs with typos) - -Therefore, when the record is collected from an authoritative source: - -* the identity of the record is forged using the PID, like `pidTypePrefix::md5(lowercase(doi))` -* the PID is added in a `pid` element of the data model - -When the record is collected from a source which is not authoritative for any type of PID: -* the identity of the record is forged as usual using the local identifier -* the PID, if available, is added as `alternateIdentifier` - -Currently, the following data sources are used as "PID authorities": - -| PID Type | Prefix (12 chars) | Authority | -|---------- |------------------- |--------------------------------------- | -| doi | `doi_________` | Crossref, Datacite, Zenodo | -| pmc | `pmc_________` | Europe PubMed Central, PubMed Central | -| pmid | `pmid________` | Europe PubMed Central, PubMed Central | -| arXiv | `arXiv_______` | arXiv.org e-Print Archive | -| handle | `handle______` | any repository | - -OpenAIRE also perform duplicate identification (see the [dedicated section for details](../../data-provision/deduplication/)). -All duplicates are **merged** together in a **representative record** which must be assigned a dedicated OpenAIRE identifier (i.e. it cannot have the identifier of one of the aggregated record). 
\ No newline at end of file diff --git a/docs/data-model/entities/result.md b/docs/data-model/entities/result.md index 2d62bc3..504b7ce 100644 --- a/docs/data-model/entities/result.md +++ b/docs/data-model/entities/result.md @@ -20,7 +20,7 @@ Moreover, there are the following sub-types of a `Result`, that inherit all its ### id _Type: String • Cardinality: ONE_ -Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers). +Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). ```json "id": "50|doi_dedup___::80f29c8c8ba18c46c88a285b7e739dc3" diff --git a/docs/data-model/pids-and-identifiers.md b/docs/data-model/pids-and-identifiers.md new file mode 100644 index 0000000..a6b0afd --- /dev/null +++ b/docs/data-model/pids-and-identifiers.md @@ -0,0 +1,74 @@ +# PIDs and identifiers + +One of the challenges towards the stability of the contents in the OpenAIRE Research Graph consists of making its identifiers and records stable over time. +The barriers to this scenario are many, as the Graph keeps a map of data sources that is subject to constant variations: records in repositories vary in content, +original IDs, and PIDs, may disappear or reappear, and the same holds for the repository or the metadata collection it exposes. +Moreover, the mappings applied to the original contents may also change and improve over time to catch up with the changes in the input records. + +## PID Authorities + +One of the fronts concerns the attribution of identity to the objects populating the graph. The basic idea is to build the identifiers of the objects in the graph from the PIDs available in some authoritative sources while considering all the other sources as by definition “unstable”. Examples of authoritative sources are Crossref and DataCite. Examples of non-authoritative ones are institutional repositories, aggregators, etc.
PIDs from the authoritative sources would form the stable OpenAIRE ID skeleton of the Graph, precisely because they are immutable by construction. + +Such a policy defines a list of data sources that are considered authoritative for a specific type of PID they provide, whose effect is twofold: +* OpenAIRE IDs depend on persistent IDs when they are provided by the authority responsible for creating them; +* PIDs are included in the graph according to a tight criterion: the PID Types declared in the table below are considered to be mapped as PIDs only when they are collected from the corresponding PID authority data source. + +| *PID Type* | *Authority* | +|------------|-----------------------------------------------------------------------------------------------------| +| doi | [Crossref](https://www.crossref.org), [Datacite](https://datacite.org) | +| pmc, pmid | [Europe PubMed Central](https://europepmc.org/), [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc) | +| arXiv | [arXiv.org e-Print Archive](https://arxiv.org/) | + +There is an exception though: Handle(s) are minted by several repositories; as listing them all would not be a viable option, to avoid losing them as PIDs, Handles bypass the PID authority filtering rule. +In all other cases, PIDs are included in the graph as alternate identifiers. + +## Delegated authorities + +When a record is aggregated from multiple sources considered authoritative for minting specific PIDs, different mappings could be applied to them and, depending on the case, +this could result in inconsistencies in the attribution of the field values. +To overcome the issue, the intuition is to include such records only once in the graph. To do so, the concept of "delegated authorities" defines a list of datasources that +assign PIDs to their scientific products from a given PID minter. + +This "selection" can be performed when the entities in the graph sharing the same identifier are grouped together.
The list of the delegated authorities currently includes: + +| *Datasource delegated* | *Datasource delegating* | *Pid Type* | +|--------------------------------------|----------------------------------|------------| +| [Zenodo](https://zenodo.org) | [Datacite](https://datacite.org) | doi | +| [RoHub](https://reliance.rohub.org/) | [W3ID](https://w3id.org/) | w3id | + + +## Identifiers in the Graph + +OpenAIRE assigns an internal identifier to each object it collects. +By default, the internal identifier is generated as `sourcePrefix::md5(localId)` where: + +* `sourcePrefix` is a namespace prefix of 12 chars assigned to the data source at registration time +* `localId` is the identifier assigned to the object by the data source + +After years of operation, we can say that: + +* `localId` values are generally unstable +* objects can disappear from sources +* PIDs provided by sources that are not PID agencies (authoritative sources for a specific type of PID) are often wrong (e.g. pre-print with the DOI of the published version, DOIs with typos) + +Therefore, when the record is collected from an authoritative source: + +* the identity of the record is forged using the PID, like `pidTypePrefix::md5(lowercase(doi))` +* the PID is added in a `pid` element of the data model + +When the record is collected from a source which is not authoritative for any type of PID: +* the identity of the record is forged as usual using the local identifier +* the PID, if available, is added as `alternateIdentifier` + +Currently, the following data sources are used as "PID authorities": + +| PID Type | Prefix (12 chars) | Authority | +|-----------|------------------------|-----------------------------------------| +| doi | `doi_________` | Crossref, Datacite, Zenodo | +| pmc | `pmc_________` | Europe PubMed Central, PubMed Central | +| pmid | `pmid________` | Europe PubMed Central, PubMed Central | +| arXiv | `arXiv_______` | arXiv.org e-Print Archive | +| handle | `handle______` | any repository
| + +OpenAIRE also performs duplicate identification (see the [dedicated section for details](../../data-provision/deduplication/)). +All duplicates are **merged** together in a **representative record** which must be assigned a dedicated OpenAIRE identifier (i.e. it cannot have the identifier of one of the aggregated records). diff --git a/docs/data-provision/aggregation/authoritative-datasources.md b/docs/data-provision/aggregation/authoritative-datasources.md deleted file mode 100644 index 1c16786..0000000 --- a/docs/data-provision/aggregation/authoritative-datasources.md +++ /dev/null @@ -1,280 +0,0 @@ ---- -sidebar_position: 1 ---- - -# Authoritative data sources - -One of the challenges towards the stability of the contents in the OpenAIRE Research Graph consists of making its identifiers and records stable over time. The barriers to this scenario are many, as the Graph keeps a map of data sources that is subject to constant variations: records in repositories vary in content, original IDs, and PIDs, may disappear or reappear, and the same holds for the repository or the metadata collection it exposes. Not only, but the mappings applied to the original contents may also change and improve over time to catch up with the changes in the input records. - -One of the fronts regards the attribution of the identity to the objects populating the graph. The basic idea is to build the identifiers of the objects in the graph from the PIDs available in some authoritative sources while considering all the other sources as by definition “unstable”. Examples of authoritative sources are Crossref and DataCite. Examples of non-authoritative ones are institutional repositories, aggregators, etc.
- -Such a policy defines a list of data sources that are considered authoritative for a specific type of PID they provide, whose effect is twofold: -* OpenAIRE IDs depend on persistent IDs when they are provided by the authority responsible to create them; -* PIDs are included in the graph according to a tight criterion: the PID Types declared in the table below are considered to be mapped as PIDs only when they are collected from the relative PID authority data source. - -| *PID Type* | *Authority* | -|---|------------------------------------------------------------------------------------| -| doi | [Crossref](https://www.crossref.org), [Datacite](https://datacite.org) | -| pmc, pmid | [Europe PubMed Central](https://europepmc.org/), [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc) | -| arXiv | [arXiv.org e-Print Archive](https://arxiv.org/) | - -There is an exception though: Handle(s) are minted by several repositories; as listing them all would not be a viable option, to avoid losing them as PIDs, Handles bypass the PID authority filtering rule. -In all other cases, PIDs are be included in the graph as alternate Identifiers. - -When a record is aggregated from multiple sources considered authoritative for minting specific PIDs, different mappings could be applied to them and, depending on the case, -this could result in inconsistencies in the attribution of the field values. -To overcome the issue, the intuition is to include such records only once in the graph. To do so, the concept of "delegated authorities" defines a list of datasources that -assigns PIDs to their scientific products from a given PID minter. - -This "selection" can be performed when the entities in the graph sharing the same identifier are grouped together. 
The list of the delegated authorities currently includes - -| *Datasource delegated* | *Datasource delegating* | *Pid Type* | -|------------------------------|---------------------------|-----| -| [Zenodo](https://zenodo.org) | [Datacite](https://datacite.org) | doi | -| [RoHub](https://reliance.rohub.org/) | [W3ID](https://w3id.org/) | w3id | - -## DOIBoost: Crossref, Unpaywall, Microsoft Academic Graph, ORCID - -DOIBoost is a dataset that combines research outputs and links among them from a selection of data sources. It enriches the records available on Crossref with what's available on Unpaywall, Microsoft Academic Graph, ORCID intersecting all those datasets by DOI. As consequence, DOIBoost does not contain any record from MAG, Unpaywall, or ORCID that doesn't provide a DOI available in Crossref. - -The idea behind DOIBoost and its origin can be found in the paper (and related resources) at: - -* La Bruzzo S., Manghi P., Mannocci A. (2019) OpenAIRE's DOIBoost - Boosting CrossRef for Research. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, doi:10.1007/978-3-030-11226-4_11 . Open Access version available at: [10.5281/zenodo.1441071](https://doi.org/10.5281/zenodo.1441071) - -Each Crossref record is enriched with: -* ORCID identifiers of authors from ORCID -* Open Access instance (with OA color/route and license) from Unpaywall -* the following information from MAG: - * abstracts - * MAG identifiers of authors - * affiliation (result - organization) relationships - * subjects (MAG FieldsOfStudy) - * conference or journal information - -The Open Access status is also set by intersecting the journal information of a record with the journal lists available from DOAJ and the Gold ISSN list. - -### Inputs - -* *Crossref*: dump available to Crossref subscribers via MetadataPlus service, updated once a month. 
-* *Microsoft Academic Graph*: downloaded version on 2021-02-15. We plan to take the latest version in Dec 2021 before MAG will be retired. -* *ORCID*: baseline dump obtained in 2020-10-13, regularly updated every week from the [ORCID public API](https://info.orcid.org/documentation/features/public-api). -* *Unpaywall*: public database snapshot downloaded in March 2021. Unpaywall updates it twice a year (https://unpaywall.org/products/snapshot) - -The construction of the DOIBoost dataset consists of the following phases: - -### 1 Filtering - -Records in Crossref are ruled out according to the following criteria - -* have blank title, examples: - * `10.1093/rheumatology/41.7.837` - * `10.1093/qjmed/95.7.430` - * `10.1371/journal.pone.0171434.g005` -* have one of the following publishers: `"Test accounts"`, `"CrossRef Test Account"` - * Examples from https://api.crossref.org/works?query.publisher-name=%22Test%20accounts%22 - * `10.1007/bf00344543` - * `10.1007/bf00186154` - * `10.1306/64ed947a-1724-11d7-8645000102c1865d` -* have no authors with valid names, where valid means: not blank and different from all strings in this list: `List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")` - * Examples for blank authors: - * `10.1108/00070709810247807` - * `10.1016/s1074-9098(02)00346-5` - * `10.1136/heart.88.1.6` - * Examples for `"none"` author from https://api.crossref.org/works?query.author=%22none%22 - * `10.4007/annals.2016.184.3.11` - * `10.4007/annals.2012.176.1.6` - * `10.2172/6393585` - * Examples for `"test"` author from https://api.crossref.org/works?query.author=%22test%22 - * `10.5116/ijme.54ca.a5ae` - * `10.5755/j01.ss.71.2.544` - * `10.5755/j01.ee.22.2.319` -* have `"Addie Jackson"` as author and `"Elsevier BV"` as publisher (empirically we say they are test records) - * Examples from https://api.crossref.org/works?query.author=Addie+Jackson&query.publisher-name=%22Elsevier%20BV%22 - * 
`10.2139/ssrn.2082156` - * `10.2139/ssrn.2202300` - * `10.2139/ssrn.2255657` -* have not one of the following values in the field `type` : `"book-section"`, `"book"`, `"book-chapter"`, `"book-part"`, `"book-series"`, `"book-set"`, `"book-track"`, `"edited-book"`, `"reference-book"`, `"monograph"`, `"journal-article"`, `"dissertation"`, `"other"`, `"peer-review"`, `"proceedings"`, `"proceedings-article"`, `"reference-entry"`, `"report"`, `"report-series"`, `"standard"`, `"standard-series"`, `"posted-content"`, `"dataset"`, - * Example: - * `10.1371/journal.pone.0171434.g005` - * `10.7554/elife.21052.049` - * `10.1371/journal.pcbi.1005379.s006` - -Records with `type=dataset` are mapped into OpenAIRE results of type dataset. All others are mapped as OpenAIRE results of type publication. - -### 2 Mapping Crossref properties into the OpenAIRE Research Graph - -Properties in OpenAIRE results are set based on the logic described in the following table: - -| OpenAIRE Result field path | Crossref path(s) | Notes | -|----------------------------------------|--------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `id` | `doi` | id in the form `doi_________::md5(doi)` | -| `dateofcollection` | `indexed.datetime` | | -| `lastupdatetimestamp` | `indexed.timestamp` | | -| `type` | `type` | `dataset` if the Crossref type is dataset, `publication` 
otherwise (based on the filtering logics described above) | -| `originalId` | `doi, clinical-trial-number, alternative-id` | | -| `pid` | | The scheme tells the type of PID, the value contains the actual value | -| `pid.scheme` | | Default value: doi | -| `pid.value` | `doi` | The doi is normalised and lower-cased | -| `maintitle` | `title` | | -| `subtitle` | `subtitle` | | -| `author` | `author` | if available the sequence is mapped to rank and the ORCID is also mapped | -| `author.name` | `author.given` | | -| `author.surname` | `author.family` | | -| `author.fullname` | `author.given author.family` | | -| `author.rank` | | based on the order, starts from 1 | -| `author.pid` | | only if the ORCID is available | -| `author.pid.id.scheme` | | Default `'pending_orcid'` (meaning that it is not an id confirmed by ORCID) | -| `author.pid.id.value` | `author.ORCID` | | -| `author.pid.provenance.provenance` | | Default 'Harvested' | -| `author.pid.provenance.trust` | | Default '0.9' | -| `description` | `abstract` | | -| `subject` | `subject` | with `classid='keywords'`, i.e. no controlled vocabularies for Crossref subjects | -| `publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | -| `publisher` | `publisher` | | -| `source` | `source` | only if the record is not of type `book` | -| `source` | concatenation of `container-title.head` + `"ISBN: "` + `ISBN.head` | only if the record is of type @book@ | -| `container` | | It is set only for publications with information about the journal it was published in. 
| -| `container.name` | `container-title.head` | | -| `container.issnOnline` | `issn-type.value` | if `issn-type.type='electronic'` | -| `container.issnPrinted` | `issn-type.value` | if `issn-type.type='print'` | -| `container.vol` | `volume` | | -| `container.sp` | `page` | before `'-'` | -| `container.ep` | `page` | after `'-'` | -| `instance` | | One instance is created with the DOI URL | -| `instance.accessright` | | Values in `instance.accessright.code` and `instance.accessright.label` are set based on license and dateofacceptance:
- `UNKNOWN`: if the license is blank
- `OPEN ACCESS`: if the license is a CC license or an ACS license or an APA license (considered OPEN also by Unpaywall, see [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44002063718-what-is-an-oa-license-) for details) or if OUP license, but only after 12 months from the publication date
- `EMBARGO`: OUP license, before 12 months from the publication date
- `CLOSED`: if there is a license not covered by the previous cases | -| `instance.accessright.code` | | Code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | - | `instance.accessright.label` | | One of: `OPEN`, `RESTRICTED`, `CLOSED`, `EMBARGO` | - | `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | - | `instance.accessright.openAccessRoute` | | only if `instance.accessright.value = 'OPEN ACCESS'`. Default is `hybrid`. The route is fixed in subsequent phases of DOIBoost, namely when intersecting with Unpaywall and patching the hostedby via DOAJ and the Gold-ISSN list. | - | `instance.license` | `license.URL ` | If there is a `license.content-version='vor'`, then this is used. Otherwise the first license entry is used. | - | `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | - | `instance.pid.scheme` | | Default value: `doi` | - | `instance.pid.value` | `doi` | The doi is normalised and lower-cased | - | `instance.publicationdate` | `issued.datetime` or, if not available, `created.datetime` | | - | `instance.refereed` | | set to `peerReviewed` only if `relation.has-review.id` is not empty, `UNKNOWN` otherwise. | - | `instance.type` | `subtype` | mapped using the [OpenAIRE vocabulary for result typologies](https://api.openaire.eu/vocabularies/dnet:result_typologies) | - | `instance.url` | `doi` | Full URL of the DOI | - -All other fields of the Json schema not mentioned in the table contain empty values. - -All the records from Crossref are related to the datasource with `name=Crossref` and `id=openaire____::081b82f96300b6a6e3d282bad31cb6e2` - -Possible improvements: -* map `clinical-trial-number` and `alternative-id` in `alternateIdentifiers`? 
-* Verify if Crossref has a property for `language`, `country`, `container.issnLinking`, `container.iss`, `container.edition`, `container.conferenceplace` and `container.conferencedate` -* Different approach to set the `refereed` field and improve its coverage? - -h3. 2 Map Crossref links to projects/funders - -Links to funding available in Crossref are mapped as funding relationships (`result -- isProducedBy --> project`) applying the following mapping: - -| *funder* | *grant code* | *Link to* | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|---| -| DOI: `{10.13039/100010663, 10.13039/100010661, 10.13039/501100007601, 10.13039/501100000780, 10.13039/100010665}` or name: `'European Union’s Horizon 2020 research and innovation program'` | series of `4-9` digits in `award` | Link to H2020 project | -| DOI: `{10.13039/100011199, 10.13039/100004431, 10.13039/501100004963, 10.13039/501100000780}` | series of `4-9` digits in `award` | Link to FP7 project | -| DOI: `10.13039/501100000781` OR name: `'European Union's'` | series of `4-9` digits in `award` | Link to FP7 or H2020 project | -| DOI: `10.13039/100000001` | `award` | Link to NSF project | -| DOI: `10.13039/501100001665` OR name: `{'The French National Research Agency (ANR)', 'The French National Research Agency'}` | `award` | Link to ANR project | -| DOI: `10.13039/501100002341` | `award` | Link to Academy of Finland project | -| DOI: `10.13039/501100001602` | `award`, removing the initial 'SFI' if present | Link to SFI project | -| DOI: `10.13039/501100000923` | `award` | Link to ARC project | -| DOI: `10.13039/501100000038` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to NSERC (@unidentified@ project) | -| DOI: `10.13039/501100000155` | `award` ignore: we cannot map the project codes in Crossref to project codes in 
OpenAIRE | Link to SSHRC (@unidentified@ project) | -| DOI: `10.13039/501100000024` | `award` ignore: we cannot map the project codes in Crossref to project codes in OpenAIRE | Link to CIHR (@unidentified@ project) | -| DOI: `10.13039/501100002848` OR name :`'CONICYT, Programa de Formación de Capital Humano Avanzado'` | `award` | Link to CONICYT project | -| DOI: `10.13039/501100003448` | series of `4-9` digits in award | Link to GSRT project | -| DOI: `10.13039/501100010198` | `award` | Link to SGOV project | -| DOI: `10.13039/501100004564` | series of `4-9` digits in award | Link to MESTD project | -| DOI: `10.13039/501100003407` | `award` | Link to MIUR project. Since OpenAIRE has a small subset of MIUR projects, a link to the MIUR funder (@unidentified@ project) is also generated | -| DOI: `{10.13039/501100006588, 10.13039/501100004488}` | `award`, removing `'Project No'` and `'HRZZ'` prefix, if present | Link to HRZZ or MZOS project | -| DOI: `10.13039/501100006769` | `award` | Link to Russian Science Foundation project | -| DOI: `10.13039/501100001711` | `award` after `'_'` and before `'/'` | Link to SNSF project | -| DOI: `10.13039/501100004410` | `award` | Link to TUBITAK project | -| DOI: `10.10.13039/100004440` or name: `Wellcome Trust Masters Fellowship` | `award` | Link to Wellcome Trust specific project and to the `unidentified` project.| - -### 3 Intersect Crossref with UnpayWall by DOI (DOIBoost1) - -The fields we consider from UnpayWall are: -* `is_oa` -* `best_oa_location` -* `oa_status` - -The results of Crossref that intersect by DOI with UnpayWall records are enriched with one additional `instance` with the following properties: - -| *OpenAIRE Result field path* | *Unpaywall field path* | *Notes* | -|---|---|------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `instance` | | created only if @is_oa@ and a `best_oa_location` is 
available | -| `instance.accessright` | | default value `Open Access`: we do not add instances if UnpayWall says there is no open version | -| `instance.accessright.code` | | Open Access code from the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | -| `instance.accessright.label` | | Always `OPEN` | -| `instance.accessright.scheme` | | Scheme that defines the code and label, i.e. the URL to the [COAR vocabulary for access right](http://vocabularies.coar-repositories.org/documentation/access_rights/) | -| `instance.accessright.openAccessRoute` | `oa_status` | | -| `instance.url` | `best_oa_location` | | -| `instance.license` | `best_oa_location.license` | | -| `instance.pid` | | The scheme tells the type of PID, the value contains the actual value | -| `instance.pid.scheme` | | Default value: `doi` | -| `instance.pid.value` | `doi` | The doi is normalised and lower-cased | - -For the definition of UnpayWall's @oa_status@ refer to the [Unpaywall FAQ](https://support.unpaywall.org/support/solutions/articles/44001777288-what-do-the-types-of-oa-status-green-gold-hybrid-and-bronze-mean-) - -The record will also feature a relation to the UnpayWall data source: `name="UnpayWall"`, `id=openaire____::8ac8380272269217cb09a928c8caa993`. 
- -### 4 Intersect DOIBoost1 with ORCID (DOIBoost2) - -The fields we consider from ORCID are: -* `doi` -* `authors`, a list of authors, each with optional `name`, `surname`, `creditName`, `oid` - -| *OpenAIRE field path* | *ORCID path* | *Notes* | -|-------------------------------------|---|-------------------------------------------------------------------------------------------------------------------------------------| -| `pid` | `doi` | | -| `author.name` | `capitalize(name)` | only mapped if not blank | -| `author.surname` | `capitalize(surname)` | only mapped if not blank | -| `author.fullname` | | if name and surname are not blank, they are concatenated (`capitalize(name) capitalize(surname)`), otherwise we use the `creditName` | -| `author.pid` | | only if the `ORCID` is available | -| `author.pid.id.scheme` | | Default `orcid` (meaning that it is confirmed by ORCID, (in contrast to the `orcid_pending` set from Crossref and Unpaywall) | -| `author.pid.id.value` | `oid` | | -| `author.pid.provenance.provenance` | | Default `Harvested` | -| `author.pid.provenance.trust` | | Default `0.9` | - -The records are enriched with the ORCID identifiers of their authors. - -[//]: # (TODO: Update with the new approach implemented by Miriam.) 
- -The current approach is: -* if the number of authors from Crossref equals the size of authors from ORCID, then we pick the list of authors with more PIDs and try to enrich it with the PIDs from the other list, based on JaroWrinkler distance on authors' names, surnames, or fullnames, depending on which properties are available; -* if the number of authors are different, then we take the longest and try to enrich it with the PIDs from the other author list, based on JaroWrinkler distance on authors' names, surnames, or fullnames, depending on which properties are available - -Miriam will modify the process to ensure that: -* the list of authors from Crossred always "win" -* the identifiers from ORCID "win" - -### 5 Intersect DOIBoost2 with Microsoft Academic Graph (DOIBoost3) - -*Important Notes* -* Only papers with DOI are considered -* Since for the same DOI we have multiple version of item with different MAG PaperId, we only take one per DOI (the last one we process). We call this dataset @Papers_distinct@ - -When mapping MAG records to the OpenAIRE Research Graph, we consider the following MAG tables: -* `PaperAbstractsInvertedIndex`: for the paper abstracts -* `Authors`: for the authors. The MAG data is pre-processed by grouping authors by PaperId -* `Affiliations` and `PaperAuthorAffiliations`: to generate links between publications and organisations -* `Journals` and `ConferenceInstances`: joined with @Papers_distinct@ to have the information about the venues where the paper was published -* TO BE REMOVED `PaperUrls`: to create one instance for the OpenAIRE publication -* `FieldsOfStudy`: to add subjects - -The records are enriched with: -* abstracts -* MAG identifiers of authors -* affiliation relationships -* subjects (MAG FieldsOfStudy) -* conference or journal information (in the @journal@ field) TODO: or @container@, in case of the dump? 
-* [TO BE REMOVED] instances with URL from MAG - -### 6 Enrich DOIBoost3 with hosting data sources (`hostedby`) and access right information - -In this phase, we intersect DOIBoost3 with a dataset composed of journals from OpenAIRE, Crossref, and the ISSN gold list. Each journal comes with its International Standard Serial Numbers (`issn`, `eissn`, `lissn`) and, when available, a flag that tells if the journal is open access. The intersection is done on the basis of the International Standard Serial Numbers. The records with a `journal.[l|e]issn` that match are enriched as follows: -* Each instance gain the `hostedby` information corresponding to the journal -* If the journal is open access, the access rights of the instances are also set to `Open Access` with `gold` route (because by construction, the journals we know are open are from DOAJ or Gold ISSN list) - -The hostedby of records that do not match are set to the `Unknown Repository`. diff --git a/sidebars.js b/sidebars.js index a01c2b7..e7501f1 100644 --- a/sidebars.js +++ b/sidebars.js @@ -23,6 +23,7 @@ const sidebars = { label: "Data model", link: {type: 'doc', id: 'data-model/data-model'}, items: [ + { type: 'doc', id: 'data-model/pids-and-identifiers' }, { type: 'category', label: "Entities", @@ -63,7 +64,9 @@ const sidebars = { label: "Aggregation", link: {type: 'doc', id: 'data-provision/aggregation/aggregation'}, items: [ - { type: 'doc', id: 'data-provision/aggregation/authoritative-datasources' } + { type: 'doc', id: 'data-provision/aggregation/doiboost' }, + { type: 'doc', id: 'data-provision/aggregation/pubmed' }, + { type: 'doc', id: 'data-provision/aggregation/datacite' } ] }, { From f0cbce56c500b21aedae49cb4e8dd21005e7b55f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Oct 2022 13:57:46 +0200 Subject: [PATCH 07/25] WIP: fixed internal links --- docs/data-model/entities/community.md | 2 +- docs/data-model/entities/data-source.md | 2 +- docs/data-model/entities/organization.md | 2 +- 
docs/data-model/entities/other.md | 2 +- docs/data-model/entities/project.md | 2 +- docs/data-model/entities/result.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/data-model/entities/community.md b/docs/data-model/entities/community.md index dfbf24a..32d666b 100644 --- a/docs/data-model/entities/community.md +++ b/docs/data-model/entities/community.md @@ -16,7 +16,7 @@ For example, the organizations supporting a research infrastructure fall in the ### id _Type: String • Cardinality: ONE_ -The OpenAIRE id for the community/research infrastructure, created according to the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers). +The OpenAIRE id for the community/research infrastructure, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). ```json "id": "00|context_____::5b7f9fa40bdc12072249204cedfa7808" diff --git a/docs/data-model/entities/data-source.md b/docs/data-model/entities/data-source.md index c62ff16..9b68231 100644 --- a/docs/data-model/entities/data-source.md +++ b/docs/data-model/entities/data-source.md @@ -15,7 +15,7 @@ For example, a metadata record about a project carries information for the creat ### id _Type: String • Cardinality: ONE_ -The OpenAIRE id of the data source, created according to the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers). +The OpenAIRE id of the data source, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). 
```json "id": "10|issn___print::22c514d022b199c346e7f29ca06efc95" diff --git a/docs/data-model/entities/organization.md b/docs/data-model/entities/organization.md index 47f224a..61e54eb 100644 --- a/docs/data-model/entities/organization.md +++ b/docs/data-model/entities/organization.md @@ -14,7 +14,7 @@ Organizations include companies, research centers or institutions involved as pr ### id _Type: String • Cardinality: ONE_ -The OpenAIRE id for the organization, created according to the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers). +The OpenAIRE id for the organization, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). ```json "id": "20|openorgs____::b84450f9864182c67b8611b5593f4250" diff --git a/docs/data-model/entities/other.md b/docs/data-model/entities/other.md index a0f6c0b..a14ca5e 100644 --- a/docs/data-model/entities/other.md +++ b/docs/data-model/entities/other.md @@ -560,7 +560,7 @@ The measures computed for this instance (e.g. those provided by [BIP! Finder](ht ### pid _Type: [ResultPid](#resultpid) • Cardinality: MANY_ -The set of persistent identifiers associated to this instance that have been collected from an authority for the pid type (i.e. Crossref/Datacite for doi). See the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers) for more information. +The set of persistent identifiers associated to this instance that have been collected from an authority for the pid type (i.e. Crossref/Datacite for doi). See the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers) for more information. ```json "pid": [ diff --git a/docs/data-model/entities/project.md b/docs/data-model/entities/project.md index cc984ff..a03ee7c 100644 --- a/docs/data-model/entities/project.md +++ b/docs/data-model/entities/project.md @@ -13,7 +13,7 @@ Of crucial interest to OpenAIRE is also the identification of the funders (e.g. 
### id _Type: String • Cardinality: ONE_ -Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers). +Main entity identifier, created according to the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers). ```json "id": "40|corda__h2020::70ea22400fd890c5033cb31642c4ae68" diff --git a/docs/data-model/entities/result.md b/docs/data-model/entities/result.md index 504b7ce..be2de82 100644 --- a/docs/data-model/entities/result.md +++ b/docs/data-model/entities/result.md @@ -258,7 +258,7 @@ Timestamp of last update of the record in OpenAIRE. ### pid _Type: [ResultPid](other#resultpid) • Cardinality: MANY_ -Persistent identifiers of the result. See also the [OpenAIRE entity identifier and PID mapping policy](entity-identifiers) to learn more. +Persistent identifiers of the result. See also the [OpenAIRE entity identifier and PID mapping policy](../pids-and-identifiers) to learn more. ```json "pid": [ From fe15c483dadf1a76565f40783020247d8eef4540 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 6 Oct 2022 16:25:08 +0200 Subject: [PATCH 08/25] added d a part of Datacite documentation --- docs/data-provision/aggregation/datacite.md | 43 ++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index f6cd37e..2ff0dbb 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -1 +1,42 @@ -# Datacite \ No newline at end of file +# Datacite +This section describes the aggregation workflow of Datacite and the mapping implemented for it. + +## Datacite datasource +[Datacite](https://datacite.org/index.html) is a leading global non-profit organisation that provides persistent identifiers (DOIs) for research data and other research outputs. 
+ +## Datacite API +The [DataCite REST API](https://support.datacite.org/docs/api) allows users to retrieve, query, and browse DataCite DOI metadata records. In particular, it exposes a method for incremental harvesting new datacite records. + +``` +https://api.datacite.org/dois?page[cursor]=$CURSOR&page[size]=$NUMBER_OF_ITEM_PER_PAGE&query=updated:[$FROM_DATE_TIMESAMP TO $TO_DATE_TIMESAMP] +``` + +On this API Request, we introduce some variables: +- **CURSOR**: The value of the cursor to iterate the pages +- **NUMBER_OF_ITEM_PER_PAGE**: (max 1000) defines how many records we can download for each page. +- **FROM_DATE_TIMESAMP, TO_DATE_TIMESAMP** interval timestamp of the updated record + + +Each record contains two pieces of information needed for incremental harvesting: +- **isActive**: tell if the record is deleted (isActive:false) +- **updated**: timestamp of last update + + +## Collection Workflow + +The collection workflow is responsible for aggregating new datacite records. Each record is stored on a table called Native Datacite Store with the following schema: +- **DOI**: The DOI PID of the datacite record (It is a primary key) +- **update_timestamp**: the last update date timestamp +- **json**: the native record JSON + +During the collection workflow, we identify the most updated record date, and the collection phase downloads all new datacite records and update the existing one through the API using this date as **FROM_DATE_TIMESAMP** variable. + + +## Datacite Mapping +The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. + + +| OpenAIRE Result field path | Datacite record JSON path | # Notes | +|------------------------------------|-------------------------------|-------------------| +| `id` | `\attributes\doi`|the identifier will be created by folloing the openaire PID generation policy | +| `instance`
`instance.type` | `\attributes\types\resourceType` `\attributes\types\resourceTypeGeneral` `attributes\types\schemaOrg` | Use the vocabulary _dnet:publication_resource_ to find a synonym to one of these terms and get the `instance.type`. Using the **dnet:result_typologies** vocabulary to find the `instance.type` synonym we can get one of the main entity:
`publication`
`dataset`
`software`
`otherresearchproduct` | From cbfd719b777d7f98750ba3b0d0f325da19851347 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 11 Oct 2022 11:55:04 +0200 Subject: [PATCH 09/25] added d a part of Datacite documentation --- docs/data-provision/aggregation/datacite.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 2ff0dbb..125fe01 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -18,7 +18,7 @@ On this API Request, we introduce some variables: Each record contains two pieces of information needed for incremental harvesting: -- **isActive**: tell if the record is deleted (isActive:false) +- **isActive**: tells if the record is deleted (`isActive:false`) - **updated**: timestamp of last update @@ -39,4 +39,8 @@ The table below describes the mapping from the XML baseline records to the OpenA | OpenAIRE Result field path | Datacite record JSON path | # Notes | |------------------------------------|-------------------------------|-------------------| | `id` | `\attributes\doi`|the identifier will be created by folloing the openaire PID generation policy | -| `instance`
`instance.type` | `\attributes\types\resourceType` `\attributes\types\resourceTypeGeneral` `attributes\types\schemaOrg` | Use the vocabulary _dnet:publication_resource_ to find a synonym to one of these terms and get the `instance.type`. Using the **dnet:result_typologies** vocabulary to find the `instance.type` synonym we can get one of the main entity:
`publication`
`dataset`
`software`
`otherresearchproduct` | +|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +| `pid` | `\attributes\doi` | `scheme = doi` | +| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | +| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield| + From ae41daf81d427bd357061784c464cfafff81d8fe Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 12 Oct 2022 12:16:35 +0200 Subject: [PATCH 10/25] completed Documentation of Datacite --- docs/data-provision/aggregation/datacite.md | 38 ++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 125fe01..3ca23e7 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -41,6 +41,42 @@ The table below describes the mapping from the XML baseline records to the OpenA | `id` | `\attributes\doi`|the identifier will be created by folloing the openaire PID generation policy | |
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| | `pid` | `\attributes\doi` | `scheme = doi` | +| `originalid` | `\attributes\doi` | | | `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | -| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield| +| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. **If the record has no Creator it will be skipped**| +| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | +| `author.rank` | | Incremental index starting from 1 | +| `author.name` | `\attributes\creators\givenName` | | +| `author.surname` | `\attributes\creators\familyName` | | +| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | +| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | +| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | +| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | +| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | +| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | +|`publicationdate` | `\attributes\dates` | where `dateType` is **issued** | +|`publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | +|`embargoenddate` | `\attributes\dates` | where `dateType` is **available** | +| `subjects` | `\attributes\subject` | `scheme=keywords` | +| `description` | `\attributes\descriptions` | | +| `publisher` | 
`\attributes\publisher` | | +| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` | +| `publisher` | `\attributes\publisher` | | +| `instance.license` | `\attributes\rightsList` | if right value starts with http and matches a particular regex | +| `instance.accessright` | `\attributes\rightsList` |
  • if not present :`unknown`
  • if datasource is _figshare_:`open`
  • If `embargo_date < today()`: _OPEN_
| + + +### Mapping Relation + + +| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | +|-------------------------------------------|-------------------------------|-------------------------------|---------| +| `isProducedBy` |`attributes\fundingReferences` | `Result/Project`| we must identifi if match this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)`| +| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite| +| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` |we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | + + +### Relation Resolution + + From ceb8a070b50ae53a965aa4e5e19671038bb7ea25 Mon Sep 17 00:00:00 2001 From: Paolo Manghi Date: Wed, 12 Oct 2022 12:21:14 +0200 Subject: [PATCH 11/25] Update 'docs/publications.md' --- docs/publications.md | 63 +++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/docs/publications.md b/docs/publications.md index 77fcbd4..a267e8e 100644 --- a/docs/publications.md +++ b/docs/publications.md @@ -4,54 +4,69 @@ sidebar_position: 7 # How to cite -If you use one of the [OpenAIRE Research Graph dumps](https://zenodo.org/record/6616871), please cite it following the recommendation that you find on the Zenodo page. +Open Science services are open and transparent and survive thanks to your active support and to the visibility and reward they gather. If you use one of the [OpenAIRE Research Graph dumps](https://zenodo.org/record/6616871) for your research, please provide a proper citation following the recommendation that you find on the dump's Zenodo page. -## Other relevant publications +## Relevant research products ### Aggregation system -Manghi, P., Artini, M., Atzori, C., Bardi, A., Mannocci, A., La Bruzzo, S., Candela, L., Castelli, D. and Pagano, P. 
(2014), “The D-NET software toolkit: A framework for the realization, maintenance, and operation of aggregative infrastructures”, Program: electronic library and information systems, Vol. 48 No. 4, pp. 322-354. -Michele Artini, Claudio Atzori, Alessia Bardi, Sandro La Bruzzo, Paolo Manghi, & Andrea Mannocci. (2016, November 24). The D-NET software toolkit: dnet-basic-aggregator (Version 1.3.0). Zenodo. -Atzori, C., Bardi, A., Manghi, P., & Mannocci, A. (2017, January). The OpenAIRE workflows for data management. In Italian Research Conference on Digital Libraries (pp. 95-107). Springer, Cham. +Manghi, P., Artini, M., Atzori, C., Bardi, A., Mannocci, A., La Bruzzo, S., Candela, L., Castelli, D. and Pagano, P. (2014), “The D-NET software toolkit: A framework for the realization, maintenance, and operation of aggregative infrastructures”, Program: electronic library and information systems, Vol. 48 No. 4, pp. 322-354. [doi:10.1108/prog-08-2013-0045](http://doi.org/10.1108/prog-08-2013-0045) -Mannocci, A., & Manghi, P. (2016, September). DataQ: a data flow quality monitoring system for aggregative data infrastructures. In International Conference on Theory and Practice of Digital Libraries (pp. 357-369). Springer, Cham. +Atzori, C., Bardi, A., Manghi, P., & Mannocci, A. (2017, January). "The OpenAIRE workflows for data management". In Italian Research Conference on Digital Libraries (pp. 95-107). Springer, Cham. [doi:10.1007/978-3-319-68130-6_8](https://doi.org/10.1007/978-3-319-68130-6_8) + +*Software* Michele Artini, Claudio Atzori, Alessia Bardi, Sandro La Bruzzo, Paolo Manghi, & Andrea Mannocci. (2016, November 24). "The D-NET software toolkit: dnet-basic-aggregator (Version 1.3.0)". Zenodo. [doi:10.5281/zenodo.168356](https://doi.org/10.5281/zenodo.168356) + +Mannocci, A., & Manghi, P. (2016, September). "DataQ: a data flow quality monitoring system for aggregative data infrastructures". In International Conference on Theory and Practice of Digital Libraries (pp. 
357-369). Springer, Cham. [doi:10.1007/978-3-319-43997-6_28](https://doi.org/10.1007/978-3-319-43997-6_28) ### Deduplication -Claudio Atzori, & Paolo Manghi. (2017, February 17). gdup: a big graph entity deduplication system (Version 4.0.5). Zenodo. https://code-repo.d4science.org/D-Net/dnet-dedup/releases -Manghi, Paolo, Marko Mikulicic, and Claudio Atzori. "De-duplication of aggregation authority files." International Journal of Metadata, Semantics and Ontologies 7.2 (2012): 114-130. +Vichos K., De Bonis M., Kanellos I., Chatzopoulos S., Atzori C., Manola N., Manghi P., Vergoulis T. (Feb. 2022), "A preliminary assessment of the article deduplication algorithm used for the OpenAIRE Research Graph". IRCDL 2022 - 18th Italian Research Conference on Digital Libraries, Padua, Italy. CEUR-WS Proceedings. [http://ceur-ws.org/Vol-3160](http://ceur-ws.org/Vol-3160/) -Manghi, P., Atzori, C., De Bonis, M., & Bardi, A. (2020). Entity deduplication in big data graphs for scholarly communication. Data Technologies and Applications. -Manghi, P., & Mikulicic, M. (2011, October). PACE: A general-purpose tool for authority control. In Research Conference on Metadata and Semantic Research (pp. 80-92). Springer, Berlin, Heidelberg. +De Bonis, M., Manghi, P., & Atzori, C. (2022). "FDup: a framework for general-purpose and efficient entity deduplication of record collections". PeerJ Computer Science, 8, e1058. [https://peerj.com/articles/cs-1058](https://peerj.com/articles/cs-1058) -Atzori, C., Manghi, P., & Bardi, A. (2018, December). GDup: de-duplication of scholarly communication big graphs. In 2018 IEEE/ACM 5th International Conference on Big Data Computing Applications and Technologies (BDCAT) (pp. 142-151). IEEE. -Atzori, Claudio. "GDup: an Integrated, Scalable Big Graph Deduplication System." (2016). +Manghi, P., Atzori, C., De Bonis, M., & Bardi, A. (2020). "Entity deduplication in big data graphs for scholarly communication". Data Technologies and Applications. 
[doi:10.1108/dta-09-2019-0163](https://doi.org/10.1108/dta-09-2019-0163) + + +Atzori, C., Manghi, P., & Bardi, A. (2018, December). "GDup: de-duplication of scholarly communication big graphs". In 2018 IEEE/ACM 5th International Conference on Big Data Computing Applications and Technologies (BDCAT) (pp. 142-151). IEEE. [doi:10.1109/bdcat.2018.00025](https://doi.org/10.1109/bdcat.2018.00025) + +*Software* Claudio Atzori, & Paolo Manghi. (2017, February 17). "GDup: a big graph entity deduplication system" (Version 4.0.5). Zenodo. [doi:10.5281/zenodo.292980](https://doi.org/10.5281/zenodo.292980) + +Atzori, Claudio. "GDup: an Integrated, Scalable Big Graph Deduplication System." (2016). [doi:10.5281/zenodo.1454879](https://doi.org/10.5281/zenodo.1454879) + +Manghi, Paolo, Marko Mikulicic, and Claudio Atzori. "De-duplication of aggregation authority files." International Journal of Metadata, Semantics and Ontologies 7.2 (2012): 114-130. [doi:10.1504/ijmso.2012.050014](https://doi.org/10.1504/ijmso.2012.050014) + +Manghi, P., & Mikulicic, M. (2011, October). "PACE: A general-purpose tool for authority control". In Research Conference on Metadata and Semantic Research (pp. 80-92). Springer, Berlin, Heidelberg. [doi:10.1007/978-3-642-24731-6_8](https://doi.org/10.1007/978-3-642-24731-6_8) ### Mining -M. Kobos, Ł. Bolikowski, M. Horst, P. Manghi, N. Manola, J. Schirrwagen, “Information inference in scholarly communication infrastructures: the OpenAIREplus project experience”, Procedia Computer Science 38, 92-99. +Giannakopoulos T., Foufoulas Y., Dimitropoulos H., Manola N. (2019) “Interactive Text Analysis and Information Extraction”. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, Cham. [doi:10.1007/978-3-030-11226-4_27](https://doi.org/10.1007/978-3-030-11226-4_27) -Tkaczyk, D., Szostek, P., Fedoryszak, M. et al. 
CERMINE: automatic extraction of structured metadata from scientific literature. IJDAR 18, 317–335 (2015). -Giannakopoulos T., Foufoulas Y., Dimitropoulos H., Manola N. (2019) “Interactive Text Analysis and Information Extraction”. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, Cham. +Foufoulas Y., Stamatogiannakis L., Dimitropoulos H., Ioannidis Y. (2017) “High-Pass Text Filtering for Citation Matching”. In: Kamps J., Tsakonas G., Manolopoulos Y., Iliadis L., Karydis I. (eds) Research and Advanced Technology for Digital Libraries. TPDL 2017. Lecture Notes in Computer Science, vol 10450. Springer, Cham. [doi:10.1007/978-3-319-67008-9_28](https://doi.org/10.1007/978-3-319-67008-9_28) -Foufoulas Y., Stamatogiannakis L., Dimitropoulos H., Ioannidis Y. (2017) “High-Pass Text Filtering for Citation Matching”. In: Kamps J., Tsakonas G., Manolopoulos Y., Iliadis L., Karydis I. (eds) Research and Advanced Technology for Digital Libraries. TPDL 2017. Lecture Notes in Computer Science, vol 10450. Springer, Cham. +Y. Chronis, Y. Foufoulas, V. Nikolopoulos, A. Papadopoulos, L. Stamatogiannakis, C. Svingos, Y. E. Ioannidis, "A Relational Approach to Complex Dataflows", in Workshop Proceedings of the EDBT/ICDT 2016 (MEDAL 2016) Joint Conference (March 15, 2016, Bordeaux, France) on CEUR-WS.org (ISSN 1613-0073) [http://ceur-ws.org/Vol-1558/paper45.pdf](http://ceur-ws.org/Vol-1558/paper45.pdf) -T. Giannakopoulos, I. Foufoulas, E. Stamatogiannakis, H. Dimitropoulos, N. Manola, and Y. Ioannidis. 2015. “Visual-Based Classification of Figures from Scientific Literature”. In Proceedings of the 24th International Conference on World Wide Web (WWW '15 Companion). Association for Computing Machinery, New York, NY, USA, 1059–1060. +T. Giannakopoulos, I. Foufoulas, E. Stamatogiannakis, H. Dimitropoulos, N. Manola, and Y. Ioannidis. 2015. 
“Visual-Based Classification of Figures from Scientific Literature”. In Proceedings of the 24th International Conference on World Wide Web (WWW '15 Companion). Association for Computing Machinery, New York, NY, USA, 1059–1060. [doi:10.1145/2740908.2742024](https://doi.org/10.1145/2740908.2742024) -Giannakopoulos, T., Foufoulas, I., Stamatogiannakis, E., Dimitropoulos, H., Manola, N., & Ioannidis, Y. (2014). “Discovering and Visualizing Interdisciplinary Content Classes in Scientific Publications”. D-Lib Mag., Volume 20, Number 11/12. +Giannakopoulos, T., Foufoulas, I., Stamatogiannakis, E., Dimitropoulos, H., Manola, N., & Ioannidis, Y. (2014). “Discovering and Visualizing Interdisciplinary Content Classes in Scientific Publications”. D-Lib Mag., Volume 20, Number 11/12. [doi:10.1045/november14-giannakopoulos](https://doi.org/10.1045/november14-giannakopoulos) -Giannakopoulos T., Stamatogiannakis E., Foufoulas I., Dimitropoulos H., Manola N., Ioannidis Y. (2014) “Content Visualization of Scientific Corpora Using an Extensible Relational Database Implementation”. In: Bolikowski Ł., Casarosa V., Goodale P., Houssos N., Manghi P., Schirrwagen J. (eds) Theory and Practice of Digital Libraries -- TPDL 2013 Selected Workshops. TPDL 2013. Communications in Computer and Information Science, vol 416. Springer, Cham. Also in: Google Books +Giannakopoulos T., Stamatogiannakis E., Foufoulas I., Dimitropoulos H., Manola N., Ioannidis Y. (2014) “Content Visualization of Scientific Corpora Using an Extensible Relational Database Implementation”. In: Bolikowski Ł., Casarosa V., Goodale P., Houssos N., Manghi P., Schirrwagen J. (eds) Theory and Practice of Digital Libraries -- TPDL 2013 Selected Workshops. TPDL 2013. Communications in Computer and Information Science, vol 416. Springer, Cham. [doi:10.1007/978-3-319-08425-1_10](https://doi.org/10.1007/978-3-319-08425-1_10) -Giannakopoulos T., Dimitropoulos H., Metaxas O., Manola N., Ioannidis Y. 
(2013) “Supervised Content Visualization of Scientific Publications: A Case Study on the ArXiv Dataset”. In: Kłopotek M.A., Koronacki J., Marciniak M., Mykowiecka A., Wierzchoń S.T. (eds) Language Processing and Intelligent Information Systems. IIS 2013. Lecture Notes in Computer Science, vol 7912. Springer, Berlin, Heidelberg. +Giannakopoulos T., Dimitropoulos H., Metaxas O., Manola N., Ioannidis Y. (2013) “Supervised Content Visualization of Scientific Publications: A Case Study on the ArXiv Dataset”. In: Kłopotek M.A., Koronacki J., Marciniak M., Mykowiecka A., Wierzchoń S.T. (eds) Language Processing and Intelligent Information Systems. IIS 2013. Lecture Notes in Computer Science, vol 7912. Springer, Berlin, Heidelberg. [doi:10.1007/978-3-642-38634-3_23](https://doi.org/10.1007/978-3-642-38634-3_23) -Y. Chronis, Y. Foufoulas, V. Nikolopoulos, A. Papadopoulos, L. Stamatogiannakis, C. Svingos, Y. E. Ioannidis, "A Relational Approach to Complex Dataflows", in Workshop Proceedings of the EDBT/ICDT 2016 (MEDAL 2016) Joint Conference (March 15, 2016, Bordeaux, France) on CEUR-WS.org (ISSN 1613-0073) +Tkaczyk, D., Szostek, P., Fedoryszak, M. et al. "CERMINE: automatic extraction of structured metadata from scientific literature". IJDAR 18, 317–335 (2015). [doi:10.1007/s10032-015-0249-8](https://doi.org/10.1007/s10032-015-0249-8) + +M. Kobos, Ł. Bolikowski, M. Horst, P. Manghi, N. Manola, J. Schirrwagen (2014) “Information inference in scholarly communication infrastructures: the OpenAIREplus project experience”, Procedia Computer Science 38, 92-99. [doi:10.1016/j.procs.2014.10.016](https://doi.org/10.1016/j.procs.2014.10.016) ### Portals -Baglioni M. et al. (2019) The OpenAIRE Research Community Dashboard: On Blending Scientific Workflows and Scientific Publishing. In: Doucet A., Isaac A., Golub K., Aalberg T., Jatowt A. (eds) Digital Libraries for Open Knowledge. TPDL 2019. Lecture Notes in Computer Science, vol 11799. Springer, Cham. + +Baglioni M. et al. 
(2019) "The OpenAIRE Research Community Dashboard: On Blending Scientific Workflows and Scientific Publishing". In: Doucet A., Isaac A., Golub K., Aalberg T., Jatowt A. (eds) Digital Libraries for Open Knowledge. TPDL 2019. Lecture Notes in Computer Science, vol 11799. Springer, Cham. [doi:10.1007/978-3-030-30760-8_5](https://doi.org/10.1007/978-3-030-30760-8_5) ### Broker Service -Artini, M., Atzori, C., Bardi, A., La Bruzzo, S., Manghi, P., & Mannocci, A. (2015). The OpenAIRE literature broker service for institutional repositories. D-Lib Magazine, 21(11/12), 1. -Manghi, P., Atzori, C., Bardi, A., La Bruzzo, S., & Artini, M. (2016, February). Realizing a Scalable and History-Aware Literature Broker Service for OpenAIRE. In Italian Research Conference on Digital Libraries (pp. 92-103). Springer, Cham. +Manghi, P., Atzori, C., Bardi, A., La Bruzzo, S., & Artini, M. (2016, February). "Realizing a Scalable and History-Aware Literature Broker Service for OpenAIRE". In Italian Research Conference on Digital Libraries (pp. 92-103). Springer, Cham. [doi:10.1007/978-3-319-56300-8_9](https://doi.org/10.1007/978-3-319-56300-8_9) + +Artini, M., Atzori, C., Bardi, A., La Bruzzo, S., Manghi, P., & Mannocci, A. (2015). "The OpenAIRE literature broker service for institutional repositories". D-Lib Magazine, 21(11/12), 1. 
[doi:10.1045/november2015-artini](https://doi.org/10.1045/november2015-artini) + + From 93bad11a045c84d0135d853d1bda6dc26458ab27 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 21 Oct 2022 13:44:45 +0200 Subject: [PATCH 12/25] WIP: updated text in the datacite section --- docs/data-provision/aggregation/datacite.md | 75 ++++++++++----------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 3ca23e7..6d838fd 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -1,69 +1,66 @@ # Datacite -This section describes the aggregation workflow of Datacite and the mapping implemented for it. +This section describes the aggregation workflow used to gather the bibliographic material from Datacite and the relative mapping. ## Datacite datasource [Datacite](https://datacite.org/index.html) is a leading global non-profit organisation that provides persistent identifiers (DOIs) for research data and other research outputs. ## Datacite API -The [DataCite REST API](https://support.datacite.org/docs/api) allows users to retrieve, query, and browse DataCite DOI metadata records. In particular, it exposes a method for incremental harvesting new datacite records. +The [DataCite REST API](https://support.datacite.org/docs/api) allows users to retrieve, query, and browse Datacite metadata records. In particular, it exposes a method for harvesting new records incrementally. ``` https://api.datacite.org/dois?page[cursor]=$CURSOR&page[size]=$NUMBER_OF_ITEM_PER_PAGE&query=updated:[$FROM_DATE_TIMESAMP TO $TO_DATE_TIMESAMP] ``` On this API Request, we introduce some variables: -- **CURSOR**: The value of the cursor to iterate the pages -- **NUMBER_OF_ITEM_PER_PAGE**: (max 1000) defines how many records we can download for each page. 
-- **FROM_DATE_TIMESAMP, TO_DATE_TIMESAMP** interval timestamp of the updated record - +- **CURSOR**: The value of the cursor to iterate the pages; the cursor is extracted from each API response and used in the next request. +- **NUMBER_OF_ITEM_PER_PAGE**: (max 1000) defines how many records must be returned within each API response. +- **FROM_DATE_TIMESAMP, TO_DATE_TIMESAMP**: the timestamp interval in which the requested records were last updated. Each record contains two pieces of information needed for incremental harvesting: - **isActive**: tells if the record is deleted (`isActive:false`) - **updated**: timestamp of last update - ## Collection Workflow -The collection workflow is responsible for aggregating new datacite records. Each record is stored on a table called Native Datacite Store with the following schema: -- **DOI**: The DOI PID of the datacite record (It is a primary key) +The collection workflow is responsible for aggregating new records. Each record is stored locally on a table with the following schema: +- **DOI**: The DOI of the Datacite record (it is the primary key) - **update_timestamp**: the last update date timestamp - **json**: the native record JSON -During the collection workflow, we identify the most updated record date, and the collection phase downloads all new datacite records and update the existing one through the API using this date as **FROM_DATE_TIMESAMP** variable. - +The metadata collection process identifies the most recent record date available locally and uses that date to request the records from the Datacite API, populating the **FROM_DATE_TIMESAMP** variable. The records in the API response are included in the local storage in upsert mode. ## Datacite Mapping The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. 
-| OpenAIRE Result field path | Datacite record JSON path | # Notes | -|------------------------------------|-------------------------------|-------------------| -| `id` | `\attributes\doi`|the identifier will be created by folloing the openaire PID generation policy | -|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| -| `pid` | `\attributes\doi` | `scheme = doi` | -| `originalid` | `\attributes\doi` | | -| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | -| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. **If the record has no Creator it will be skipped**| -| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | -| `author.rank` | | Incremental index starting from 1 | -| `author.name` | `\attributes\creators\givenName` | | -| `author.surname` | `\attributes\creators\familyName` | | -| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | -| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | -| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | -| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | -| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | -| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | -|`publicationdate` | `\attributes\dates` | where `dateType` is **issued** | -|`publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | -|`embargoenddate` | `\attributes\dates` | where `dateType` is **available** | -| `subjects` | `\attributes\subject` | `scheme=keywords` | -| `description` | `\attributes\descriptions` | | -| `publisher` | `\attributes\publisher` | | -| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` | -| 
`publisher` | `\attributes\publisher` | | -| `instance.license` | `\attributes\rightsList` | if right value starts with http and matches a particular regex | -| `instance.accessright` | `\attributes\rightsList` |
  • if not present :`unknown`
  • if datasource is _figshare_:`open`
  • If `embargo_date < today()`: _OPEN_
| +| OpenAIRE Result field path | Datacite record JSON path | # Notes | +|--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `\attributes\doi` | the identifier will be created by folloing the openaire PID generation policy | +|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +| `pid` | `\attributes\doi` | `scheme = doi` | +| `originalid` | `\attributes\doi` | | +| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | +| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. **If the record has no Creator it will be skipped** | +| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | +| `author.rank` | | Incremental index starting from 1 | +| `author.name` | `\attributes\creators\givenName` | | +| `author.surname` | `\attributes\creators\familyName` | | +| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | +| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | +| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | +| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | +| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | +| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | +| `publicationdate` | `\attributes\dates` | where `dateType` is **issued** | +| `publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | +| `embargoenddate` | `\attributes\dates` | where `dateType` is **available** | +| `subjects` | `\attributes\subject` | `scheme=keywords` | +| `description` | `\attributes\descriptions` | | +| `publisher` | `\attributes\publisher` | | +| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` 
| +| `publisher` | `\attributes\publisher` | | +| `instance.license` | `\attributes\rightsList` | if right value starts with http and matches a particular regex | +| `instance.accessright` | `\attributes\rightsList` |
  • if not present :`unknown`
  • if datasource is _figshare_:`open`
  • If `embargo_date < today()`: _OPEN_
| ### Mapping Relation From 89cc05d25aa72c197d02c4d1ba5255f06b95e8a7 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 21 Oct 2022 14:58:16 +0200 Subject: [PATCH 13/25] reviewed Pubmed Mapping, added EBI page --- docs/data-provision/aggregation/datacite.md | 2 +- docs/data-provision/aggregation/pubmed.md | 30 +++++---------------- sidebars.js | 3 ++- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 6d838fd..13b67cd 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -35,7 +35,7 @@ The table below describes the mapping from the XML baseline records to the OpenA | OpenAIRE Result field path | Datacite record JSON path | # Notes | |--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `id` | `\attributes\doi` | the identifier will be created by folloing the openaire PID generation policy | +| `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | |
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| | `pid` | `\attributes\doi` | `scheme = doi` | | `originalid` | `\attributes\doi` | | diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md index c0c6ac6..d2355ff 100644 --- a/docs/data-provision/aggregation/pubmed.md +++ b/docs/data-provision/aggregation/pubmed.md @@ -7,6 +7,8 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm The native data is collected from the [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) site. It contains XML records compliant with the schema available at https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html. +## Incremental harvesting +PubMed exposes an FTP entry point that publishes the incremental update files: [ftp baseline update](https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/). We collect the new files and generate the new dataset by upserting the existing items. ## Mapping The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. @@ -15,9 +17,9 @@ The table below describes the mapping from the XML baseline records to the OpenA | *OpenAIRE Result field path* | PubMed record field xpath | Notes | |--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| | **Publication Mapping** | | | -| `id` | ?? | ?? | +| `id` | ?? 
| id in the form `pmid_________::md5(pmid)` | | `pid` | `//PMID` | `classid = classname = pmid` | -| `publicationdate` | `//PubmedPubDate` | apply the function GraphCleaningFunctions.cleanDate before assign it | +| `publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-MM-dd | | `maintitle` | `//Title` | | | `description` | `//AbstractText` | | | `language` | `//Language` | cleaning vocabulary -> dnet:languages | @@ -31,31 +33,11 @@ The table below describes the mapping from the XML baseline records to the OpenA | `container.conferencedate` | `//Journal/PubDate` | map the date of the Journal | | `container.name` | `//Journal/Title` | name of the journal | | `container.vol` | `//Journal/Volume` | journal volume | -| `container.issPrinted` | `//Journal/ISSN` | ?? | +| `container.issPrinted` | `//Journal/ISSN` | the journal issn | | `container.iss` | `//Journal/Issue` | The journal issue | | **Instance Mapping** | | | | `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article` then we apply this type else We have to find a terms that match the vocabulary otherwise we discard it | | `instance.pid` | `//PMID` | map the pmid in the pid in the instance | | `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | | `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | -| `instance.publicationdate` | `//PubmedPubDate` | | - - -| *OpenAIRE Relation field path* | PubMed record field xpath | Notes | -|--------------------------------|---------------------------|-------| -| | | | - -#TODO - -Missing item mapped - - - - - - - - - - - +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-MM-dd | \ No newline at end of file diff --git a/sidebars.js b/sidebars.js index e7501f1..8063572 100644 --- a/sidebars.js +++ b/sidebars.js @@ -66,7 +66,8 @@ const sidebars = { items: [ { type: 'doc', id: 
'data-provision/aggregation/doiboost' }, { type: 'doc', id: 'data-provision/aggregation/pubmed' }, - { type: 'doc', id: 'data-provision/aggregation/datacite' } + { type: 'doc', id: 'data-provision/aggregation/datacite' }, + { type: 'doc', id: 'data-provision/aggregation/ebi' }, ] }, { From 8222f554db980d3f8c7eea9e38a835633379ce99 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 2 Nov 2022 14:36:46 +0100 Subject: [PATCH 14/25] updated Datacite documentation added result.type --- docs/data-provision/aggregation/datacite.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 13b67cd..565ef1a 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -36,7 +36,8 @@ The table below describes the mapping from the XML baseline records to the OpenA | OpenAIRE Result field path | Datacite record JSON path | # Notes | |--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | -|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. | +|`result.type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| | `pid` | `\attributes\doi` | `scheme = doi` | | `originalid` | `\attributes\doi` | | | `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | From ea99f8fc3c527cd27893604e671f5fc605a9207b Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 2 Nov 2022 14:38:29 +0100 Subject: [PATCH 15/25] updated Pubmed documentation added result.type --- docs/data-provision/aggregation/pubmed.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md index d2355ff..96f957c 100644 --- a/docs/data-provision/aggregation/pubmed.md +++ b/docs/data-provision/aggregation/pubmed.md @@ -37,6 +37,7 @@ The table below describes the mapping from the XML baseline records to the OpenA | `container.iss` | `//Journal/Issue` | The journal issue | | **Instance Mapping** | | | | `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article` then we apply this type else We have to find a terms that match the vocabulary otherwise we discard it | +|`result.type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| | `instance.pid` | `//PMID` | map the pmid in the pid in the instance | | `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | | `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | From df026bcba4ed5973a868936e4e56b40ced81e574 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 2 Nov 2022 14:42:56 +0100 Subject: [PATCH 16/25] Added EBI --- docs/data-provision/aggregation/ebi.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docs/data-provision/aggregation/ebi.md diff --git a/docs/data-provision/aggregation/ebi.md b/docs/data-provision/aggregation/ebi.md new file mode 100644 index 0000000..11a8507 --- /dev/null +++ b/docs/data-provision/aggregation/ebi.md @@ -0,0 +1,18 @@ +# EMBL-EBIs Protein Data Bank in Europe + +This section describes the mapping implemented for [EMBL-EBIs Protein Data Bank in Europe](https://www.ebi.ac.uk/). + +The Europe PMC RESTful Web Service gives the [datalinks API](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API)to retrieve data-literature links in Scholix format . + +## how data is collected +Starting from the Pubmed collection, we exploit this API to get all the related bioentities related to a Publication with a specific PubMed identifier. + +Following this request: `https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json` we store for each pubmedID the links related. + + +## Mapping +The table below describes the mapping from the EBI links records to the OpenAIRE Graph dump format. 
+ + +| *OpenAIRE Result field path* | PubMed record field xpath | Notes | +|--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| \ No newline at end of file From 3730f52cd3a6fb01c1733ceebc1655ed587622b2 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 2 Nov 2022 14:48:47 +0100 Subject: [PATCH 17/25] minor fix --- docs/data-provision/aggregation/datacite.md | 2 +- docs/data-provision/aggregation/pubmed.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 565ef1a..b268e14 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -37,7 +37,7 @@ The table below describes the mapping from the XML baseline records to the OpenA |--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | |
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. | -|`result.type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +|`type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| | `pid` | `\attributes\doi` | `scheme = doi` | | `originalid` | `\attributes\doi` | | | `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md index 96f957c..a223c35 100644 --- a/docs/data-provision/aggregation/pubmed.md +++ b/docs/data-provision/aggregation/pubmed.md @@ -37,7 +37,7 @@ The table below describes the mapping from the XML baseline records to the OpenA | `container.iss` | `//Journal/Issue` | The journal issue | | **Instance Mapping** | | | | `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article` then we apply this type else We have to find a terms that match the vocabulary otherwise we discard it | -|`result.type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +|`type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| | `instance.pid` | `//PMID` | map the pmid in the pid in the instance | | `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | | `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | From 77ad2700b29d84cbb0a3afa7b09f9cfd64705ffa Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 09:18:04 +0100 Subject: [PATCH 18/25] datacite tables and EBI texts --- docs/data-provision/aggregation/datacite.md | 69 ++-- docs/data-provision/aggregation/ebi.md | 398 +++++++++++++++++++- 2 files changed, 428 insertions(+), 39 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index b268e14..0de7b98 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -32,46 +32,45 @@ The metadata collection process identifies the most recent record date available ## Datacite Mapping The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. - -| OpenAIRE Result field path | Datacite record JSON path | # Notes | -|--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | -|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. | -|`type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| -| `pid` | `\attributes\doi` | `scheme = doi` | -| `originalid` | `\attributes\doi` | | -| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | -| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. **If the record has no Creator it will be skipped** | -| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | -| `author.rank` | | Incremental index starting from 1 | -| `author.name` | `\attributes\creators\givenName` | | -| `author.surname` | `\attributes\creators\familyName` | | -| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | -| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | -| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | -| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | -| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | -| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | -| `publicationdate` | `\attributes\dates` | where `dateType` is **issued** | -| `publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | -| `embargoenddate` | `\attributes\dates` | where `dateType` is **available** | -| `subjects` | `\attributes\subject` | `scheme=keywords` | -| `description` | `\attributes\descriptions` | | -| `publisher` | `\attributes\publisher` | | -| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` 
| -| `publisher` | `\attributes\publisher` | | -| `instance.license` | `\attributes\rightsList` | if right value starts with http and matches a particular regex | -| `instance.accessright` | `\attributes\rightsList` |
  • if not present :`unknown`
  • if datasource is _figshare_:`open`
  • If `embargo_date < today()`: _OPEN_
| +| OpenAIRE Result field path | Datacite record JSON path | # Notes | +|--------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | +|
  • `instance`
  • `instance.type`
|
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. | +| `type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +| `pid` | `\attributes\doi` | `scheme = doi` | +| `originalid` | `\attributes\doi` | | +| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | +| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. **If the record has no Creator it will be skipped** | +| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | +| `author.rank` | | Incremental index starting from 1 | +| `author.name` | `\attributes\creators\givenName` | | +| `author.surname` | `\attributes\creators\familyName` | | +| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | +| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | +| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | +| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | +| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | +| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | +| `publicationdate` | `\attributes\dates` | where `dateType` is **issued** | +| `publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | +| `embargoenddate` | `\attributes\dates` | where `dateType` is **available** | +| `subjects` | `\attributes\subject` | `scheme=keywords` | +| `description` | `\attributes\descriptions` | | +| `publisher` | `\attributes\publisher` | | +| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` 
| +| `publisher` | `\attributes\publisher` | | +| `instance.license` | `\attributes\rightsList` | if the rights value starts with http and matches a particular regex | +| `instance.accessright` | `\attributes\rightsList` |
  • if not present: `unknown`
  • if datasource is Figshare: `open`
  • If `embargo_date < today()`: OPEN
| ### Mapping Relation -| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | -|-------------------------------------------|-------------------------------|-------------------------------|---------| -| `isProducedBy` |`attributes\fundingReferences` | `Result/Project`| we must identifi if match this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)`| -| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite| -| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` |we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | +| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Target type | #Notes | +|----------------------------------------|---------------------------------------|----------------------|---------------------------------------------------------------------------------------------------| +| `isProducedBy` | `attributes\fundingReferences` | `Result/Project` | we must identify whether it matches this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)` | +| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite | +| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` | we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | ### Relation Resolution diff --git a/docs/data-provision/aggregation/ebi.md b/docs/data-provision/aggregation/ebi.md index 11a8507..fdbcc7a 100644 --- a/docs/data-provision/aggregation/ebi.md +++ b/docs/data-provision/aggregation/ebi.md @@ -2,13 +2,403 @@ This section describes the mapping implemented for [EMBL-EBIs Protein Data Bank in Europe](https://www.ebi.ac.uk/). -The Europe PMC RESTful Web Service gives the [datalinks API](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API)to retrieve data-literature links in Scholix format . 
+The Europe PMC RESTful Web Service gives the [datalinks API](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API) to retrieve data-literature links in Scholix format. -## how data is collected -Starting from the Pubmed collection, we exploit this API to get all the related bioentities related to a Publication with a specific PubMed identifier. +## How the data is collected -Following this request: `https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json` we store for each pubmedID the links related. +Starting from the Pubmed collection, the API below is used to obtain the bioentities related to publications for each PubMed identifier. +Example: + +```commandline +curl -s "https://www.ebi.ac.uk/europepmc/webservices/rest/MED/33024307/datalinks?format=json" | jq '.' +{ + "version": "6.8", + "hitCount": 9, + "request": { + "id": "33024307", + "source": "MED" + }, + "dataLinkList": { + "Category": [ + { + "Name": "Nucleotide Sequences", + "CategoryLinkCount": 5, + "Section": [ + { + "ObtainedBy": "tm_accession", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 5, + "Linklist": { + "Link": [ + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "AY278488", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:AY278488" + }, + "Title": "AY278488", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + 
"IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "MT121216", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:MT121216" + }, + "Title": "MT121216", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "KF367457", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:KF367457" + }, + "Title": "KF367457", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "MN996532", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:MN996532" + }, + "Title": "MN996532", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "MT072864", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:MT072864" + }, + "Title": "MT072864", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + } + ] + } + } + ] + }, + { + 
"Name": "Protein Structures", + "NameLong": "Protein structures in PDBe", + "CategoryLinkCount": 2, + "Section": [ + { + "ObtainedBy": "tm_accession", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 2, + "Linklist": { + "Link": [ + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "6VW1", + "IDScheme": "PDB", + "IDURL": "http://identifiers.org/pdbe/pdb:6VW1" + }, + "Title": "6VW1", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "2AJF", + "IDScheme": "PDB", + "IDURL": "http://identifiers.org/pdbe/pdb:2AJF" + }, + "Title": "2AJF", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + } + ] + } + } + ] + }, + { + "Name": "Altmetric", + "CategoryLinkCount": 1, + "Section": [ + { + "ObtainedBy": "ext_links", + "Tags": [ + "altmetrics" + ], + "SectionLinkCount": 1, + "Linklist": { + "Link": [ + { + "ObtainedBy": "ext_links", + "PublicationDate": "15-10-2020", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "https://www.altmetric.com/details/91880755", + "IDScheme": "URL", + "IDURL": 
"https://www.altmetric.com/details/91880755" + }, + "Title": "Characteristics of SARS-CoV-2 and COVID-19", + "Publisher": { + "Name": "Altmetric" + }, + "ImageURL": "https://api.altmetric.com/v1/donut/91880755_64.png" + } + } + ] + } + } + ] + }, + { + "Name": "BioStudies: supplemental material and supporting data", + "CategoryLinkCount": 1, + "Section": [ + { + "ObtainedBy": "ext_links", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 1, + "Linklist": { + "Link": [ + { + "ObtainedBy": "ext_links", + "PublicationDate": "11-03-2021", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC7537588?xr=true", + "IDScheme": "URL", + "IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC7537588?xr=true" + }, + "Title": "Characteristics of SARS-CoV-2 and COVID-19.", + "Publisher": { + "Name": "BioStudies: supplemental material and supporting data" + } + } + } + ] + } + } + ] + } + ] + } +} +``` ## Mapping The table below describes the mapping from the EBI links records to the OpenAIRE Graph dump format. From 1f1e5c8d4913f25d61b563696d5b5322e89d9f17 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 09:35:10 +0100 Subject: [PATCH 19/25] minor --- docs/data-provision/aggregation/datacite.md | 5 ++++- docs/data-provision/aggregation/pubmed.md | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 0de7b98..c5b549f 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -30,6 +30,9 @@ The collection workflow is responsible for aggregating new records. 
Each record The metadata collection process identifies the most recent record date available locally and uses such date to requests the records to the Datacite API, populating the **FROM_DATE_TIMESAMP** variable. The records in the API response are included in the local storage in upsert mode. ## Datacite Mapping + +### Entity Mapping + The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. | OpenAIRE Result field path | Datacite record JSON path | # Notes | @@ -63,7 +66,7 @@ The table below describes the mapping from the XML baseline records to the OpenA | `instance.accessright` | `\attributes\rightsList` |
  • if not present: `unknown`
  • if datasource is Figshare: `open`
  • if `embargo_date < today()`: OPEN
| -### Mapping Relation +### Relation Mapping | OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md index a223c35..f45a974 100644 --- a/docs/data-provision/aggregation/pubmed.md +++ b/docs/data-provision/aggregation/pubmed.md @@ -9,7 +9,8 @@ It contains XML records compliant with the schema available at https://www.nlm.n ## Incremental harvesting Pubmed exposes an entry point FTP with all the updates for each one. [ftp baseline update](https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/). We collect the new file and generate the new dataset by upserting the existing item. -## Mapping + +## Entity Mapping The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. From 268bb23545c30b5b244703a97753058c82ba4977 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 8 Nov 2022 15:40:48 +0100 Subject: [PATCH 20/25] minor fix --- docs/data-provision/aggregation/datacite.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index b268e14..69d6501 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -72,9 +72,6 @@ The table below describes the mapping from the XML baseline records to the OpenA | `isProducedBy` |`attributes\fundingReferences` | `Result/Project`| we must identifi if match this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)`| | `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite| | `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` |we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | - - -### Relation Resolution - +| | `\attribute\relatedIdentifiers` | result/result | we create relationships whenever the 
pid of the target is resolved on the Research Graph | From b007a67a3cc95f7d7245bc3456fd795c4b3badef Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 8 Nov 2022 15:58:21 +0100 Subject: [PATCH 21/25] added EBI mapping --- docs/data-provision/aggregation/datacite.md | 12 ----------- docs/data-provision/aggregation/ebi.md | 23 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 64d3cdb..722f393 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -69,23 +69,11 @@ The table below describes the mapping from the XML baseline records to the OpenA ### Relation Mapping -<<<<<<< HEAD | OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | |-------------------------------------------|-------------------------------|-------------------------------|---------| | `isProducedBy` |`attributes\fundingReferences` | `Result/Project`| we must identifi if match this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)`| | `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite| | `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` |we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | | | `\attribute\relatedIdentifiers` | result/result | we create relationships whenever the pid of the target is resolved on the Research Graph | -======= -| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | -|----------------------------------------|---------------------------------------|----------------------|---------------------------------------------------------------------------------------------------| -| `isProducedBy` | `attributes\fundingReferences` | `Result/Project` | we must identifi if match this pattern 
`(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)` | -| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite | -| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` | we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | - - -### Relation Resolution - ->>>>>>> 92baad5acb3ecfb774510b48fee6aeeba92738df diff --git a/docs/data-provision/aggregation/ebi.md b/docs/data-provision/aggregation/ebi.md index fdbcc7a..f03d49d 100644 --- a/docs/data-provision/aggregation/ebi.md +++ b/docs/data-provision/aggregation/ebi.md @@ -402,7 +402,26 @@ curl -s "https://www.ebi.ac.uk/europepmc/webservices/rest/MED/33024307/datalinks ## Mapping The table below describes the mapping from the EBI links records to the OpenAIRE Graph dump format. +We filter all the target links with pid type **ena**, **pdb** or **uniprot** +For each target we construct a Bioentity with the following mapping -| *OpenAIRE Result field path* | PubMed record field xpath | Notes | -|--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| \ No newline at end of file +| *OpenAIRE Result field path* | EBI record field xpath | Notes | +|--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `target/identifier/ID` and `target/identifier/IDScheme` | id in the form `SCHEMA_________::md5(pid)`| +| `pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema`| +| `publicationdate` | `target/PublicationDate` | clean and normalize the format of the date to be `YYYY-mm-dd` | +| `maintitle` | `target/Title` | | +| **Instance Mapping** | | | +| `instance.type` 
| | `Bioentity` | +|`type` | | `Dataset` | +| `instance.pid` |`target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| `instance.url` | `target/identifier/IDURL` | Copy the value as it is | + | +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd + + +### Relation Mapping +| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | +|-------------------------------------------|-------------------------------|-------------------------------|---------| +| `IsRelatedTo` | | result/result | we create relationships between the BioEntity and the pubmed publication | From e9296f1a4085a313a462e676c5c9758c1f696bfc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 16:23:45 +0100 Subject: [PATCH 22/25] fixed typos and tables --- docs/data-provision/aggregation/datacite.md | 14 ++++------ docs/data-provision/aggregation/ebi.md | 31 ++++++++++----------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index 722f393..e1fd166 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -65,15 +65,13 @@ The table below describes the mapping from the XML baseline records to the OpenA | `instance.license` | `\attributes\rightsList` | if the rights value starts with http and matches a particular regex | | `instance.accessright` | `\attributes\rightsList` |
  • if not present: `unknown`
  • if datasource is Figshare: `open`
  • if `embargo_date < today()`: OPEN
| - ### Relation Mapping - -| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | -|-------------------------------------------|-------------------------------|-------------------------------|---------| -| `isProducedBy` |`attributes\fundingReferences` | `Result/Project`| we must identifi if match this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)`| -| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite| -| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` |we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | -| | `\attribute\relatedIdentifiers` | result/result | we create relationships whenever the pid of the target is resolved on the Research Graph | +| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Target type | #Notes | +|----------------------------------------|---------------------------------------|---------------------|------------------------------------------------------------------------------------------------------------| +| `isProducedBy/produces` | `attributes\fundingReferences` | `result/project` | only when the fundingReferences matches the pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)` | +| `IsProvidedBy/provides` | | `result/datasource` | Datasource is always set to `Datacite` | +| `isHostedBy/host` | `\attributes\relationships\client\id` | `result/datasource` | we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | +| `isRelatedTo` | `\attribute\relatedIdentifiers` | `result/result` | we create relationships whenever the pid of the target is resolved on the Research Graph | diff --git a/docs/data-provision/aggregation/ebi.md b/docs/data-provision/aggregation/ebi.md index f03d49d..ce653b6 100644 --- a/docs/data-provision/aggregation/ebi.md +++ b/docs/data-provision/aggregation/ebi.md @@ -406,22 +406,21 @@ We 
filter all the target links with pid type **ena**, **pdb** or **uniprot** For each target we construct a Bioentity with the following mapping -| *OpenAIRE Result field path* | EBI record field xpath | Notes | -|--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `id` | `target/identifier/ID` and `target/identifier/IDScheme` | id in the form `SCHEMA_________::md5(pid)`| -| `pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema`| -| `publicationdate` | `target/PublicationDate` | clean and normalize the format of the date to be `YYYY-mm-dd` | -| `maintitle` | `target/Title` | | -| **Instance Mapping** | | | -| `instance.type` | | `Bioentity` | -|`type` | | `Dataset` | -| `instance.pid` |`target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | -| `instance.url` | `target/identifier/IDURL` | Copy the value as it is | - | -| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd +| *OpenAIRE Result field path* | EBI record field xpath | Notes | +|------------------------------|----------------------------------------------------------|---------------------------------------------------------------| +| `id` | `target/identifier/ID` and `target/identifier/IDScheme` | id in the form `SCHEMA_________::md5(pid)` | +| `pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| `publicationdate` | `target/PublicationDate` | clean and normalize the format of the date to be `YYYY-mm-dd` | +| `maintitle` | `target/Title` | | +| **Instance Mapping** | | | +| `instance.type` | | `Bioentity` | +| `type` | | `Dataset` | +| `instance.pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| 
`instance.url` | `target/identifier/IDURL` | Copy the value as it is | +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | ### Relation Mapping -| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | #Notes | -|-------------------------------------------|-------------------------------|-------------------------------|---------| -| `IsRelatedTo` | | result/result | we create relationships between the BioEntity and the pubmed publication | +| OpenAIRE Relation Semantic and inverse | Source/Target type | #Notes | +|----------------------------------------|---------------------|--------------------------------------------------------------------------| +| `IsRelatedTo` | `result/result` | we create relationships between the BioEntity and the pubmed publication | From 12263fca62f71a7570d5a66493e046ad766d21d3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 16:54:39 +0100 Subject: [PATCH 23/25] addressing comments from the code review --- docs/data-model/pids-and-identifiers.md | 18 +- .../data-provision/aggregation/aggregation.md | 4 +- docs/data-provision/aggregation/doiboost.md | 54 +-- docs/data-provision/aggregation/ebi.md | 360 +----------------- docs/data-provision/aggregation/pubmed.md | 59 ++- 5 files changed, 84 insertions(+), 411 deletions(-) diff --git a/docs/data-model/pids-and-identifiers.md b/docs/data-model/pids-and-identifiers.md index a6b0afd..95378fc 100644 --- a/docs/data-model/pids-and-identifiers.md +++ b/docs/data-model/pids-and-identifiers.md @@ -13,11 +13,11 @@ Such a policy defines a list of data sources that are considered authoritative f * OpenAIRE IDs depend on persistent IDs when they are provided by the authority responsible to create them; * PIDs are included in the graph according to a tight criterion: the PID Types declared in the table below are considered to be mapped as PIDs only when they are collected from the 
relative PID authority data source. -| *PID Type* | *Authority* | -|------------|-----------------------------------------------------------------------------------------------------| -| doi | [Crossref](https://www.crossref.org), [Datacite](https://datacite.org) | -| pmc, pmid | [Europe PubMed Central](https://europepmc.org/), [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc) | -| arXiv | [arXiv.org e-Print Archive](https://arxiv.org/) | +| PID Type | Authority | +|-----------|-----------------------------------------------------------------------------------------------------| +| doi | [Crossref](https://www.crossref.org), [Datacite](https://datacite.org) | +| pmc, pmid | [Europe PubMed Central](https://europepmc.org/), [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc) | +| arXiv | [arXiv.org e-Print Archive](https://arxiv.org/) | There is an exception though: Handle(s) are minted by several repositories; as listing them all would not be a viable option, to avoid losing them as PIDs, Handles bypass the PID authority filtering rule. In all other cases, PIDs are be included in the graph as alternate Identifiers. @@ -31,10 +31,10 @@ assigns PIDs to their scientific products from a given PID minter. This "selection" can be performed when the entities in the graph sharing the same identifier are grouped together. 
The list of the delegated authorities currently includes -| *Datasource delegated* | *Datasource delegating* | *Pid Type* | -|--------------------------------------|----------------------------------|------------| -| [Zenodo](https://zenodo.org) | [Datacite](https://datacite.org) | doi | -| [RoHub](https://reliance.rohub.org/) | [W3ID](https://w3id.org/) | w3id | +| Datasource delegated | Datasource delegating | Pid Type | +|--------------------------------------|----------------------------------|-----------| +| [Zenodo](https://zenodo.org) | [Datacite](https://datacite.org) | doi | +| [RoHub](https://reliance.rohub.org/) | [W3ID](https://w3id.org/) | w3id | ## Identifiers in the Graph diff --git a/docs/data-provision/aggregation/aggregation.md b/docs/data-provision/aggregation/aggregation.md index 4e159e9..8198e51 100644 --- a/docs/data-provision/aggregation/aggregation.md +++ b/docs/data-provision/aggregation/aggregation.md @@ -10,14 +10,14 @@ OpenAIRE materializes an open, participatory research graph (the OpenAIRE Resear OpenAIRE aggregates metadata records describing objects of the research life-cycle from content providers compliant to the [OpenAIRE guidelines](https://guidelines.openaire.eu/) and from entity registries (i.e. data sources offering authoritative lists of entities, like [OpenDOAR](https://v2.sherpa.ac.uk/opendoar/), [re3data](https://www.re3data.org/), [DOAJ](https://doaj.org/), and various funder databases). After collection, metadata are transformed according to the OpenAIRE internal metadata model, which is used to generate the final OpenAIRE Research Graph, accessible from the [OpenAIRE EXPLORE portal](https://explore.openaire.eu) and the [APIs](https://graph.openaire.eu/develop/). -The transformation process includes the application of cleaning functions whose goal is to ensure that values are harmonised according to a common format (e.g. dates as YYYY-MM-dd) and, whenever applicable, to a common controlled vocabulary. 
The controlled vocabularies used for cleansing are accessible at http://api.openaire.eu/vocabularies. Each vocabulary features a set of controlled terms, each with one code, one label, and a set of synonyms. If a synonym is found as field value, the value is updated with the corresponding term. +The transformation process includes the application of cleaning functions whose goal is to ensure that values are harmonised according to a common format (e.g. dates as YYYY-MM-dd) and, whenever applicable, to a common controlled vocabulary. The controlled vocabularies used for cleansing are accessible at [api.openaire.eu/vocabularies](https://api.openaire.eu/vocabularies/). Each vocabulary features a set of controlled terms, each with one code, one label, and a set of synonyms. If a synonym is found as field value, the value is updated with the corresponding term. Also, the OpenAIRE Research Graph is extended with other relevant scholarly communication sources that do not follow the OpenAIRE Guidelines and/or are too large to be integrated via the “normal” aggregation mechanism: DOIBoost (which merges Crossref, ORCID, Microsoft Academic Graph, and Unpaywall).

Aggregation

-The OpenAIRE aggregation system collects information about objects of the research life-cycle compliant to the [OpenAIRE acquisition policy](https://www.openaire.eu/content-aquisition-policy1) from [different types of data sources](https://explore.openaire.eu/search/find/dataproviders): +The OpenAIRE aggregation system collects information about objects of the research life-cycle compliant to the [OpenAIRE acquisition policy](https://www.openaire.eu/content-acquisition-policy) from [different types of data sources](https://explore.openaire.eu/search/find/dataproviders): 1. Scientific literature metadata and full-texts from institutional and thematic repositories, CRIS (Common Research Information Systems), Open Access journals and publishers; 2. Dataset metadata from data repositories and data journals; diff --git a/docs/data-provision/aggregation/doiboost.md b/docs/data-provision/aggregation/doiboost.md index 9a039b2..cf42b92 100644 --- a/docs/data-provision/aggregation/doiboost.md +++ b/docs/data-provision/aggregation/doiboost.md @@ -4,10 +4,6 @@ DOIBoost is a dataset that combines research outputs and links among them from a It enriches the records available on Crossref with what's available on Unpaywall, Microsoft Academic Graph, ORCID intersecting all those datasets by DOI. As consequence, DOIBoost does not contain any record from MAG, Unpaywall, or ORCID that doesn't provide a DOI available in Crossref. -The idea behind DOIBoost and its origin can be found in the paper (and related resources) at: - -* La Bruzzo S., Manghi P., Mannocci A. (2019) OpenAIRE's DOIBoost - Boosting CrossRef for Research. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. Springer, doi:10.1007/978-3-030-11226-4_11 . 
Open Access version available at: [10.5281/zenodo.1441071](https://doi.org/10.5281/zenodo.1441071) - Each Crossref record is enriched with: * ORCID identifiers of authors from ORCID * Open Access instance (with OA color/route and license) from Unpaywall @@ -29,7 +25,11 @@ The Open Access status is also set by intersecting the journal information of a The construction of the DOIBoost dataset consists of the following phases: -## 1. Crossref filtering +## Process + +The following section describes the processing steps needed to build DOIBoost starting from the input data. + +### Crossref filtering Records in Crossref are ruled out according to the following criteria @@ -68,7 +68,7 @@ Records in Crossref are ruled out according to the following criteria Records with `type=dataset` are mapped into OpenAIRE results of type dataset. All others are mapped as OpenAIRE results of type publication. -## 2. Mapping Crossref properties into the OpenAIRE Research Graph +### Mapping Crossref properties into the OpenAIRE Research Graph Properties in OpenAIRE results are set based on the logic described in the following table: @@ -133,9 +133,9 @@ Possible improvements: h3. 
2 Map Crossref links to projects/funders -Links to funding available in Crossref are mapped as funding relationships (`result -- isProducedBy --> project`) applying the following mapping: +Links to funding available in Crossref are mapped as funding relationships (`result -- isProducedBy -- project`) applying the following mapping: -| *funder* | *grant code* | *Link to* | +| Funder | Grant code | Link to | |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | DOI: `{10.13039/100010663, 10.13039/100010661, 10.13039/501100007601, 10.13039/501100000780, 10.13039/100010665}` or name: `'European Union’s Horizon 2020 research and innovation program'` | series of `4-9` digits in `award` | Link to H2020 project | | DOI: `{10.13039/100011199, 10.13039/100004431, 10.13039/501100004963, 10.13039/501100000780}` | series of `4-9` digits in `award` | Link to FP7 project | @@ -159,7 +159,7 @@ Links to funding available in Crossref are mapped as funding relationships (`res | DOI: `10.13039/501100004410` | `award` | Link to TUBITAK project | | DOI: `10.10.13039/100004440` or name: `Wellcome Trust Masters Fellowship` | `award` | Link to Wellcome Trust specific project and to the `unidentified` project. | -## 3. 
Intersect Crossref with UnpayWall by DOI +### Intersect Crossref with UnpayWall by DOI The fields we consider from UnpayWall are: * `is_oa` @@ -168,7 +168,7 @@ The fields we consider from UnpayWall are: The results of Crossref that intersect by DOI with UnpayWall records are enriched with one additional `instance` with the following properties: -| *OpenAIRE Result field path* | *Unpaywall field path* | *Notes* | +| OpenAIRE Result field path | Unpaywall field path | Notes | |----------------------------------------|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `instance` | | created only if `is_oa` and a `best_oa_location` is available | | `instance.accessright` | | default value `Open Access`: we do not add instances if UnpayWall says there is no open version | @@ -186,23 +186,23 @@ For the definition of UnpayWall's `oa_status` refer to the [Unpaywall FAQ](https The record will also feature a relation to the UnpayWall data source: `name="UnpayWall"`, `id=openaire____::8ac8380272269217cb09a928c8caa993`. -## 4. 
Intersect with ORCID +### Intersect with ORCID The fields we consider from ORCID are: * `doi` * `authors`, a list of authors, each with optional `name`, `surname`, `creditName`, `oid` -| *OpenAIRE field path* | *ORCID path* | *Notes* | -|-------------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------| -| `pid` | `doi` | | -| `author.name` | `capitalize(name)` | only mapped if not blank | -| `author.surname` | `capitalize(surname)` | only mapped if not blank | -| `author.fullname` | | if name and surname are not blank, they are concatenated (`capitalize(name) capitalize(surname)`), otherwise we use the `creditName` | -| `author.pid` | | only if the `ORCID` is available | -| `author.pid.id.scheme` | | Default `orcid` (meaning that it is confirmed by ORCID, (in contrast to the `orcid_pending` set from Crossref and Unpaywall) | -| `author.pid.id.value` | `oid` | | -| `author.pid.provenance.provenance` | | Default `Harvested` | -| `author.pid.provenance.trust` | | Default `0.9` | +| OpenAIRE field path | ORCID path | Notes | +|------------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------| +| `pid` | `doi` | | +| `author.name` | `capitalize(name)` | only mapped if not blank | +| `author.surname` | `capitalize(surname)` | only mapped if not blank | +| `author.fullname` | | if name and surname are not blank, they are concatenated (`capitalize(name) capitalize(surname)`), otherwise we use the `creditName` | +| `author.pid` | | only if the `ORCID` is available | +| `author.pid.id.scheme` | | Default `orcid` (meaning that it is confirmed by ORCID, (in contrast to the `orcid_pending` set from Crossref and Unpaywall) | +| `author.pid.id.value` | `oid` | | +| `author.pid.provenance.provenance` | | Default `Harvested` 
| +| `author.pid.provenance.trust` | | Default `0.9` | The records are enriched with the ORCID identifiers of their authors. @@ -216,7 +216,7 @@ Miriam will modify the process to ensure that: * the list of authors from Crossred always "win" * the identifiers from ORCID "win" -## 5. Intersect with Microsoft Academic Graph +### Intersect with Microsoft Academic Graph *Important Notes* * Only papers with DOI are considered @@ -238,10 +238,16 @@ The records are enriched with: * conference or journal information (in the `journal` field) TODO: or `container`, in case of the dump? * [TO BE REMOVED] instances with URL from MAG -## 6. Enrich DOIBoost3 with hosting data sources (`hostedby`) and access right information +### Enrich DOIBoost3 with hosting data sources (`hostedby`) and access right information In this phase, we intersect DOIBoost3 with a dataset composed of journals from OpenAIRE, Crossref, and the ISSN gold list. Each journal comes with its International Standard Serial Numbers (`issn`, `eissn`, `lissn`) and, when available, a flag that tells if the journal is open access. The intersection is done on the basis of the International Standard Serial Numbers. The records with a `journal.[l|e]issn` that match are enriched as follows: * Each instance gain the `hostedby` information corresponding to the journal * If the journal is open access, the access rights of the instances are also set to `Open Access` with `gold` route (because by construction, the journals we know are open are from DOAJ or Gold ISSN list) The hostedby of records that do not match are set to the `Unknown Repository`. + +## References + +The idea behind DOIBoost and its origin can be found in the paper (and related resources) at: + +* La Bruzzo S., Manghi P., Mannocci A. (2019) OpenAIRE's DOIBoost - Boosting CrossRef for Research. In: Manghi P., Candela L., Silvello G. (eds) Digital Libraries: Supporting Open Science. IRCDL 2019. Communications in Computer and Information Science, vol 988. 
Springer, doi:10.1007/978-3-030-11226-4_11 . Open Access version available at: [10.5281/zenodo.1441071](https://doi.org/10.5281/zenodo.1441071) diff --git a/docs/data-provision/aggregation/ebi.md b/docs/data-provision/aggregation/ebi.md index ce653b6..f5abf7a 100644 --- a/docs/data-provision/aggregation/ebi.md +++ b/docs/data-provision/aggregation/ebi.md @@ -65,339 +65,7 @@ curl -s "https://www.ebi.ac.uk/europepmc/webservices/rest/MED/33024307/datalinks "Name": "Europe PMC" } }, - "Frequency": 1 - }, - { - "ObtainedBy": "tm_accession", - "PublicationDate": "04-11-2022", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "References" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "MED" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "MT121216", - "IDScheme": "ENA", - "IDURL": "http://identifiers.org/ebi/ena.embl:MT121216" - }, - "Title": "MT121216", - "Publisher": { - "Name": "Europe PMC" - } - }, - "Frequency": 1 - }, - { - "ObtainedBy": "tm_accession", - "PublicationDate": "04-11-2022", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "References" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "MED" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "KF367457", - "IDScheme": "ENA", - "IDURL": "http://identifiers.org/ebi/ena.embl:KF367457" - }, - "Title": "KF367457", - "Publisher": { - "Name": "Europe PMC" - } - }, - "Frequency": 1 - }, - { - "ObtainedBy": "tm_accession", - "PublicationDate": "04-11-2022", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "References" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "MED" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": 
"MN996532", - "IDScheme": "ENA", - "IDURL": "http://identifiers.org/ebi/ena.embl:MN996532" - }, - "Title": "MN996532", - "Publisher": { - "Name": "Europe PMC" - } - }, - "Frequency": 1 - }, - { - "ObtainedBy": "tm_accession", - "PublicationDate": "04-11-2022", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "References" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "MED" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "MT072864", - "IDScheme": "ENA", - "IDURL": "http://identifiers.org/ebi/ena.embl:MT072864" - }, - "Title": "MT072864", - "Publisher": { - "Name": "Europe PMC" - } - }, - "Frequency": 1 - } - ] - } - } - ] - }, - { - "Name": "Protein Structures", - "NameLong": "Protein structures in PDBe", - "CategoryLinkCount": 2, - "Section": [ - { - "ObtainedBy": "tm_accession", - "Tags": [ - "supporting_data" - ], - "SectionLinkCount": 2, - "Linklist": { - "Link": [ - { - "ObtainedBy": "tm_accession", - "PublicationDate": "04-11-2022", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "References" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "MED" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "6VW1", - "IDScheme": "PDB", - "IDURL": "http://identifiers.org/pdbe/pdb:6VW1" - }, - "Title": "6VW1", - "Publisher": { - "Name": "Europe PMC" - } - }, - "Frequency": 1 - }, - { - "ObtainedBy": "tm_accession", - "PublicationDate": "04-11-2022", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "References" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "MED" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "2AJF", - "IDScheme": "PDB", - "IDURL": 
"http://identifiers.org/pdbe/pdb:2AJF" - }, - "Title": "2AJF", - "Publisher": { - "Name": "Europe PMC" - } - }, - "Frequency": 1 - } - ] - } - } - ] - }, - { - "Name": "Altmetric", - "CategoryLinkCount": 1, - "Section": [ - { - "ObtainedBy": "ext_links", - "Tags": [ - "altmetrics" - ], - "SectionLinkCount": 1, - "Linklist": { - "Link": [ - { - "ObtainedBy": "ext_links", - "PublicationDate": "15-10-2020", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "IsReferencedBy" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "PMID" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "https://www.altmetric.com/details/91880755", - "IDScheme": "URL", - "IDURL": "https://www.altmetric.com/details/91880755" - }, - "Title": "Characteristics of SARS-CoV-2 and COVID-19", - "Publisher": { - "Name": "Altmetric" - }, - "ImageURL": "https://api.altmetric.com/v1/donut/91880755_64.png" - } - } - ] - } - } - ] - }, - { - "Name": "BioStudies: supplemental material and supporting data", - "CategoryLinkCount": 1, - "Section": [ - { - "ObtainedBy": "ext_links", - "Tags": [ - "supporting_data" - ], - "SectionLinkCount": 1, - "Linklist": { - "Link": [ - { - "ObtainedBy": "ext_links", - "PublicationDate": "11-03-2021", - "LinkProvider": { - "Name": "Europe PMC" - }, - "RelationshipType": { - "Name": "IsReferencedBy" - }, - "Source": { - "Type": { - "Name": "literature" - }, - "Identifier": { - "ID": "33024307", - "IDScheme": "PMID" - } - }, - "Target": { - "Type": { - "Name": "dataset" - }, - "Identifier": { - "ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC7537588?xr=true", - "IDScheme": "URL", - "IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC7537588?xr=true" - }, - "Title": "Characteristics of SARS-CoV-2 and COVID-19.", - "Publisher": { - "Name": "BioStudies: supplemental material and supporting data" - } - } - } - ] - } - } - ] - } - ] - } -} + [...] 
``` ## Mapping @@ -406,21 +74,21 @@ We filter all the target links with pid type **ena**, **pdb** or **uniprot** For each target we construct a Bioentity with the following mapping -| *OpenAIRE Result field path* | EBI record field xpath | Notes | -|------------------------------|----------------------------------------------------------|---------------------------------------------------------------| -| `id` | `target/identifier/ID` and `target/identifier/IDScheme` | id in the form `SCHEMA_________::md5(pid)` | -| `pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | -| `publicationdate` | `target/PublicationDate` | clean and normalize the format of the date to be `YYYY-mm-dd` | -| `maintitle` | `target/Title` | | -| **Instance Mapping** | | | -| `instance.type` | | `Bioentity` | -| `type` | | `Dataset` | -| `instance.pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | -| `instance.url` | `target/identifier/IDURL` | Copy the value as it is | -| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | +| OpenAIRE Result field path | EBI record field xpath | Notes | +|-----------------------------|----------------------------------------------------------|---------------------------------------------------------------| +| `id` | `target/identifier/ID` and `target/identifier/IDScheme` | id in the form `SCHEMA_________::md5(pid)` | +| `pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| `publicationdate` | `target/PublicationDate` | clean and normalize the format of the date to be `YYYY-mm-dd` | +| `maintitle` | `target/Title` | | +| **Instance Mapping** | | | +| `instance.type` | | `Bioentity` | +| `type` | | `Dataset` | +| `instance.pid` | `target/identifier/ID` and `target/identifier/IDScheme` | `classid = classname = schema` | +| `instance.url` | `target/identifier/IDURL` | 
Copy the value as it is | +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | ### Relation Mapping -| OpenAIRE Relation Semantic and inverse | Source/Target type | #Notes | +| OpenAIRE Relation Semantic and inverse | Source/Target type | Notes | |----------------------------------------|---------------------|--------------------------------------------------------------------------| | `IsRelatedTo` | `result/result` | we create relationships between the BioEntity and the pubmed publication | diff --git a/docs/data-provision/aggregation/pubmed.md b/docs/data-provision/aggregation/pubmed.md index f45a974..a6df81d 100644 --- a/docs/data-provision/aggregation/pubmed.md +++ b/docs/data-provision/aggregation/pubmed.md @@ -5,7 +5,7 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm ## Input The native data is collected from the [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) site. -It contains XML records compliant with the schema available at https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html. +It contains XML records compliant with the schema available at [www.nlm.nih.gov](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html). ## Incremental harvesting Pubmed exposes an entry point FTP with all the updates for each one. [ftp baseline update](https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/). We collect the new file and generate the new dataset by upserting the existing item. @@ -14,32 +14,31 @@ Pubmed exposes an entry point FTP with all the updates for each one. [ftp baseli The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. 
- -| *OpenAIRE Result field path* | PubMed record field xpath | Notes | -|--------------------------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **Publication Mapping** | | | -| `id` | ?? | id in the form `pmid_________::md5(pmid)` | -| `pid` | `//PMID` | `classid = classname = pmid` | -| `publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | -| `maintitle` | `//Title` | | -| `description` | `//AbstractText` | | -| `language` | `//Language` | cleaning vocabulary -> dnet:languages | -| `subjects` | `//DescriptorName` | classId, className = keyword | -| **Author Mapping** | | | -| `author.surname` | `//Author/LastName` | | -| `author.name` | `//Author/ForeName` | | -| `author.fullname` | `//Author/FullName` | Concatenation of forename + lastName if exist | -| `author.rank` | FOR ALL AUTHORS | sequential number starting from 1 | -| **Journal Mapping** | | | -| `container.conferencedate` | `//Journal/PubDate` | map the date of the Journal | -| `container.name` | `//Journal/Title` | name of the journal | -| `container.vol` | `//Journal/Volume` | journal volume | -| `container.issPrinted` | `//Journal/ISSN` | the journal issn | -| `container.iss` | `//Journal/Issue` | The journal issue | -| **Instance Mapping** | | | -| `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article` then we apply this type else We have to find a terms that match the vocabulary otherwise we discard it | -|`type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| -| `instance.pid` | `//PMID` | map the pmid in the pid in the instance | -| `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | -| `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | -| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | \ No newline at end of file +| OpenAIRE Result field path | PubMed record field xpath | Notes | +|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Publication Mapping** | | | +| `id` | `//PMID` | id in the form `pmid_________::md5(pmid)` | +| `pid` | `//PMID` | `classid = classname = pmid` | +| `publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | +| `maintitle` | `//Title` | | +| `description` | `//AbstractText` | | +| `language` | `//Language` | cleaning vocabulary -> dnet:languages | +| `subjects` | `//DescriptorName` | classId, className = keyword | +| **Author Mapping** | | | +| `author.surname` | `//Author/LastName` | | +| `author.name` | `//Author/ForeName` | | +| `author.fullname` | `//Author/FullName` | Concatenation of forename + lastName if exist | +| `author.rank` | FOR ALL AUTHORS | sequential number starting from 1 | +| **Journal Mapping** | | | +| `container.conferencedate` | `//Journal/PubDate` | map the date of the Journal | +| `container.name` | `//Journal/Title` | name of the journal | +| `container.vol` | `//Journal/Volume` | journal volume | +| `container.issPrinted` | `//Journal/ISSN` | the journal issn | +| `container.iss` | `//Journal/Issue` | The journal 
issue | +| **Instance Mapping** | | | +| `instance.type` | `//PublicationType` | if the article contains the typology `Journal Article`, we apply this type; otherwise we look for a term that matches the vocabulary, and if none is found we discard it | +| `type` |
  • `\attributes\types\resourceType`
  • `\attributes\types\resourceTypeGeneral`
  • `\attributes\types\schemaOrg`
| Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities:
  • `publication`
  • `dataset`
  • `software`
  • `otherresearchproduct`
| +| `instance.pid` | `//PMID` | map the pmid in the pid in the instance | +| `instance.url` | `//PMID` | creates the URL by prepending `https://pubmed.ncbi.nlm.nih.gov/` to the PMId | +| `instance.alternateIdentifier` | `//ArticleId[./@IdType="doi"]` | | +| `instance.publicationdate` | `//PubmedPubDate` | clean and normalize the format of the date to be YYYY-mm-dd | \ No newline at end of file From 9524d0d0249fbb172c14d42b52e9eb2f669181e5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 17:12:26 +0100 Subject: [PATCH 24/25] addressing comments from the code review --- docs/data-provision/aggregation/aggregation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/data-provision/aggregation/aggregation.md b/docs/data-provision/aggregation/aggregation.md index 8198e51..22c2e52 100644 --- a/docs/data-provision/aggregation/aggregation.md +++ b/docs/data-provision/aggregation/aggregation.md @@ -26,7 +26,7 @@ The OpenAIRE aggregation system collects information about objects of the resear 5. Metadata of open source research software from software repositories and SoftwareHeritge 6. Metadata about other types of research products, like workflow, protocols, methods, research packages -Relationships between objects are collected from the data sources, but also automatically detected by [inference algorithms](https://www.openaire.eu/blogs/text-mining-services-in-openaire-1) and added by authenticated users, who can insert links between literature, datasets, software and projects via [the “Link” procedure available from the OpenAIRE explore portal](https://explore.openaire.eu/participate/claim). 
+Relationships between objects are collected from the data sources, but also automatically detected by [inference algorithms](https://www.openaire.eu/blogs/text-mining-services-in-openaire-1) and added by authenticated users, who can insert links between literature, datasets, software and projects via [the “Link” procedure available from the OpenAIRE explore portal](https://explore.openaire.eu). More information about the linking functionality can be found [here](https://www.openaire.eu/linking). ## What kind of data sources are in OpenAIRE? From dc04f19da15b3f51bc759432d021f3d5c7aa8292 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 17:16:11 +0100 Subject: [PATCH 25/25] added short names for some of the aggregation sub-sections --- sidebars.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sidebars.js b/sidebars.js index 8063572..b73b92b 100644 --- a/sidebars.js +++ b/sidebars.js @@ -64,10 +64,10 @@ const sidebars = { label: "Aggregation", link: {type: 'doc', id: 'data-provision/aggregation/aggregation'}, items: [ - { type: 'doc', id: 'data-provision/aggregation/doiboost' }, + { type: 'doc', id: 'data-provision/aggregation/doiboost', label: 'DOIBoost' }, { type: 'doc', id: 'data-provision/aggregation/pubmed' }, { type: 'doc', id: 'data-provision/aggregation/datacite' }, - { type: 'doc', id: 'data-provision/aggregation/ebi' }, + { type: 'doc', id: 'data-provision/aggregation/ebi', label: 'EMBL-EBI' }, ] }, {