From 77ad2700b29d84cbb0a3afa7b09f9cfd64705ffa Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 8 Nov 2022 09:18:04 +0100 Subject: [PATCH] datacite tables and EBI texts --- docs/data-provision/aggregation/datacite.md | 69 ++-- docs/data-provision/aggregation/ebi.md | 398 +++++++++++++++++++- 2 files changed, 428 insertions(+), 39 deletions(-) diff --git a/docs/data-provision/aggregation/datacite.md b/docs/data-provision/aggregation/datacite.md index b268e14..0de7b98 100644 --- a/docs/data-provision/aggregation/datacite.md +++ b/docs/data-provision/aggregation/datacite.md @@ -32,46 +32,45 @@ The metadata collection process identifies the most recent record date available ## Datacite Mapping The table below describes the mapping from the XML baseline records to the OpenAIRE Graph dump format. - -| OpenAIRE Result field path | Datacite record JSON path | # Notes | -|--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | -| | | Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. 
| -|`type` | | Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities: | -| `pid` | `\attributes\doi` | `scheme = doi` | -| `originalid` | `\attributes\doi` | | -| `dateofcollection` | `attributes\updated` | the timestamp is defined in milliseconds we convert to "yyyy-MM-dd'T'HH:mm:ssZ" format | -| `author` | `\attributes\creators` | Each creator field will be mapped in the author entity below the subfield. **If the record has no Creator it will be skipped** | -| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct from given and family name | -| `author.rank` | | Incremental index starting from 1 | -| `author.name` | `\attributes\creators\givenName` | | -| `author.surname` | `\attributes\creators\familyName` | | -| `author.pid` | `\attributes\creators\nameIdentifiers` | this is a list of pids associated to the creator | -| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | -| `author.pid.value` | `\attributes\creators\nameIdentifiers/nameIdentifier` | the pid value | -| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | -| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle since the title type vocabulary in OpenAIRE use the datacite title type vocabulary | -| **date section** | | for each date in particular for DOI starting with _10.14457_ we Apply a fix thai date convert a date to ThaiBuddhistDate and reformat to local one see ticket [#6791](https://support.openaire.eu/issues/6791) | -| `publicationdate` | `\attributes\dates` | where `dateType` is **issued** | -| `publicationdate` | `\attributes\publicationYear` | we create this date format `01-01-publicationYear` | -| `embargoenddate` | `\attributes\dates` | where `dateType` is **available** | -| `subjects` | `\attributes\subject` | `scheme=keywords` | -| `description` | 
`\attributes\descriptions` | | -| `publisher` | `\attributes\publisher` | | -| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` | -| `publisher` | `\attributes\publisher` | | -| `instance.license` | `\attributes\rightsList` | if right value starts with http and matches a particular regex | -| `instance.accessright` | `\attributes\rightsList` | | +| OpenAIRE Result field path | Datacite record JSON path | Notes | +|--------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `id` | `\attributes\doi` | id in the form `doi_________::md5(doi)` | +| | | Use the vocabulary **_dnet:publication_resource_** to find a synonym to one of these terms and get the `instance.type`. | +| `type` | | Using the **_dnet:result_typologies_** vocabulary, we look up the `instance.type` synonym to generate one of the following main entities: | +| `pid` | `\attributes\doi` | `scheme = doi` | +| `originalid` | `\attributes\doi` | | +| `dateofcollection` | `\attributes\updated` | the timestamp is expressed in milliseconds; we convert it to the "yyyy-MM-dd'T'HH:mm:ssZ" format | +| `author` | `\attributes\creators` | Each creator is mapped to an author entity through the subfields listed below. 
**If the record has no Creator it will be skipped** | +| `author.fullname` | `\attributes\creators\name` | if name is not defined, we construct it from the given and family names | +| `author.rank` | | Incremental index starting from 1 | +| `author.name` | `\attributes\creators\givenName` | | +| `author.surname` | `\attributes\creators\familyName` | | +| `author.pid` | `\attributes\creators\nameIdentifiers` | the list of pids associated with the creator | +| `author.pid.scheme` | `\attributes\creators\nameIdentifiers` | mapping with vocabulary **dnet:pid_types** | +| `author.pid.value` | `\attributes\creators\nameIdentifiers\nameIdentifier` | the pid value | +| `maintitle` | `\attributes\titles` | Titles whose title type is null or title type is Main | +| `subtitle` | `\attributes\titles` | Titles whose title type is Subtitle, since the title type vocabulary in OpenAIRE uses the Datacite title type vocabulary | +| **date section** | | For each date, in particular for DOIs starting with _10.14457_, we apply a Thai-date fix: the date is converted to a ThaiBuddhistDate and reformatted to the local one; see ticket [#6791](https://support.openaire.eu/issues/6791) | +| `publicationdate` | `\attributes\dates` | where `dateType` is **issued** | +| `publicationdate` | `\attributes\publicationYear` | we create a date in the format `01-01-publicationYear` | +| `embargoenddate` | `\attributes\dates` | where `dateType` is **available** | +| `subjects` | `\attributes\subject` | `scheme=keywords` | +| `description` | `\attributes\descriptions` | | +| `publisher` | `\attributes\publisher` | | +| `language` | `\attributes\language` | cleaned by using vocabulary `dnet:languages` | +| `publisher` | `\attributes\publisher` | | +| `instance.license` | `\attributes\rightsList` | if the rights value starts with http and matches a particular regex | +| `instance.accessright` | `\attributes\rightsList` | | ### Mapping Relation -| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Tartget type | 
#Notes | -|-------------------------------------------|-------------------------------|-------------------------------|---------| -| `isProducedBy` |`attributes\fundingReferences` | `Result/Project`| we must identifi if match this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)`| -| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite| -| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` |we defined a curated map clientId/Datasource if we found a match we create an _hostedBy Relation_ | +| OpenAIRE Relation Semantic and inverse | Datacite record JSON path | Source/Target type | Notes | +|----------------------------------------|---------------------------------------|----------------------|---------------------------------------------------------------------------------------------------| +| `isProducedBy` | `\attributes\fundingReferences` | `Result/Project` | we must identify whether it matches this pattern `(info:eu-repo/grantagreement/ec/h2020/)(\d{6})(.*)` | +| `IsProvidedBy` | | `Result/DataSource` | Datasource is always Datacite | +| `IsHostedBy` | `\attributes\relationships\client\id` | `Result/DataSource` | we defined a curated map clientId/Datasource; if we find a match, we create a _hostedBy Relation_ | ### Relation Resolution diff --git a/docs/data-provision/aggregation/ebi.md b/docs/data-provision/aggregation/ebi.md index 11a8507..fdbcc7a 100644 --- a/docs/data-provision/aggregation/ebi.md +++ b/docs/data-provision/aggregation/ebi.md @@ -2,13 +2,403 @@ This section describes the mapping implemented for [EMBL-EBIs Protein Data Bank in Europe](https://www.ebi.ac.uk/). -The Europe PMC RESTful Web Service gives the [datalinks API](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API)to retrieve data-literature links in Scholix format . 
+The Europe PMC RESTful Web Service gives the [datalinks API](https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API) to retrieve data-literature links in Scholix format. -## how data is collected -Starting from the Pubmed collection, we exploit this API to get all the related bioentities related to a Publication with a specific PubMed identifier. +## How the data is collected -Following this request: `https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json` we store for each pubmedID the links related. +Starting from the Pubmed collection, the API below is used to obtain the bioentities related to publications for each PubMed identifier. +Example: + +```commandline +curl -s "https://www.ebi.ac.uk/europepmc/webservices/rest/MED/33024307/datalinks?format=json" | jq '.' +{ + "version": "6.8", + "hitCount": 9, + "request": { + "id": "33024307", + "source": "MED" + }, + "dataLinkList": { + "Category": [ + { + "Name": "Nucleotide Sequences", + "CategoryLinkCount": 5, + "Section": [ + { + "ObtainedBy": "tm_accession", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 5, + "Linklist": { + "Link": [ + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "AY278488", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:AY278488" + }, + "Title": "AY278488", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + 
"IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "MT121216", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:MT121216" + }, + "Title": "MT121216", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "KF367457", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:KF367457" + }, + "Title": "KF367457", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "MN996532", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:MN996532" + }, + "Title": "MN996532", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "MT072864", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ebi/ena.embl:MT072864" + }, + "Title": "MT072864", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + } + ] + } + } + ] + }, + { + 
"Name": "Protein Structures", + "NameLong": "Protein structures in PDBe", + "CategoryLinkCount": 2, + "Section": [ + { + "ObtainedBy": "tm_accession", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 2, + "Linklist": { + "Link": [ + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "6VW1", + "IDScheme": "PDB", + "IDURL": "http://identifiers.org/pdbe/pdb:6VW1" + }, + "Title": "6VW1", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + }, + { + "ObtainedBy": "tm_accession", + "PublicationDate": "04-11-2022", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "2AJF", + "IDScheme": "PDB", + "IDURL": "http://identifiers.org/pdbe/pdb:2AJF" + }, + "Title": "2AJF", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + } + ] + } + } + ] + }, + { + "Name": "Altmetric", + "CategoryLinkCount": 1, + "Section": [ + { + "ObtainedBy": "ext_links", + "Tags": [ + "altmetrics" + ], + "SectionLinkCount": 1, + "Linklist": { + "Link": [ + { + "ObtainedBy": "ext_links", + "PublicationDate": "15-10-2020", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "https://www.altmetric.com/details/91880755", + "IDScheme": "URL", + "IDURL": 
"https://www.altmetric.com/details/91880755" + }, + "Title": "Characteristics of SARS-CoV-2 and COVID-19", + "Publisher": { + "Name": "Altmetric" + }, + "ImageURL": "https://api.altmetric.com/v1/donut/91880755_64.png" + } + } + ] + } + } + ] + }, + { + "Name": "BioStudies: supplemental material and supporting data", + "CategoryLinkCount": 1, + "Section": [ + { + "ObtainedBy": "ext_links", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 1, + "Linklist": { + "Link": [ + { + "ObtainedBy": "ext_links", + "PublicationDate": "11-03-2021", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "33024307", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC7537588?xr=true", + "IDScheme": "URL", + "IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC7537588?xr=true" + }, + "Title": "Characteristics of SARS-CoV-2 and COVID-19.", + "Publisher": { + "Name": "BioStudies: supplemental material and supporting data" + } + } + } + ] + } + } + ] + } + ] + } +} +``` ## Mapping The table below describes the mapping from the EBI links records to the OpenAIRE Graph dump format.