From 3ad0d6edfc88f066c51dbb8a8d551f1225086df8 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 18 Jan 2024 11:18:53 +0100 Subject: [PATCH] - --- .gitignore | 1 + .../jsonschemas/community_result_schema.json | 1141 +++++++++-------- .../resources/jsonschemas/result_schema.json | 954 +++++++------- .../src/test/java/GenerateJsonSchema.java | 5 +- .../dhp/oa/graph/dump/ResultMapper.java | 18 +- .../eu/dnetlib/dhp/oa/graph/dump/Utils.java | 5 + .../graph/dump/community/CommunitySplit.java | 2 +- .../community/SparkPrepareResultProject.java | 6 +- .../dhp/oa/graph/dump/complete/Extractor.java | 12 +- .../dump/complete/SparkDumpEntitiesJob.java | 8 +- .../dump/complete/SparkDumpRelationJob.java | 6 +- .../complete/SparkOrganizationRelation.java | 18 +- .../dhp/oa/zenodoapi/ZenodoAPIClient.java | 12 +- .../countryresults/oozie_app/workflow.xml | 2 +- .../RelationFromOrganizationTest.java | 2 + 15 files changed, 1174 insertions(+), 1018 deletions(-) diff --git a/.gitignore b/.gitignore index 4f45f6e..14314ae 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ spark-warehouse /job.properties /*/job.properties /*/*/job.properties +/*/*/*/job.properties \ No newline at end of file diff --git a/dump-schema/src/main/resources/jsonschemas/community_result_schema.json b/dump-schema/src/main/resources/jsonschemas/community_result_schema.json index a30fe6a..9b86d7b 100644 --- a/dump-schema/src/main/resources/jsonschemas/community_result_schema.json +++ b/dump-schema/src/main/resources/jsonschemas/community_result_schema.json @@ -1,621 +1,684 @@ { - "$schema" : "http://json-schema.org/draft-07/schema#", - "definitions" : { - "CfHbKeyValue" : { - "type" : "object", - "properties" : { - "key" : { - "type" : "string", - "description" : "the OpenAIRE identifier of the data source" + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "CfHbKeyValue": { + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "Description of key" }, - "value" : { - "type" : "string", - "description" : "the name of the data source" + "value": { + "type": "string", + "description": "Description of value" } } }, - "Provenance" : { - "type" : "object", - "properties" : { - "provenance" : { - "type" : "string" + "Provenance": { + "type": "object", + "properties": { + "provenance": { + "type": "string", + "description": "Description of provenance" }, - "trust" : { - "type" : "string" + "trust": { + "type": "string", + "description": "Description of trust" } } }, - "ResultPid" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "The scheme of the persistent identifier for the result (i.e. doi). If the pid is here it means the information for the pid has been collected from an authority for that pid type (i.e. Crossref/Datacite for doi). The set of authoritative pid is: doi when collected from Crossref or Datacite pmid when collected from EuroPubmed, arxiv when collected from arXiv, handle from the repositories" + "ResultPid": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" }, - "value" : { - "type" : "string", - "description" : "The value expressed in the scheme (i.e. 10.1000/182)" + "value": { + "type": "string", + "description": "Description of value" } } } }, - "type" : "object", - "properties" : { - "author" : { - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "fullname" : { - "type" : "string" + "type": "object", + "properties": { + "author": { + "description": "Description of author", + "type": "array", + "items": { + "type": "object", + "properties": { + "fullname": { + "type": "string", + "description": "Description of fullname" }, - "name" : { - "type" : "string" + "name": { + "type": "string", + "description": "Description of name" }, - "pid" : { - "type" : "object", - "properties" : { - "id" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "The author's pid scheme. OpenAIRE currently supports 'ORCID'" + "pid": { + "type": "object", + "properties": { + "id": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" }, - "value" : { - "type" : "string", - "description" : "The author's pid value in that scheme (i.e. 0000-1111-2222-3333)" + "value": { + "type": "string", + "description": "Description of value" } - } - }, - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "The reason why the pid was associated to the author" - } ] - } - }, - "description" : "The author's persistent identifiers" - }, - "rank" : { - "type" : "integer" - }, - "surname" : { - "type" : "string" - } - } - } - }, - "bestaccessright" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label" : { - "type" : "string", - "description" : "Label for the access mode" - }, - "scheme" : { - "type" : "string", - "description" : "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - }, - "description" : "The openest of the access rights of this result." - }, - "codeRepositoryUrl" : { - "type" : "string", - "description" : "Only for results with type 'software': the URL to the repository with the source code" - }, - "collectedfrom" : { - "description" : "Information about the sources from which the record has been collected", - "type" : "array", - "items" : { - "allOf" : [ { - "$ref" : "#/definitions/CfHbKeyValue" - }, { - "description" : "Information about the sources from which the record has been collected" - } ] - } - }, - "contactgroup" : { - "description" : "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource" - } - }, - "contactperson" : { - "description" : "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource" - } - }, - "container" : { - "type" : "object", - "properties" : { - "conferencedate" : { - "type" : "string" - }, - "conferenceplace" : { - "type" : "string" - }, - "edition" : { - "type" : "string", - "description" : "Edition of the journal or conference proceeding" - }, - "ep" : { - "type" : "string", - "description" : "End page" - }, - "iss" : { - "type" : "string", - "description" : "Journal issue number" - }, - "issnLinking" : { - "type" : "string" - }, - "issnOnline" : { - "type" : "string" - }, - "issnPrinted" : { - "type" : "string" - }, - "name" : { - "type" : "string", - "description" : "Name of the journal or conference" - }, - "sp" : { - "type" : "string", - "description" : "Start page" - }, - "vol" : { - "type" : "string", - "description" : "Volume" - } - }, - "description" : "Container has information about the conference or journal where the result has been presented or published" - }, - "context" : { - "description" : "Reference to a relevant research infrastructure, initiative or community (RI/RC) among those collaborating with OpenAIRE. Please see https://connect.openaire.eu", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "Code identifying the RI/RC" - }, - "label" : { - "type" : "string", - "description" : "Label of the RI/RC" - }, - "provenance" : { - "description" : "Why this result is associated to the RI/RC.", - "type" : "array", - "items" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "Why this result is associated to the RI/RC." - } ] - } - } - }, - "description" : "Reference to a relevant research infrastructure, initiative or community (RI/RC) among those collaborating with OpenAIRE. Please see https://connect.openaire.eu" - } - }, - "contributor" : { - "description" : "Contributors for the result", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Contributors for the result" - } - }, - "country" : { - "description" : "The list of countries associated to this result", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "ISO 3166-1 alpha-2 country code (i.e. IT)" - }, - "label" : { - "type" : "string", - "description" : "The label for that code (i.e. Italy)" - }, - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "Why this result is associated to the country." - } ] - } - }, - "description" : "The list of countries associated to this result" - } - }, - "coverage" : { - "type" : "array", - "items" : { - "type" : "string" - } - }, - "dateofcollection" : { - "type" : "string", - "description" : "When OpenAIRE collected the record the last time" - }, - "description" : { - "type" : "array", - "items" : { - "type" : "string" - } - }, - "documentationUrl" : { - "description" : "Only for results with type 'software': URL to the software documentation", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'software': URL to the software documentation" - } - }, - "embargoenddate" : { - "type" : "string", - "description" : "Date when the embargo ends and this result turns Open Access" - }, - "format" : { - "type" : "array", - "items" : { - "type" : "string" - } - }, - "geolocation" : { - "description" : "Geolocation information", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "box" : { - "type" : "string" - }, - "place" : { - "type" : "string" - }, - "point" : { - "type" : "string" - } - }, - "description" : "Geolocation information" - } - }, - "id" : { - "type" : "string", - "description" : "The OpenAIRE identifiers for this result" - }, - "indicators" : { - "type" : "object", - "properties" : { - "bipIndicators" : { - "description" : "The impact measures (i.e. popularity)", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "clazz" : { - "type" : "string" - }, - "indicator" : { - "type" : "string" - }, - "score" : { - "type" : "string" - } - }, - "description" : "The impact measures (i.e. popularity)" - } - }, - "usageCounts" : { - "type" : "object", - "properties" : { - "downloads" : { - "type" : "string" - }, - "views" : { - "type" : "string" - } - }, - "description" : "The usage counts (i.e. downloads)" - } - }, - "description" : "Indicators computed for this result, for example UsageCount ones" - }, - "instance" : { - "description" : "Each instance is one specific materialisation or version of the result. For example, you can have one result with three instance: one is the pre-print, one is the post-print, one is te published version", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "accessright" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label" : { - "type" : "string", - "description" : "Label for the access mode" - }, - "openAccessRoute" : { - "type" : "string", - "enum" : [ "gold", "green", "hybrid", "bronze" ] - }, - "scheme" : { - "type" : "string", - "description" : "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - }, - "description" : "The accessRights for this materialization of the result" - }, - "alternateIdentifier" : { - "description" : "All the identifiers other than pids forged by an authorithy for the pid type (i.e. Crossref for DOIs", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "The scheme of the identifier. It can be a persistent identifier (i.e. doi). If it is present in the alternate identifiers it means it has not been forged by an authority for that pid. For example we collect metadata from an institutional repository that provides as identifier for the result also the doi" }, - "value" : { - "type" : "string", - "description" : "The value expressed in the scheme" + "description": "Description of id" + }, + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] + } + }, + "description": "Description of pid" + }, + "rank": { + "type": "integer", + "description": "Description of rank" + }, + "surname": { + "type": "string", + "description": "Description of surname" + } + }, + "description": "Description of author" + } + }, + "bestaccessright": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" + }, + "label": { + "type": "string", + "description": "Description of label" + }, + "scheme": { + "type": "string", + "description": "Description of scheme" + } + }, + "description": "Description of bestaccessright" + }, + "codeRepositoryUrl": { + "type": "string", + "description": "Description of codeRepositoryUrl" + }, + "collectedfrom": { + "description": "Description of collectedfrom", + "type": "array", + "items": { + "allOf": [ + {"$ref": "#/definitions/CfHbKeyValue"}, + {"description": "Description of collectedfrom"} + ] + } + }, + "contactgroup": { + "description": "Description of contactgroup", + "type": "array", + "items": { + "type": "string", + "description": "Description of contactgroup" + } + }, + "contactperson": { + "description": "Description of contactperson", + "type": "array", + "items": { + "type": "string", + "description": "Description of contactperson" + } + }, + "container": { + "type": "object", + "properties": { + "conferencedate": { + "type": "string", + "description": "Description of conferencedate" + }, + "conferenceplace": { + "type": "string", + "description": "Description of conferenceplace" + }, + "edition": { + "type": "string", + "description": "Description of edition" + }, + "ep": { + "type": "string", + "description": "Description of ep" + }, + "iss": { + "type": "string", + "description": "Description of iss" + }, + "issnLinking": { + "type": "string", + "description": "Description of issnLinking" + }, + "issnOnline": { + "type": "string", + "description": "Description of issnOnline" + }, + "issnPrinted": { + "type": "string", + "description": "Description of issnPrinted" + }, + "name": { + "type": "string", + "description": "Description of name" + }, + "sp": { + "type": "string", + "description": "Description of sp" + }, + "vol": { + "type": "string", + "description": "Description of vol" + } + }, + "description": "Description of container" + }, + "context": { + "description": "Description of context", + "type": "array", + "items": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" + }, + "label": { + "type": "string", + "description": "Description of label" + }, + "provenance": { + "description": "Description of provenance", + "type": "array", + "items": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] + } + } + }, + "description": "Description of context" + } + }, + "contributor": { + "description": "Description of contributor", + "type": "array", + "items": { + "type": "string", + "description": "Description of contributor" + } + }, + "country": { + "description": "Description of country", + "type": "array", + "items": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" + }, + "label": { + "type": "string", + "description": "Description of label" + }, + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] + } + }, + "description": "Description of country" + } + }, + "coverage": { + "description": "Description of coverage", + "type": "array", + "items": { + "type": "string", + "description": "Description of coverage" + } + }, + "dateofcollection": { + "type": "string", + "description": "Description of dateofcollection" + }, + "description": { + "description": "Description of description", + "type": "array", + "items": { + "type": "string", + "description": "Description of description" + } + }, + "documentationUrl": { + "description": "Description of documentationUrl", + "type": "array", + "items": { + "type": "string", + "description": "Description of documentationUrl" + } + }, + "embargoenddate": { + "type": "string", + "description": "Description of embargoenddate" + }, + "format": { + "description": "Description of format", + "type": "array", + "items": { + "type": "string", + "description": "Description of format" + } + }, + "geolocation": { + "description": "Description of geolocation", + "type": "array", + "items": { + "type": "object", + "properties": { + "box": { + "type": "string", + "description": "Description of box" + }, + "place": { + "type": "string", + "description": "Description of place" + }, + "point": { + "type": "string", + "description": "Description of point" + } + }, + "description": "Description of geolocation" + } + }, + "id": { + "type": "string", + "description": "Description of id" + }, + "indicators": { + "type": "object", + "properties": { + "bipIndicators": { + "description": "Description of bipIndicators", + "type": "array", + "items": { + "type": "object", + "properties": { + "clazz": { + "type": "string", + "description": "Description of clazz" + }, + "indicator": { + "type": "string", + "description": "Description of indicator" + }, + "score": { + "type": "string", + "description": "Description of score" + } + }, + "description": "Description of bipIndicators" + } + }, + "usageCounts": { + "type": "object", + "properties": { + "downloads": { + "type": "string", + "description": "Description of downloads" + }, + "views": { + "type": "string", + "description": "Description of views" + } + }, + "description": "Description of usageCounts" + } + }, + "description": "Description of indicators" + }, + "instance": { + "description": "Description of instance", + "type": "array", + "items": { + "type": "object", + "properties": { + "accessright": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" + }, + "label": { + "type": "string", + "description": "Description of label" + }, + "openAccessRoute": { + "type": "string", + "enum": [ + "gold", + "green", + "hybrid", + "bronze" + ], + "description": "Description of openAccessRoute" + }, + "scheme": { + "type": "string", + "description": "Description of scheme" + } + }, + "description": "Description of accessright" + }, + "alternateIdentifier": { + "description": "Description of alternateIdentifier", + "type": "array", + "items": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" + }, + "value": { + "type": "string", + "description": "Description of value" } }, - "description" : "All the identifiers other than pids forged by an authorithy for the pid type (i.e. Crossref for DOIs" + "description": "Description of alternateIdentifier" } }, - "articleprocessingcharge" : { - "type" : "object", - "properties" : { - "amount" : { - "type" : "string" + "articleprocessingcharge": { + "type": "object", + "properties": { + "amount": { + "type": "string", + "description": "Description of amount" }, - "currency" : { - "type" : "string" + "currency": { + "type": "string", + "description": "Description of currency" } }, - "description" : "The money spent to make this book or article available in Open Access. Source for this information is the OpenAPC initiative." + "description": "Description of articleprocessingcharge" }, - "collectedfrom" : { - "allOf" : [ { - "$ref" : "#/definitions/CfHbKeyValue" - }, { - "description" : "Information about the source from which the record has been collected" - } ] + "collectedfrom": { + "allOf": [ + {"$ref": "#/definitions/CfHbKeyValue"}, + {"description": "Description of collectedfrom"} + ] }, - "hostedby" : { - "allOf" : [ { - "$ref" : "#/definitions/CfHbKeyValue" - }, { - "description" : "Information about the source from which the instance can be viewed or downloaded." - } ] + "hostedby": { + "allOf": [ + {"$ref": "#/definitions/CfHbKeyValue"}, + {"description": "Description of hostedby"} + ] }, - "license" : { - "type" : "string" + "license": { + "type": "string", + "description": "Description of license" }, - "pid" : { - "type" : "array", - "items" : { - "$ref" : "#/definitions/ResultPid" + "pid": { + "description": "Description of pid", + "type": "array", + "items": { + "allOf": [ + {"$ref": "#/definitions/ResultPid"}, + {"description": "Description of pid"} + ] } }, - "publicationdate" : { - "type" : "string", - "description" : "Date of the research product" + "publicationdate": { + "type": "string", + "description": "Description of publicationdate" }, - "refereed" : { - "type" : "string", - "description" : "If this instance has been peer-reviewed or not. Allowed values are peerReviewed, nonPeerReviewed, UNKNOWN (as defined in https://api.openaire.eu/vocabularies/dnet:review_levels)" + "refereed": { + "type": "string", + "description": "Description of refereed" }, - "type" : { - "type" : "string", - "description" : "The specific sub-type of this instance (see https://api.openaire.eu/vocabularies/dnet:result_typologies following the links)" + "type": { + "type": "string", + "description": "Description of type" }, - "url" : { - "description" : "URLs to the instance. They may link to the actual full-text or to the landing page at the hosting source. ", - "type" : "array", - "items" : { - "type" : "string", - "description" : "URLs to the instance. They may link to the actual full-text or to the landing page at the hosting source. " + "url": { + "description": "Description of url", + "type": "array", + "items": { + "type": "string", + "description": "Description of url" } } }, - "description" : "Each instance is one specific materialisation or version of the result. For example, you can have one result with three instance: one is the pre-print, one is the post-print, one is te published version" + "description": "Description of instance" } }, - "language" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "alpha-3/ISO 639-2 code of the language" + "isGreen": { + "type": "boolean", + "description": "Description of isGreen" + }, + "isInDiamondJournal": { + "type": "boolean", + "description": "Description of isInDiamondJournal" + }, + "language": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" }, - "label" : { - "type" : "string", - "description" : "Language label in English" + "label": { + "type": "string", + "description": "Description of label" } + }, + "description": "Description of language" + }, + "lastupdatetimestamp": { + "type": "integer", + "description": "Description of lastupdatetimestamp" + }, + "maintitle": { + "type": "string", + "description": "Description of maintitle" + }, + "openAccessColor": { + "type": "string", + "enum": [ + "gold", + "hybrid", + "bronze" + ], + "description": "Description of openAccessColor" + }, + "originalId": { + "description": "Description of originalId", + "type": "array", + "items": { + "type": "string", + "description": "Description of originalId" } }, - "lastupdatetimestamp" : { - "type" : "integer", - "description" : "Timestamp of last update of the record in OpenAIRE" - }, - "maintitle" : { - "type" : "string", - "description" : "A name or title by which a scientific result is known. May be the title of a publication, of a dataset or the name of a piece of software." - }, - "originalId" : { - "description" : "Identifiers of the record at the original sources", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Identifiers of the record at the original sources" + "pid": { + "description": "Description of pid", + "type": "array", + "items": { + "allOf": [ + {"$ref": "#/definitions/ResultPid"}, + {"description": "Description of pid"} + ] } }, - "pid" : { - "description" : "Persistent identifiers of the result", - "type" : "array", - "items" : { - "allOf" : [ { - "$ref" : "#/definitions/ResultPid" - }, { - "description" : "Persistent identifiers of the result" - } ] - } + "programmingLanguage": { + "type": "string", + "description": "Description of programmingLanguage" }, - "programmingLanguage" : { - "type" : "string", - "description" : "Only for results with type 'software': the programming language" - }, - "projects" : { - "description" : "List of projects (i.e. grants) that (co-)funded the production ofn the research results", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "acronym" : { - "type" : "string", - "description" : "The acronym of the project" + "projects": { + "description": "Description of projects", + "type": "array", + "items": { + "type": "object", + "properties": { + "acronym": { + "type": "string", + "description": "Description of acronym" }, - "code" : { - "type" : "string", - "description" : "The grant agreement number" + "code": { + "type": "string", + "description": "Description of code" }, - "funder" : { - "type" : "object", - "properties" : { - "fundingStream" : { - "type" : "string", - "description" : "Stream of funding (e.g. for European Commission can be H2020 or FP7)" + "funder": { + "type": "object", + "properties": { + "fundingStream": { + "type": "string", + "description": "Description of fundingStream" }, - "jurisdiction" : { - "type" : "string", - "description" : "Geographical jurisdiction (e.g. for European Commission is EU, for Croatian Science Foundation is HR)" + "jurisdiction": { + "type": "string", + "description": "Description of jurisdiction" }, - "name" : { - "type" : "string", - "description" : "The name of the funder (European Commission)" + "name": { + "type": "string", + "description": "Description of name" }, - "shortName" : { - "type" : "string", - "description" : "The short name of the funder (EC)" + "shortName": { + "type": "string", + "description": "Description of shortName" } }, - "description" : "Information about the funder funding the project" + "description": "Description of funder" }, - "id" : { - "type" : "string", - "description" : "The OpenAIRE id for the project" + "id": { + "type": "string", + "description": "Description of id" }, - "provenance" : { - "$ref" : "#/definitions/Provenance" + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] }, - "title" : { - "type" : "string" + "title": { + "type": "string", + "description": "Description of title" }, - "validated" : { - "type" : "object", - "properties" : { - "validatedByFunder" : { - "type" : "boolean" + "validated": { + "type": "object", + "properties": { + "validatedByFunder": { + "type": "boolean", + "description": "Description of validatedByFunder" }, - "validationDate" : { - "type" : "string" + "validationDate": { + "type": "string", + "description": "Description of validationDate" } - } + }, + "description": "Description of validated" } }, - "description" : "List of projects (i.e. grants) that (co-)funded the production ofn the research results" + "description": "Description of projects" } }, - "publicationdate" : { - "type" : "string", - "description" : "Main date of the research product: typically the publication or issued date. In case of a research result with different versions with different dates, the date of the result is selected as the most frequent well-formatted date. If not available, then the most recent and complete date among those that are well-formatted. For statistics, the year is extracted and the result is counted only among the result of that year. Example: Pre-print date: 2019-02-03, Article date provided by repository: 2020-02, Article date provided by Crossref: 2020, OpenAIRE will set as date 2019-02-03, because it’s the most recent among the complete and well-formed dates. If then the repository updates the metadata and set a complete date (e.g. 2020-02-12), then this will be the new date for the result because it becomes the most recent most complete date. However, if OpenAIRE then collects the pre-print from another repository with date 2019-02-03, then this will be the “winning date” because it becomes the most frequent well-formatted date." + "publicationdate": { + "type": "string", + "description": "Description of publicationdate" }, - "publisher" : { - "type" : "string", - "description" : "The name of the entity that holds, archives, publishes prints, distributes, releases, issues, or produces the resource." + "publiclyFunded": { + "type": "boolean", + "description": "Description of publiclyFunded" }, - "size" : { - "type" : "string", - "description" : "Only for results with type 'dataset': the declared size of the dataset" + "publisher": { + "type": "string", + "description": "Description of publisher" }, - "source" : { - "description" : "See definition of Dublin Core field dc:source", - "type" : "array", - "items" : { - "type" : "string", - "description" : "See definition of Dublin Core field dc:source" + "size": { + "type": "string", + "description": "Description of size" + }, + "source": { + "description": "Description of source", + "type": "array", + "items": { + "type": "string", + "description": "Description of source" } }, - "subjects" : { - "description" : "Keywords associated to the result", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "Why this subject is associated to the result" - } ] + "subjects": { + "description": "Description of subjects", + "type": "array", + "items": { + "type": "object", + "properties": { + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] }, - "subject" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies)." + "subject": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" }, - "value" : { - "type" : "string", - "description" : "The value for the subject in the selected scheme. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)." + "value": { + "type": "string", + "description": "Description of value" } - } + }, + "description": "Description of subject" } }, - "description" : "Keywords associated to the result" + "description": "Description of subjects" } }, - "subtitle" : { - "type" : "string", - "description" : "Explanatory or alternative name by which a scientific result is known." + "subtitle": { + "type": "string", + "description": "Description of subtitle" }, - "tool" : { - "description" : "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product" + "tool": { + "description": "Description of tool", + "type": "array", + "items": { + "type": "string", + "description": "Description of tool" } }, - "type" : { - "type" : "string", - "description" : "Type of the result: one of 'publication', 'dataset', 'software', 'other' (see also https://api.openaire.eu/vocabularies/dnet:result_typologies)" + "type": { + "type": "string", + "description": "Description of type" }, - "version" : { - "type" : "string", - "description" : "Version of the result" + "version": { + "type": "string", + "description": "Description of version" } } -} +} \ No newline at end of file diff --git a/dump-schema/src/main/resources/jsonschemas/result_schema.json b/dump-schema/src/main/resources/jsonschemas/result_schema.json index 7120763..5d6cd62 100644 --- a/dump-schema/src/main/resources/jsonschemas/result_schema.json +++ b/dump-schema/src/main/resources/jsonschemas/result_schema.json @@ -1,493 +1,553 @@ { - "$schema" : "http://json-schema.org/draft-07/schema#", - "definitions" : { - "Provenance" : { - "type" : "object", - "properties" : { - "provenance" : { - "type" : "string" + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "Provenance": { + "type": "object", + "properties": { + "provenance": { + "type": "string", + "description": "Description of provenance" }, - "trust" : { - "type" : "string" + "trust": { + "type": "string", + "description": "Description of trust" } } }, - "ResultPid" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "The scheme of the persistent identifier for the result (i.e. doi). If the pid is here it means the information for the pid has been collected from an authority for that pid type (i.e. Crossref/Datacite for doi). The set of authoritative pid is: doi when collected from Crossref or Datacite pmid when collected from EuroPubmed, arxiv when collected from arXiv, handle from the repositories" + "ResultPid": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" }, - "value" : { - "type" : "string", - "description" : "The value expressed in the scheme (i.e. 10.1000/182)" + "value": { + "type": "string", + "description": "Description of value" } } } }, - "type" : "object", - "properties" : { - "author" : { - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "fullname" : { - "type" : "string" + "type": "object", + "properties": { + "author": { + "description": "Description of author", + "type": "array", + "items": { + "type": "object", + "properties": { + "fullname": { + "type": "string", + "description": "Description of fullname" }, - "name" : { - "type" : "string" + "name": { + "type": "string", + "description": "Description of name" }, - "pid" : { - "type" : "object", - "properties" : { - "id" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "The author's pid scheme. OpenAIRE currently supports 'ORCID'" + "pid": { + "type": "object", + "properties": { + "id": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" }, - "value" : { - "type" : "string", - "description" : "The author's pid value in that scheme (i.e. 0000-1111-2222-3333)" + "value": { + "type": "string", + "description": "Description of value" } - } - }, - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "The reason why the pid was associated to the author" - } ] - } - }, - "description" : "The author's persistent identifiers" - }, - "rank" : { - "type" : "integer" - }, - "surname" : { - "type" : "string" - } - } - } - }, - "bestaccessright" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label" : { - "type" : "string", - "description" : "Label for the access mode" - }, - "scheme" : { - "type" : "string", - "description" : "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - }, - "description" : "The openest of the access rights of this result." - }, - "codeRepositoryUrl" : { - "type" : "string", - "description" : "Only for results with type 'software': the URL to the repository with the source code" - }, - "contactgroup" : { - "description" : "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource" - } - }, - "contactperson" : { - "description" : "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource" - } - }, - "container" : { - "type" : "object", - "properties" : { - "conferencedate" : { - "type" : "string" - }, - "conferenceplace" : { - "type" : "string" - }, - "edition" : { - "type" : "string", - "description" : "Edition of the journal or conference proceeding" - }, - "ep" : { - "type" : "string", - "description" : "End page" - }, - "iss" : { - "type" : "string", - "description" : "Journal issue number" - }, - "issnLinking" : { - "type" : "string" - }, - "issnOnline" : { - "type" : "string" - }, - "issnPrinted" : { - "type" : "string" - }, - "name" : { - "type" : "string", - "description" : "Name of the journal or conference" - }, - "sp" : { - "type" : "string", - "description" : "Start page" - }, - "vol" : { - "type" : "string", - "description" : "Volume" - } - }, - "description" : "Container has information about the conference or journal where the result has been presented or published" - }, - "contributor" : { - "description" : "Contributors for the result", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Contributors for the result" - } - }, - "country" : { - "description" : "The list of countries associated to this result", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "ISO 3166-1 alpha-2 country code (i.e. IT)" - }, - "label" : { - "type" : "string", - "description" : "The label for that code (i.e. Italy)" - }, - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "Why this result is associated to the country." - } ] - } - }, - "description" : "The list of countries associated to this result" - } - }, - "coverage" : { - "type" : "array", - "items" : { - "type" : "string" - } - }, - "dateofcollection" : { - "type" : "string", - "description" : "When OpenAIRE collected the record the last time" - }, - "description" : { - "type" : "array", - "items" : { - "type" : "string" - } - }, - "documentationUrl" : { - "description" : "Only for results with type 'software': URL to the software documentation", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'software': URL to the software documentation" - } - }, - "embargoenddate" : { - "type" : "string", - "description" : "Date when the embargo ends and this result turns Open Access" - }, - "format" : { - "type" : "array", - "items" : { - "type" : "string" - } - }, - "geolocation" : { - "description" : "Geolocation information", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "box" : { - "type" : "string" - }, - "place" : { - "type" : "string" - }, - "point" : { - "type" : "string" - } - }, - "description" : "Geolocation information" - } - }, - "id" : { - "type" : "string", - "description" : "The OpenAIRE identifiers for this result" - }, - "indicators" : { - "type" : "object", - "properties" : { - "bipIndicators" : { - "description" : "The impact measures (i.e. popularity)", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "class" : { - "type" : "string" - }, - "indicator" : { - "type" : "string" - }, - "score" : { - "type" : "string" - } - }, - "description" : "The impact measures (i.e. popularity)" - } - }, - "usageCounts" : { - "type" : "object", - "properties" : { - "downloads" : { - "type" : "string" - }, - "views" : { - "type" : "string" - } - }, - "description" : "The usage counts (i.e. downloads)" - } - }, - "description" : "Indicators computed for this result, for example UsageCount ones" - }, - "instance" : { - "description" : "Each instance is one specific materialisation or version of the result. For example, you can have one result with three instance: one is the pre-print, one is the post-print, one is te published version", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "accessright" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label" : { - "type" : "string", - "description" : "Label for the access mode" - }, - "openAccessRoute" : { - "type" : "string", - "enum" : [ "gold", "green", "hybrid", "bronze" ] - }, - "scheme" : { - "type" : "string", - "description" : "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - }, - "description" : "The accessRights for this materialization of the result" - }, - "alternateIdentifier" : { - "description" : "All the identifiers other than pids forged by an authorithy for the pid type (i.e. Crossref for DOIs", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "The scheme of the identifier. It can be a persistent identifier (i.e. doi). If it is present in the alternate identifiers it means it has not been forged by an authority for that pid. For example we collect metadata from an institutional repository that provides as identifier for the result also the doi" }, - "value" : { - "type" : "string", - "description" : "The value expressed in the scheme" - } + "description": "Description of id" }, - "description" : "All the identifiers other than pids forged by an authorithy for the pid type (i.e. Crossref for DOIs" - } - }, - "articleprocessingcharge" : { - "type" : "object", - "properties" : { - "amount" : { - "type" : "string" - }, - "currency" : { - "type" : "string" + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] } }, - "description" : "The money spent to make this book or article available in Open Access. Source for this information is the OpenAPC initiative." + "description": "Description of pid" }, - "license" : { - "type" : "string" + "rank": { + "type": "integer", + "description": "Description of rank" }, - "pid" : { - "type" : "array", - "items" : { - "$ref" : "#/definitions/ResultPid" - } - }, - "publicationdate" : { - "type" : "string", - "description" : "Date of the research product" - }, - "refereed" : { - "type" : "string", - "description" : "If this instance has been peer-reviewed or not. Allowed values are peerReviewed, nonPeerReviewed, UNKNOWN (as defined in https://api.openaire.eu/vocabularies/dnet:review_levels)" - }, - "type" : { - "type" : "string", - "description" : "The specific sub-type of this instance (see https://api.openaire.eu/vocabularies/dnet:result_typologies following the links)" - }, - "url" : { - "description" : "URLs to the instance. They may link to the actual full-text or to the landing page at the hosting source. ", - "type" : "array", - "items" : { - "type" : "string", - "description" : "URLs to the instance. They may link to the actual full-text or to the landing page at the hosting source. " - } + "surname": { + "type": "string", + "description": "Description of surname" } }, - "description" : "Each instance is one specific materialisation or version of the result. For example, you can have one result with three instance: one is the pre-print, one is the post-print, one is te published version" + "description": "Description of author" } }, - "language" : { - "type" : "object", - "properties" : { - "code" : { - "type" : "string", - "description" : "alpha-3/ISO 639-2 code of the language" + "bestaccessright": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" }, - "label" : { - "type" : "string", - "description" : "Language label in English" + "label": { + "type": "string", + "description": "Description of label" + }, + "scheme": { + "type": "string", + "description": "Description of scheme" } + }, + "description": "Description of bestaccessright" + }, + "codeRepositoryUrl": { + "type": "string", + "description": "Description of codeRepositoryUrl" + }, + "contactgroup": { + "description": "Description of contactgroup", + "type": "array", + "items": { + "type": "string", + "description": "Description of contactgroup" } }, - "lastupdatetimestamp" : { - "type" : "integer", - "description" : "Timestamp of last update of the record in OpenAIRE" - }, - "maintitle" : { - "type" : "string", - "description" : "A name or title by which a scientific result is known. May be the title of a publication, of a dataset or the name of a piece of software." - }, - "originalId" : { - "description" : "Identifiers of the record at the original sources", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Identifiers of the record at the original sources" + "contactperson": { + "description": "Description of contactperson", + "type": "array", + "items": { + "type": "string", + "description": "Description of contactperson" } }, - "pid" : { - "description" : "Persistent identifiers of the result", - "type" : "array", - "items" : { - "allOf" : [ { - "$ref" : "#/definitions/ResultPid" - }, { - "description" : "Persistent identifiers of the result" - } ] + "container": { + "type": "object", + "properties": { + "conferencedate": { + "type": "string", + "description": "Description of conferencedate" + }, + "conferenceplace": { + "type": "string", + "description": "Description of conferenceplace" + }, + "edition": { + "type": "string", + "description": "Description of edition" + }, + "ep": { + "type": "string", + "description": "Description of ep" + }, + "iss": { + "type": "string", + "description": "Description of iss" + }, + "issnLinking": { + "type": "string", + "description": "Description of issnLinking" + }, + "issnOnline": { + "type": "string", + "description": "Description of issnOnline" + }, + "issnPrinted": { + "type": "string", + "description": "Description of issnPrinted" + }, + "name": { + "type": "string", + "description": "Description of name" + }, + "sp": { + "type": "string", + "description": "Description of sp" + }, + "vol": { + "type": "string", + "description": "Description of vol" + } + }, + "description": "Description of container" + }, + "contributor": { + "description": "Description of contributor", + "type": "array", + "items": { + "type": "string", + "description": "Description of contributor" } }, - "programmingLanguage" : { - "type" : "string", - "description" : "Only for results with type 'software': the programming language" - }, - "publicationdate" : { - "type" : "string", - "description" : "Main date of the research product: typically the publication or issued date. In case of a research result with different versions with different dates, the date of the result is selected as the most frequent well-formatted date. If not available, then the most recent and complete date among those that are well-formatted. For statistics, the year is extracted and the result is counted only among the result of that year. Example: Pre-print date: 2019-02-03, Article date provided by repository: 2020-02, Article date provided by Crossref: 2020, OpenAIRE will set as date 2019-02-03, because it’s the most recent among the complete and well-formed dates. If then the repository updates the metadata and set a complete date (e.g. 2020-02-12), then this will be the new date for the result because it becomes the most recent most complete date. However, if OpenAIRE then collects the pre-print from another repository with date 2019-02-03, then this will be the “winning date” because it becomes the most frequent well-formatted date." - }, - "publisher" : { - "type" : "string", - "description" : "The name of the entity that holds, archives, publishes prints, distributes, releases, issues, or produces the resource." - }, - "size" : { - "type" : "string", - "description" : "Only for results with type 'dataset': the declared size of the dataset" - }, - "source" : { - "description" : "See definition of Dublin Core field dc:source", - "type" : "array", - "items" : { - "type" : "string", - "description" : "See definition of Dublin Core field dc:source" - } - }, - "subjects" : { - "description" : "Keywords associated to the result", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "Why this subject is associated to the result" - } ] + "country": { + "description": "Description of country", + "type": "array", + "items": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" }, - "subject" : { - "type" : "object", - "properties" : { - "scheme" : { - "type" : "string", - "description" : "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies)." + "label": { + "type": "string", + "description": "Description of label" + }, + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] + } + }, + "description": "Description of country" + } + }, + "coverage": { + "description": "Description of coverage", + "type": "array", + "items": { + "type": "string", + "description": "Description of coverage" + } + }, + "dateofcollection": { + "type": "string", + "description": "Description of dateofcollection" + }, + "description": { + "description": "Description of description", + "type": "array", + "items": { + "type": "string", + "description": "Description of description" + } + }, + "documentationUrl": { + "description": "Description of documentationUrl", + "type": "array", + "items": { + "type": "string", + "description": "Description of documentationUrl" + } + }, + "embargoenddate": { + "type": "string", + "description": "Description of embargoenddate" + }, + "format": { + "description": "Description of format", + "type": "array", + "items": { + "type": "string", + "description": "Description of format" + } + }, + "geolocation": { + "description": "Description of geolocation", + "type": "array", + "items": { + "type": "object", + "properties": { + "box": { + "type": "string", + "description": "Description of box" + }, + "place": { + "type": "string", + "description": "Description of place" + }, + "point": { + "type": "string", + "description": "Description of point" + } + }, + "description": "Description of geolocation" + } + }, + "id": { + "type": "string", + "description": "Description of id" + }, + "indicators": { + "type": "object", + "properties": { + "bipIndicators": { + "description": "Description of bipIndicators", + "type": "array", + "items": { + "type": "object", + "properties": { + "clazz": { + "type": "string", + "description": "Description of clazz" }, - "value" : { - "type" : "string", - "description" : "The value for the subject in the selected scheme. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)." + "indicator": { + "type": "string", + "description": "Description of indicator" + }, + "score": { + "type": "string", + "description": "Description of score" } + }, + "description": "Description of bipIndicators" + } + }, + "usageCounts": { + "type": "object", + "properties": { + "downloads": { + "type": "string", + "description": "Description of downloads" + }, + "views": { + "type": "string", + "description": "Description of views" + } + }, + "description": "Description of usageCounts" + } + }, + "description": "Description of indicators" + }, + "instance": { + "description": "Description of instance", + "type": "array", + "items": { + "type": "object", + "properties": { + "accessright": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" + }, + "label": { + "type": "string", + "description": "Description of label" + }, + "openAccessRoute": { + "type": "string", + "enum": [ + "gold", + "green", + "hybrid", + "bronze" + ], + "description": "Description of openAccessRoute" + }, + "scheme": { + "type": "string", + "description": "Description of scheme" + } + }, + "description": "Description of accessright" + }, + "alternateIdentifier": { + "description": "Description of alternateIdentifier", + "type": "array", + "items": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" + }, + "value": { + "type": "string", + "description": "Description of value" + } + }, + "description": "Description of alternateIdentifier" + } + }, + "articleprocessingcharge": { + "type": "object", + "properties": { + "amount": { + "type": "string", + "description": "Description of amount" + }, + "currency": { + "type": "string", + "description": "Description of currency" + } + }, + "description": "Description of articleprocessingcharge" + }, + "license": { + "type": "string", + "description": "Description of license" + }, + "pid": { + "description": "Description of pid", + "type": "array", + "items": { + "allOf": [ + {"$ref": "#/definitions/ResultPid"}, + {"description": "Description of pid"} + ] + } + }, + "publicationdate": { + "type": "string", + "description": "Description of publicationdate" + }, + "refereed": { + "type": "string", + "description": "Description of refereed" + }, + "type": { + "type": "string", + "description": "Description of type" + }, + "url": { + "description": "Description of url", + "type": "array", + "items": { + "type": "string", + "description": "Description of url" } } }, - "description" : "Keywords associated to the result" + "description": "Description of instance" } }, - "subtitle" : { - "type" : "string", - "description" : "Explanatory or alternative name by which a scientific result is known." + "isGreen": { + "type": "boolean", + "description": "Description of isGreen" }, - "tool" : { - "description" : "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product", - "type" : "array", - "items" : { - "type" : "string", - "description" : "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product" + "isInDiamondJournal": { + "type": "boolean", + "description": "Description of isInDiamondJournal" + }, + "language": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Description of code" + }, + "label": { + "type": "string", + "description": "Description of label" + } + }, + "description": "Description of language" + }, + "lastupdatetimestamp": { + "type": "integer", + "description": "Description of lastupdatetimestamp" + }, + "maintitle": { + "type": "string", + "description": "Description of maintitle" + }, + "openAccessColor": { + "type": "string", + "enum": [ + "gold", + "hybrid", + "bronze" + ], + "description": "Description of openAccessColor" + }, + "originalId": { + "description": "Description of originalId", + "type": "array", + "items": { + "type": "string", + "description": "Description of originalId" } }, - "type" : { - "type" : "string", - "description" : "Type of the result: one of 'publication', 'dataset', 'software', 'other' (see also https://api.openaire.eu/vocabularies/dnet:result_typologies)" + "pid": { + "description": "Description of pid", + "type": "array", + "items": { + "allOf": [ + {"$ref": "#/definitions/ResultPid"}, + {"description": "Description of pid"} + ] + } }, - "version" : { - "type" : "string", - "description" : "Version of the result" + "programmingLanguage": { + "type": "string", + "description": "Description of programmingLanguage" + }, + "publicationdate": { + "type": "string", + "description": "Description of publicationdate" + }, + "publiclyFunded": { + "type": "boolean", + "description": "Description of publiclyFunded" + }, + "publisher": { + "type": "string", + "description": "Description of publisher" + }, + "size": { + "type": "string", + "description": "Description of size" + }, + "source": { + "description": "Description of source", + "type": "array", + "items": { + "type": "string", + "description": "Description of source" + } + }, + "subjects": { + "description": "Description of subjects", + "type": "array", + "items": { + "type": "object", + "properties": { + "provenance": { + "allOf": [ + {"$ref": "#/definitions/Provenance"}, + {"description": "Description of provenance"} + ] + }, + "subject": { + "type": "object", + "properties": { + "scheme": { + "type": "string", + "description": "Description of scheme" + }, + "value": { + "type": "string", + "description": "Description of value" + } + }, + "description": "Description of subject" + } + }, + "description": "Description of subjects" + } + }, + "subtitle": { + "type": "string", + "description": "Description of subtitle" + }, + "tool": { + "description": "Description of tool", + "type": "array", + "items": { + "type": "string", + "description": "Description of tool" + } + }, + "type": { + "type": "string", + "description": "Description of type" + }, + "version": { + "type": "string", + "description": "Description of version" } } -} +} \ No newline at end of file diff --git a/dump-schema/src/test/java/GenerateJsonSchema.java b/dump-schema/src/test/java/GenerateJsonSchema.java index 35d20ff..63f8121 100644 --- a/dump-schema/src/test/java/GenerateJsonSchema.java +++ b/dump-schema/src/test/java/GenerateJsonSchema.java @@ -1,5 +1,6 @@ import java.io.IOException; +import eu.dnetlib.dhp.oa.model.Result; import org.junit.jupiter.api.Test; import com.fasterxml.jackson.core.JsonProcessingException; @@ -24,7 +25,7 @@ class GenerateJsonSchema { configBuilder.forFields().withDescriptionResolver(field -> "Description of " + field.getDeclaredName()); SchemaGeneratorConfig config = configBuilder.build(); SchemaGenerator generator = new SchemaGenerator(config); - JsonNode jsonSchema = generator.generateSchema(GraphResult.class); + JsonNode jsonSchema = generator.generateSchema(CommunityResult.class); System.out.println(jsonSchema.toString()); } @@ -41,7 +42,7 @@ class GenerateJsonSchema { .without(Option.NONPUBLIC_NONSTATIC_FIELDS_WITHOUT_GETTERS); SchemaGeneratorConfig config = configBuilder.build(); SchemaGenerator generator = new SchemaGenerator(config); - JsonNode jsonSchema = generator.generateSchema(Project.class); + JsonNode jsonSchema = generator.generateSchema(Result.class); System.out.println(jsonSchema.toString()); } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index a966dba..3c45e7f 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -2,6 +2,8 @@ package eu.dnetlib.dhp.oa.graph.dump; import static eu.dnetlib.dhp.oa.graph.dump.Constants.*; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId; import java.io.Serializable; import java.util.*; @@ -29,6 +31,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; public class ResultMapper implements Serializable { + private static final String NULL = "null"; public static Result map( E in, Map communityMap, String dumpType) @@ -60,7 +63,7 @@ public class ResultMapper implements Serializable { mapDescription(out, input); mapEmbargo(out, input); mapFormat(out, input); - out.setId(input.getId().substring(3)); + out.setId(getEntityId(input.getId(), ENTITY_ID_SEPARATOR)); mapOriginalId(out, input); mapInstance(dumpType, out, input); mapLanguage(out, input); @@ -100,7 +103,7 @@ public class ResultMapper implements Serializable { break; } - + } private static void mapContext(Map communityMap, CommunityResult out, @@ -175,7 +178,7 @@ public class ResultMapper implements Serializable { input .getCollectedfrom() .stream() - .map(cf -> CfHbKeyValue.newInstance(cf.getKey().substring(3), cf.getValue())) + .map(cf -> CfHbKeyValue.newInstance(getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), cf.getValue())) .collect(Collectors.toList())); } @@ -207,6 +210,7 @@ public class ResultMapper implements Serializable { // .getProvenanceaction() // .getClassid() // .equalsIgnoreCase("subject:sdg")))) + .filter(s -> !s.getValue().equalsIgnoreCase(NULL)) .forEach(s -> subjectList.add(getSubject(s)))); out.setSubjects(subjectList); @@ -541,14 +545,18 @@ public class ResultMapper implements Serializable { instance .setCollectedfrom( CfHbKeyValue - .newInstance(i.getCollectedfrom().getKey().substring(3), i.getCollectedfrom().getValue())); + .newInstance( + getEntityId(i.getCollectedfrom().getKey(), ENTITY_ID_SEPARATOR), + i.getCollectedfrom().getValue())); if (Optional.ofNullable(i.getHostedby()).isPresent() && Optional.ofNullable(i.getHostedby().getKey()).isPresent() && StringUtils.isNotBlank(i.getHostedby().getKey())) instance .setHostedby( - CfHbKeyValue.newInstance(i.getHostedby().getKey().substring(3), i.getHostedby().getValue())); + CfHbKeyValue + .newInstance( + getEntityId(i.getHostedby().getKey(), ENTITY_ID_SEPARATOR), i.getHostedby().getValue())); return instance; diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Utils.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Utils.java index 8bbda00..c72955c 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Utils.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Utils.java @@ -37,6 +37,7 @@ import scala.Tuple2; public class Utils { public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static final String ENTITY_ID_SEPARATOR = "|"; private Utils() { } @@ -83,6 +84,10 @@ public class Utils { return new Gson().fromJson(sb.toString(), CommunityMap.class); } + public static String getEntityId(String id, String separator) { + return id.substring(id.indexOf(separator) + 1); + } + public static Dataset getEntitiesId(SparkSession spark, String inputPath) { Dataset dumpedIds = Utils .readPath(spark, inputPath + "/publication", GraphResult.class) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java index 7fa3764..a4f9e2d 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/CommunitySplit.java @@ -67,7 +67,7 @@ public class CommunitySplit implements Serializable { .write() .option("compression", "gzip") .mode(SaveMode.Overwrite) - .text(outputPath + "/" + communities.get(c).replace(" ", "_")); + .text(outputPath + "/" + c.replace(" ", "_")); }); } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/SparkPrepareResultProject.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/SparkPrepareResultProject.java index 8c4faba..42fd683 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/SparkPrepareResultProject.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/community/SparkPrepareResultProject.java @@ -2,6 +2,8 @@ package eu.dnetlib.dhp.oa.graph.dump.community; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId; import java.io.Serializable; import java.io.StringReader; @@ -110,7 +112,7 @@ public class SparkPrepareResultProject implements Serializable { Tuple2 first = it.next(); ResultProject rp = new ResultProject(); if (substring) - rp.setResultId(s.substring(3)); + rp.setResultId(getEntityId(s, ENTITY_ID_SEPARATOR)); else rp.setResultId(s); eu.dnetlib.dhp.schema.oaf.Project p = first._1(); @@ -142,7 +144,7 @@ public class SparkPrepareResultProject implements Serializable { private static Project getProject(eu.dnetlib.dhp.schema.oaf.Project op, Relation relation) { Project p = Project .newInstance( - op.getId().substring(3), + getEntityId(op.getId(), ENTITY_ID_SEPARATOR), op.getCode().getValue(), Optional .ofNullable(op.getAcronym()) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/Extractor.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/Extractor.java index 8315808..794e769 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/Extractor.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/Extractor.java @@ -2,6 +2,8 @@ package eu.dnetlib.dhp.oa.graph.dump.complete; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId; import java.io.Serializable; import java.util.*; @@ -84,7 +86,7 @@ public class Extractor implements Serializable { .orElse(null)) .orElse(null); Relation r = getRelation( - value.getId().substring(3), contextId, + getEntityId(value.getId(), ENTITY_ID_SEPARATOR), contextId, Constants.RESULT_ENTITY, Constants.CONTEXT_ENTITY, ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP, provenance); @@ -94,7 +96,7 @@ public class Extractor implements Serializable { hashCodes.add(r.hashCode()); } r = getRelation( - contextId, value.getId().substring(3), + contextId, getEntityId(value.getId(), ENTITY_ID_SEPARATOR), Constants.CONTEXT_ENTITY, Constants.RESULT_ENTITY, ModelConstants.IS_RELATED_TO, @@ -163,8 +165,8 @@ public class Extractor implements Serializable { eu.dnetlib.dhp.oa.graph.dump.Constants.HARVESTED, eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)); Relation r = getRelation( - value.getId().substring(3), - cf.getKey().substring(3), Constants.RESULT_ENTITY, Constants.DATASOURCE_ENTITY, + getEntityId(value.getId(), ENTITY_ID_SEPARATOR), + getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), Constants.RESULT_ENTITY, Constants.DATASOURCE_ENTITY, resultDatasource, ModelConstants.PROVISION, provenance); if (!hashCodes.contains(r.hashCode())) { @@ -174,7 +176,7 @@ public class Extractor implements Serializable { } r = getRelation( - cf.getKey().substring(3), value.getId().substring(3), + getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), getEntityId(value.getId(), ENTITY_ID_SEPARATOR), Constants.DATASOURCE_ENTITY, Constants.RESULT_ENTITY, datasourceResult, ModelConstants.PROVISION, provenance); diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java index e9ad376..218a123 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java @@ -2,6 +2,8 @@ package eu.dnetlib.dhp.oa.graph.dump.complete; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId; import java.io.Serializable; import java.io.StringReader; @@ -216,7 +218,7 @@ public class SparkDumpEntitiesJob implements Serializable { return null; Datasource datasource = new Datasource(); - datasource.setId(d.getId().substring(3)); + datasource.setId(getEntityId(d.getId(), ENTITY_ID_SEPARATOR)); Optional .ofNullable(d.getOriginalId()) @@ -406,7 +408,7 @@ public class SparkDumpEntitiesJob implements Serializable { Optional .ofNullable(p.getId()) - .ifPresent(id -> project.setId(id.substring(3))); + .ifPresent(id -> project.setId(getEntityId(id, ENTITY_ID_SEPARATOR))); Optional .ofNullable(p.getWebsiteurl()) @@ -619,7 +621,7 @@ public class SparkDumpEntitiesJob implements Serializable { Optional .ofNullable(org.getId()) - .ifPresent(value -> organization.setId(value.substring(3))); + .ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR))); Optional .ofNullable(org.getPid()) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java index 5c84c55..0abf994 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java @@ -2,6 +2,8 @@ package eu.dnetlib.dhp.oa.graph.dump.complete; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId; import java.io.Serializable; import java.util.Collections; @@ -85,11 +87,11 @@ public class SparkDumpRelationJob implements Serializable { .map((MapFunction) relation -> { eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation(); relNew - .setSource(relation.getSource().substring(3)); + .setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR)); relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2))); relNew - .setTarget(relation.getTarget().substring(3)); + .setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR)); relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2))); relNew diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java index 73894b6..aa9b7a0 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java @@ -2,6 +2,9 @@ package eu.dnetlib.dhp.oa.graph.dump.complete; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR; +import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId; +import static eu.dnetlib.dhp.schema.common.ModelSupport.idPrefixMap; import java.io.Serializable; import java.util.ArrayList; @@ -27,8 +30,10 @@ import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; import eu.dnetlib.dhp.oa.model.Provenance; import eu.dnetlib.dhp.oa.model.graph.RelType; +import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; /** @@ -103,7 +108,7 @@ public class SparkOrganizationRelation implements Serializable { .as(Encoders.bean(MergedRels.class)); mergedRelsDataset.map((MapFunction) mergedRels -> { - if (organizationMap.containsKey(mergedRels.getOrganizationId())) { + if (organizationMap.containsKey(getEntityId(mergedRels.getOrganizationId(), ENTITY_ID_SEPARATOR))) { return mergedRels; } return null; @@ -135,12 +140,13 @@ public class SparkOrganizationRelation implements Serializable { private static Consumer getMergedRelsConsumer(CommunityEntityMap organizationMap, List relList, CommunityMap communityMap) { return mergedRels -> { - String oId = mergedRels.getOrganizationId(); + String oId = getEntityId(mergedRels.getOrganizationId(), ENTITY_ID_SEPARATOR); organizationMap .get(oId) .forEach(community -> { if (communityMap.containsKey(community)) { - addRelations(relList, community, mergedRels.getRepresentativeId()); + addRelations( + relList, community, getEntityId(mergedRels.getRepresentativeId(), ENTITY_ID_SEPARATOR)); } }); @@ -158,8 +164,8 @@ public class SparkOrganizationRelation implements Serializable { eu.dnetlib.dhp.oa.model.graph.Relation .newInstance( id, Constants.CONTEXT_ENTITY, - organization.substring(3), - ModelSupport.idPrefixEntity.get(organization.substring(0, 2)), + organization, + ModelSupport.idPrefixEntity.get(idPrefixMap.get(Organization.class)), RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), Provenance .newInstance( @@ -170,7 +176,7 @@ public class SparkOrganizationRelation implements Serializable { .add( eu.dnetlib.dhp.oa.model.graph.Relation .newInstance( - organization.substring(3), ModelSupport.idPrefixEntity.get(organization.substring(0, 2)), + organization, ModelSupport.idPrefixEntity.get(idPrefixMap.get(Organization.class)), id, Constants.CONTEXT_ENTITY, RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP), Provenance diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/zenodoapi/ZenodoAPIClient.java b/dump/src/main/java/eu/dnetlib/dhp/oa/zenodoapi/ZenodoAPIClient.java index 7e9231e..7aeaff7 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/zenodoapi/ZenodoAPIClient.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/zenodoapi/ZenodoAPIClient.java @@ -204,11 +204,12 @@ public class ZenodoAPIClient implements Serializable { .build(); log.info("URL: " + request.url().toString()); - log.info("Headers: " + request.headers().toString()); + // log.info("Headers: " + request.headers().toString()); try (Response response = httpClient.newCall(request).execute()) { if (!response.isSuccessful()) - System.out.println("Unexpected code " + response + response.body().string()); + log.info("Unexpected code " + response + response.body().string()); + System.out.println("Unexpected code " + response + response.body().string()); return response.code(); } } @@ -354,13 +355,14 @@ public class ZenodoAPIClient implements Serializable { .build(); log.info("URL: " + request.url().toString()); - log.info("Headers: " + request.headers().toString()); + // log.info("Headers: " + request.headers().toString()); try (Response response = httpClient.newCall(request).execute()) { - if (!response.isSuccessful()) + if (!response.isSuccessful()) { + log.info("Unexpected code " + response + response.body().string()); throw new IOException("Unexpected code " + response + response.body().string()); - + } ZenodoModel zenodoModel = new Gson() .fromJson(response.body().string(), ZenodoModel.class); bucket = zenodoModel.getLinks().getBucket(); diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/countryresults/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/countryresults/oozie_app/workflow.xml index a6e68d0..0abe7cb 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/countryresults/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/countryresults/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + sourcePath diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/RelationFromOrganizationTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/RelationFromOrganizationTest.java index b9a0814..78b1374 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/RelationFromOrganizationTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/RelationFromOrganizationTest.java @@ -5,6 +5,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf;