[EOSC Dump] refactoring

This commit is contained in:
Miriam Baglioni 2022-10-04 10:10:59 +02:00
parent 6dc00dc801
commit 5489ed9834
3 changed files with 465 additions and 487 deletions

View File

@ -17,12 +17,8 @@
"Provenance": {
"type": "object",
"properties": {
"provenance" : {
"type" : "string"
},
"trust" : {
"type" : "string"
}
"provenance": {"type": "string"},
"trust": {"type": "string"}
}
},
"ResultPid": {
@ -41,17 +37,49 @@
},
"type": "object",
"properties": {
"affiliation": {
"description": "The list of organizations the result is affiliated to",
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "the OpenAIRE id of the organization"
},
"name": {
"type": "string",
"description": "the name of the organization"
},
"pid": {
"description": "the list of pids we have in OpenAIRE for the organization",
"type": "array",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "the type of the organization pid"
},
"value": {
"type": "string",
"description": "the value of the organization pid"
}
},
"description": "the list of pids we have in OpenAIRE for the organization"
}
}
},
"description": "The list of organizations the result is affiliated to"
}
},
"author": {
"type": "array",
"items": {
"type": "object",
"properties": {
"fullname" : {
"type" : "string"
},
"name" : {
"type" : "string"
},
"fullname": {"type": "string"},
"name": {"type": "string"},
"pid": {
"type": "object",
"properties": {
@ -69,21 +97,16 @@
}
},
"provenance": {
"allOf" : [ {
"$ref" : "#/definitions/Provenance"
}, {
"description" : "The reason why the pid was associated to the author"
} ]
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "The reason why the pid was associated to the author"}
]
}
},
"description": "The author's persistent identifiers"
},
"rank" : {
"type" : "integer"
},
"surname" : {
"type" : "string"
}
"rank": {"type": "integer"},
"surname": {"type": "string"}
}
}
},
@ -128,12 +151,8 @@
"container": {
"type": "object",
"properties": {
"conferencedate" : {
"type" : "string"
},
"conferenceplace" : {
"type" : "string"
},
"conferencedate": {"type": "string"},
"conferenceplace": {"type": "string"},
"edition": {
"type": "string",
"description": "Edition of the journal or conference proceeding"
@ -146,15 +165,9 @@
"type": "string",
"description": "Journal issue number"
},
"issnLinking" : {
"type" : "string"
},
"issnOnline" : {
"type" : "string"
},
"issnPrinted" : {
"type" : "string"
},
"issnLinking": {"type": "string"},
"issnOnline": {"type": "string"},
"issnPrinted": {"type": "string"},
"name": {
"type": "string",
"description": "Name of the journal or conference"
@ -188,11 +201,10 @@
"description": "Why this result is associated to the RI/RC.",
"type": "array",
"items": {
"allOf" : [ {
"$ref" : "#/definitions/Provenance"
}, {
"description" : "Why this result is associated to the RI/RC."
} ]
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this result is associated to the RI/RC."}
]
}
}
},
@ -222,11 +234,10 @@
"description": "The label for that code (e.g. Italy)"
},
"provenance": {
"allOf" : [ {
"$ref" : "#/definitions/Provenance"
}, {
"description" : "Why this result is associated to the country."
} ]
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this result is associated to the country."}
]
}
},
"description": "The list of countries associated to this result"
@ -234,9 +245,7 @@
},
"coverage": {
"type": "array",
"items" : {
"type" : "string"
}
"items": {"type": "string"}
},
"dateofcollection": {
"type": "string",
@ -244,9 +253,7 @@
},
"description": {
"type": "array",
"items" : {
"type" : "string"
}
"items": {"type": "string"}
},
"documentationUrl": {
"description": "Only for results with type 'software': URL to the software documentation",
@ -284,9 +291,7 @@
},
"format": {
"type": "array",
"items" : {
"type" : "string"
}
"items": {"type": "string"}
},
"geolocation": {
"description": "Geolocation information",
@ -294,21 +299,13 @@
"items": {
"type": "object",
"properties": {
"box" : {
"type" : "string"
},
"place" : {
"type" : "string"
},
"point" : {
"type" : "string"
}
"box": {"type": "string"},
"place": {"type": "string"},
"point": {"type": "string"}
},
"description": "Geolocation information"
}
},
"keywords": {
},
"id": {
"type": "string",
"description": "The OpenAIRE identifiers for this result"
@ -332,7 +329,12 @@
},
"openAccessRoute": {
"type": "string",
"enum" : [ "gold", "green", "hybrid", "bronze" ]
"enum": [
"gold",
"green",
"hybrid",
"bronze"
]
},
"scheme": {
"type": "string",
@ -362,26 +364,18 @@
"articleprocessingcharge": {
"type": "object",
"properties": {
"amount" : {
"type" : "string"
},
"currency" : {
"type" : "string"
}
"amount": {"type": "string"},
"currency": {"type": "string"}
},
"description": "The money spent to make this book or article available in Open Access. Source for this information is the OpenAPC initiative."
},
"hostedby": {
"allOf" : [ {
"$ref" : "#/definitions/CfHbKeyValue"
}, {
"description" : "Information about the source from which the instance can be viewed or downloaded."
} ]
},
"license" : {
"type" : "string"
"allOf": [
{"$ref": "#/definitions/CfHbKeyValue"},
{"description": "Information about the source from which the instance can be viewed or downloaded."}
]
},
"license": {"type": "string"},
"measures": {
"description": "Measures computed for this instance, for example Bip!Finder ones",
"type": "array",
@ -402,9 +396,7 @@
},
"pid": {
"type": "array",
"items" : {
"$ref" : "#/definitions/ResultPid"
}
"items": {"$ref": "#/definitions/ResultPid"}
},
"publicationdate": {
"type": "string",
@ -430,6 +422,14 @@
"description": "Each instance is one specific materialisation or version of the result. For example, you can have one result with three instances: one is the pre-print, one is the post-print, one is the published version"
}
},
"keywords": {
"description": "The list of keywords associated to the result",
"type": "array",
"items": {
"type": "string",
"description": "The list of keywords associated to the result"
}
},
"language": {
"type": "object",
"properties": {
@ -463,11 +463,10 @@
"description": "Persistent identifiers of the result",
"type": "array",
"items": {
"allOf" : [ {
"$ref" : "#/definitions/ResultPid"
}, {
"description" : "Persistent identifiers of the result"
} ]
"allOf": [
{"$ref": "#/definitions/ResultPid"},
{"description": "Persistent identifiers of the result"}
]
}
},
"programmingLanguage": {
@ -514,21 +513,13 @@
"type": "string",
"description": "The OpenAIRE id for the project"
},
"provenance" : {
"$ref" : "#/definitions/Provenance"
},
"title" : {
"type" : "string"
},
"provenance": {"$ref": "#/definitions/Provenance"},
"title": {"type": "string"},
"validated": {
"type": "object",
"properties": {
"validatedByFunder" : {
"type" : "boolean"
},
"validationDate" : {
"type" : "string"
}
"validatedByFunder": {"type": "boolean"},
"validationDate": {"type": "string"}
}
}
},
@ -537,7 +528,7 @@
},
"publicationdate": {
"type": "string",
"description" : "Main date of the research product: typically the publication or issued date. In case of a research result with different versions with different dates, the date of the result is selected as the most frequent well-formatted date. If not available, then the most recent and complete date among those that are well-formatted. For statistics, the year is extracted and the result is counted only among the result of that year. Example: Pre-print date: 2019-02-03, Article date provided by repository: 2020-02, Article date provided by Crossref: 2020, OpenAIRE will set as date 2019-02-03, because its the most recent among the complete and well-formed dates. If then the repository updates the metadata and set a complete date (e.g. 2020-02-12), then this will be the new date for the result because it becomes the most recent most complete date. However, if OpenAIRE then collects the pre-print from another repository with date 2019-02-03, then this will be the “winning date” because it becomes the most frequent well-formatted date."
"description": "Main date of the research product: typically the publication or issued date. In case of a research result with different versions with different dates, the date of the result is selected as the most frequent well-formatted date. If not available, then the most recent and complete date among those that are well-formatted. For statistics, the year is extracted and the result is counted only among the results of that year. Example: Pre-print date: 2019-02-03, Article date provided by repository: 2020-02, Article date provided by Crossref: 2020, OpenAIRE will set as date 2019-02-03, because it\u2019s the most recent among the complete and well-formed dates. If then the repository updates the metadata and sets a complete date (e.g. 2020-02-12), then this will be the new date for the result because it becomes the most recent most complete date. However, if OpenAIRE then collects the pre-print from another repository with date 2019-02-03, then this will be the \u201cwinning date\u201d because it becomes the most frequent well-formatted date."
},
"publisher": {
"type": "string",
@ -557,20 +548,7 @@
},
"subject": {
"type": "object",
"properties" : {
"scheme" : {
"type" : "string",
"description" : "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies)."
},
"value" : {
"type" : "string",
"description" : "The value for the subject in the selected scheme. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."
}
}
}
},
"description" : "Keywords associated to the result"
}
"description": "The subject dumped by type associated to the result"
},
"subtitle": {
"type": "string",

View File

@ -96,7 +96,7 @@
<arg>--outputPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--singleDeposition</arg><arg>${singleDeposition}</arg>
<arg>--singleDeposition</arg><arg>false</arg>
<arg>--communityId</arg><arg>${communityId}</arg>
</java>
<ok to="fork_dump_eosc_result"/>

View File

@ -99,7 +99,7 @@
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-dump-${projectVersion}.jar</jar>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
@ -124,7 +124,7 @@
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectsSubsetSparkJob</class>
<jar>dhp-graph-dump-${projectVersion}.jar</jar>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}