From ae10ae979330cd6172db81c78a7b1fb0719786fc Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 13 Sep 2022 12:10:47 +0200 Subject: [PATCH] [EOSC DUMP] extension to the schema to add the organization affiliated to the result --- .../eu/dnetlib/dhp/eosc/model/EoscResult.java | 16 ++++- .../dnetlib/dhp/eosc/model/Organization.java | 45 ++++++++++++ .../dhp/eosc/model/OrganizationPid.java | 35 ++++++++++ .../jsonschemas/eosc_result_schema.json | 34 +-------- .../src/test/java/GenerateJsonSchema.java | 2 +- .../dhp/oa/graph/dump/ResultMapper.java | 69 +++++++++++++++---- .../dump/eosc/SelectEoscResultsJobStep1.java | 9 ++- .../dump/eoscdump/oozie_app/workflow.xml | 2 +- 8 files changed, 161 insertions(+), 51 deletions(-) create mode 100644 dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java create mode 100644 dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java index 88699f9..8abd123 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java @@ -20,9 +20,19 @@ public class EoscResult extends CommunityResult { @JsonSchema(description = "The subject dumped by type associated to the result") private Map> subject; -// public EoscResult() { -// super(); -// } + @JsonSchema(description = "Te list of keywords associated to the result") + private List keywords; + + @JsonSchema(description = "The list of organizations the result is affiliated to") + private List affiliation; + + public List getKeywords() { + return keywords; + } + + public void setKeywords(List keywords) { + this.keywords = keywords; + } public EoscInteroperabilityFramework getEoscIF() { return eoscIF; diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java 
b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java new file mode 100644 index 0000000..f6a2b28 --- /dev/null +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java @@ -0,0 +1,45 @@ +package eu.dnetlib.dhp.eosc.model; + +import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; + +import java.io.Serializable; +import java.util.List; + +/** + * @author miriam.baglioni + * @Date 13/09/22 + */ +public class Organization implements Serializable { + @JsonSchema(description = "the OpenAIRE id of the organizaiton") + private String id; + + @JsonSchema(description = "the name of the organization") + private String name; + + @JsonSchema(description = "the list of pids we have in OpenAIRE for the organization") + private List pid ; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getPid() { + return pid; + } + + public void setPid(List pid) { + this.pid = pid; + } +} diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java new file mode 100644 index 0000000..824c20b --- /dev/null +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java @@ -0,0 +1,35 @@ +package eu.dnetlib.dhp.eosc.model; + +import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 13/09/22 + */ +public class OrganizationPid implements Serializable { + + @JsonSchema(description = "the type of the organization pid") + private String type; + + @JsonSchema(description = "the value of the organization pid") + private String value; + + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getValue() 
{ + return value; + } + + public void setValue(String value) { + this.value = value; + } +} diff --git a/dump-schema/src/main/resources/jsonschemas/eosc_result_schema.json b/dump-schema/src/main/resources/jsonschemas/eosc_result_schema.json index 27fd7b6..4c8d98e 100644 --- a/dump-schema/src/main/resources/jsonschemas/eosc_result_schema.json +++ b/dump-schema/src/main/resources/jsonschemas/eosc_result_schema.json @@ -109,17 +109,6 @@ "type" : "string", "description" : "Only for results with type 'software': the URL to the repository with the source code" }, - "collectedfrom" : { - "description" : "Information about the sources from which the record has been collected", - "type" : "array", - "items" : { - "allOf" : [ { - "$ref" : "#/definitions/CfHbKeyValue" - }, { - "description" : "Information about the sources from which the record has been collected" - } ] - } - }, "contactgroup" : { "description" : "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource", "type" : "array", @@ -318,6 +307,8 @@ "description" : "Geolocation information" } }, + "keywords": { + }, "id" : { "type" : "string", "description" : "The OpenAIRE identifiers for this result" @@ -380,13 +371,7 @@ }, "description" : "The money spent to make this book or article available in Open Access. Source for this information is the OpenAPC initiative." 
}, - "collectedfrom" : { - "allOf" : [ { - "$ref" : "#/definitions/CfHbKeyValue" - }, { - "description" : "Information about the source from which the record has been collected" - } ] - }, + "hostedby" : { "allOf" : [ { "$ref" : "#/definitions/CfHbKeyValue" @@ -570,19 +555,6 @@ "description" : "See definition of Dublin Core field dc:source" } }, - "subjects" : { - "description" : "Keywords associated to the result", - "type" : "array", - "items" : { - "type" : "object", - "properties" : { - "provenance" : { - "allOf" : [ { - "$ref" : "#/definitions/Provenance" - }, { - "description" : "Why this subject is associated to the result" - } ] - }, "subject" : { "type" : "object", "properties" : { diff --git a/dump-schema/src/test/java/GenerateJsonSchema.java b/dump-schema/src/test/java/GenerateJsonSchema.java index 7fe8076..b3a02fa 100644 --- a/dump-schema/src/test/java/GenerateJsonSchema.java +++ b/dump-schema/src/test/java/GenerateJsonSchema.java @@ -60,7 +60,7 @@ class GenerateJsonSchema { SchemaGenerator generator = new SchemaGenerator(config); JsonNode jsonSchema = generator.generateSchema(EoscResult.class); - System.out.println(new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema)); + System.out.println(new ObjectMapper().writeValueAsString(jsonSchema)); } @Test diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index 097efc4..11abd99 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.dump; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; @@ -231,7 +232,44 @@ public class ResultMapper implements Serializable { out.setSubjects(subjectList); } else { - 
((EoscResult) out).setSubject(createSubjectMap(input)); + + if (Optional.ofNullable(input.getSubject()).isPresent()) { + ((EoscResult) out).setSubject(createSubjectMap(input)); + ((EoscResult) out) + .setKeywords( + input + .getSubject() + .stream() + .filter( + s -> s.getQualifier().getClassid().equalsIgnoreCase("keyword") && + !s.getValue().equalsIgnoreCase("EOSC::RO-crate")) + .map(s -> s.getValue()) + .collect(Collectors.toList())); + long eoscSubjectNumber = input + .getSubject() + .stream() + .filter(s -> s.getValue().equalsIgnoreCase("EOSC::RO-crate")) + .count(); + if (eoscSubjectNumber > 1) { + throw new CardinalityTooHighException( + "EOSC IF in the result has cardinality greater than one. Change dump!"); + } + if (eoscSubjectNumber == 1) { + StructuredProperty ifra = input + .getSubject() + .stream() + .filter(s -> s.getValue().equalsIgnoreCase("EOSC::RO-crate")) + .findFirst() + .get(); + ((EoscResult) out) + .setEoscIF( + EoscInteroperabilityFramework + .newInstance( + ifra.getValue(), ifra.getValue(), "", + "compliesWith")); + + } + } } out.setType(input.getResulttype().getClassid()); @@ -341,21 +379,24 @@ public class ResultMapper implements Serializable { private static Map> createSubjectMap( eu.dnetlib.dhp.schema.oaf.Result input) { Map> map = new HashMap<>(); - if (!Optional.ofNullable(input.getSubject()).isPresent()) - return map; - input.getSubject().stream().forEach(s -> { - String key = s.getQualifier().getClassid(); - if (!map.containsKey(key) && !(key.equals("fos") || key.equals("sdg"))) { - map.put(key, new ArrayList<>()); + input.getSubject().stream().forEach(s -> { + String key = s.getQualifier().getClassid().toLowerCase(); + if (!key.equalsIgnoreCase("http://www.abs.gov.au/ausstats/abs@.nsf/0/6BB427AB9696C225CA2574180004463E") && + !key.equalsIgnoreCase("keyword") && + !key.equalsIgnoreCase("eosc")) { + if (!map.containsKey(key)) { + + map.put(key, new ArrayList<>()); + } + eu.dnetlib.dhp.eosc.model.Subject subject = new 
eu.dnetlib.dhp.eosc.model.Subject(); + subject.setValue(s.getValue()); + Provenance p = getProvenance(s); + if (p != null) { + subject.setProvenance(p); + } + map.get(key).add(subject); } - eu.dnetlib.dhp.eosc.model.Subject subject = new eu.dnetlib.dhp.eosc.model.Subject(); - subject.setValue(s.getValue()); - Provenance p = getProvenance(s); - if (p != null) { - subject.setProvenance(p); - } - map.get(key).add(subject); }); return map; } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java index 4df4b4f..b93b2b2 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java @@ -80,7 +80,14 @@ public class SelectEoscResultsJobStep1 implements Serializable { .readPath(spark, inputPath, inputClazz) .filter( (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible() - && r.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) + && (r.getContext().stream().anyMatch(c -> c.getId().equals("eosc")) || + Optional + .ofNullable(r.getSubject()) + .map( + s -> s + .stream() + .anyMatch(sbj -> sbj.getValue().equalsIgnoreCase("EOSC::RO-crate"))) + .orElse(false))) .map( (MapFunction) r -> (EoscResult) ResultMapper .map(r, communityMap, Constants.DUMPTYPE.EOSC.getType()), diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml index 182c935..cdb3001 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml @@ -314,7 +314,7 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} 
--sourcePath${workingDir}/dump/otherresearchproduct - --outputPath${workingDir}/tar/orp + --outputPath${workingDir}/tar/otherresearchproduct --preparedInfoPath${workingDir}/preparedInfo --dumpTypeeosc