From 752fd896e4e350ac29b0e9ce4e65a68244ca24de Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Fri, 1 Mar 2024 09:35:15 +0100
Subject: [PATCH] [SKG-IF] refactoring and fixing issues

---
 .../dnetlib/dhp/skgif/model/Contributor.java  |   8 +-
 .../dnetlib/dhp/skgif/model/Datasource.java   | 222 ++++---
 .../eu/dnetlib/dhp/skgif/model/Grant.java     | 221 +++---
 .../eu/dnetlib/dhp/skgif/model/Licence.java   |   1 +
 .../dnetlib/dhp/skgif/model/Organization.java | 113 ++--
 .../dhp/skgif/model/OrganizationTypes.java    |  23 +-
 .../model/PersistentIdentitySystems.java      |   1 +
 .../eu/dnetlib/dhp/skgif/model/Prefixes.java  |  26 +-
 .../dnetlib/dhp/skgif/model/RelationType.java |  13 +-
 .../dhp/skgif/model/ResearchProduct.java      |   2 +-
 .../eu/dnetlib/dhp/skgif/model/Venue.java     | 141 ++--
 .../dhp/skgif/model/VenueContribution.java    |  29 +-
 .../dhp/skgif/model/VenueIdentifierType.java  |  20 +-
 .../eu/dnetlib/dhp/skgif/model/VenueType.java |  19 +-
 .../CardinalityTooHighException.java          |   1 +
 .../NoAvailableEntityTypeException.java       |   1 +
 .../oa/graph/dump/skgif/DumpDatasource.java   | 244 ++++---
 .../dhp/oa/graph/dump/skgif/DumpGrant.java    | 285 ++++----
 .../oa/graph/dump/skgif/DumpOrganization.java | 178 ++---
 .../dhp/oa/graph/dump/skgif/DumpResult.java   | 305 ++++-----
 .../dhp/oa/graph/dump/skgif/DumpVenue.java    | 259 ++++----
 .../oa/graph/dump/skgif/EmitFromResults.java  |  55 +-
 .../dhp/oa/graph/dump/skgif/ResultMapper.java |  44 +-
 .../dhp/oa/graph/dump/skgif/Utils.java        |  20 +-
 .../dump/skgif/oozie_app/config-default.xml   |  30 +
 .../graph/dump/skgif/oozie_app/workflow.xml   | 216 ++++++
 .../dhp/oa/graph/dump/ZenodoUploadTest.java   |   5 +-
 .../graph/dump/skgif/DumpDatasourceTest.java  | 150 ++---
 .../oa/graph/dump/skgif/DumpGrantTest.java    | 111 ++--
 .../dump/skgif/DumpOrganizationTest.java      | 117 ++--
 .../oa/graph/dump/skgif/DumpResultTest.java   | 620 ++++++++++++------
 .../dump/skgif/EmitFromResultJobTest.java     |  98 ++-
 32 files changed, 2082 insertions(+), 1496 deletions(-)
 create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/config-default.xml
 create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/workflow.xml

diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Contributor.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Contributor.java
index ab41a8c..b47c0f5 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Contributor.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Contributor.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
@@ -7,8 +8,9 @@ import java.io.Serializable;
  * @Date 22/02/24
  */
 public class Contributor implements Serializable {
-    private String person; //I would not map it because we have only information regarding the person (if any) associated to the leading organization
-    private String organization ; //contributors.person
+    private String person; // I would not map it because we have only information regarding the person (if any)
+    // associated to the leading organization
+    private String organization; // contributors.person
 
-    private String role ;//private
+    private String role;// private
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Datasource.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Datasource.java
index 6a84982..d8b94ec 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Datasource.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Datasource.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
@@ -8,143 +9,146 @@ import java.util.List;
  * @Date 21/02/24
  */
 public class Datasource implements Serializable {
-    private String local_identifier ;//id
-    private List identifiers; //.schema pid.qualifier.classid;identifiers.value pid.value
-    private String name; //officialname.value
-    private String submission_policy_url;// submissionpolicyurl
-    private String preservation_policy_url;// preservationpolicyurl
-    private Boolean version_control;// versioncontrol bool
-    private List persistent_identity_systems;//. product_type researchentitytype list type to be remapped to the eosc types
-    //persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
-    private String jurisdiction;// jurisdiction.classname
-    private String data_source_classification;// eoscdatasourcetype.classname
-    private List research_product_type;// researchentitytype list type to be remapped to the eosc types
-    private Boolean thematic ;//thematic bool
-    private List research_product_license; //.name not mappable listresearch_product_license.url not mappable
-    private List research_product_access_policy;// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
-    //if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
-    //if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
-    private List research_product_metadata_license; //.name not mappable list
-    //research_product_metadata_license.url not mappable
-    private List research_product_metadata_access_policy ;//researchproductmetadataccesspolicies list with the same mapping of research_product_access_policy
+    private String local_identifier;// id
+    private List identifiers; // .schema pid.qualifier.classid;identifiers.value pid.value
+    private String name; // officialname.value
+    private String submission_policy_url;// submissionpolicyurl
+    private String preservation_policy_url;// preservationpolicyurl
+    private Boolean version_control;// versioncontrol bool
+    private List persistent_identity_systems;// . product_type researchentitytype list type
+    // to be remapped to the eosc types
+    // persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
+    private String jurisdiction;// jurisdiction.classname
+    private String data_source_classification;// eoscdatasourcetype.classname
+    private List research_product_type;// researchentitytype list type to be remapped to the eosc types
+    private Boolean thematic;// thematic bool
+    private List research_product_license; // .name not mappable listresearch_product_license.url not mappable
+    private List research_product_access_policy;// "databaseaccesstype if open => open access
+    // (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
+    // if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
+    // if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
+    private List research_product_metadata_license; // .name not mappable list
+    // research_product_metadata_license.url not mappable
+    private List research_product_metadata_access_policy;// researchproductmetadataccesspolicies list with the
+    // same mapping of research_product_access_policy
 
-    public String getLocal_identifier() {
-        return local_identifier;
-    }
+    public String getLocal_identifier() {
+        return local_identifier;
+    }
 
-    public void setLocal_identifier(String local_identifier) {
-        this.local_identifier = local_identifier;
-    }
+    public void setLocal_identifier(String local_identifier) {
+        this.local_identifier = local_identifier;
+    }
 
-    public List getIdentifiers() {
-        return identifiers;
-    }
+    public List getIdentifiers() {
+        return identifiers;
+    }
 
-    public void setIdentifiers(List identifiers) {
-        this.identifiers = identifiers;
-    }
+    public void setIdentifiers(List identifiers) {
+        this.identifiers = identifiers;
+    }
 
-    public String getName() {
-        return name;
-    }
+    public String getName() {
+        return name;
+    }
 
-    public void setName(String name) {
-        this.name = name;
-    }
+    public void setName(String name) {
+        this.name = name;
+    }
 
-    public String getSubmission_policy_url() {
-        return submission_policy_url;
-    }
+    public String getSubmission_policy_url() {
+        return submission_policy_url;
+    }
 
-    public void setSubmission_policy_url(String submission_policy_url) {
-        this.submission_policy_url = submission_policy_url;
-    }
+    public void setSubmission_policy_url(String submission_policy_url) {
+        this.submission_policy_url = submission_policy_url;
+    }
 
-    public String getPreservation_policy_url() {
-        return preservation_policy_url;
-    }
+    public String getPreservation_policy_url() {
+        return preservation_policy_url;
+    }
 
-    public void setPreservation_policy_url(String preservation_policy_url) {
-        this.preservation_policy_url = preservation_policy_url;
-    }
+    public void setPreservation_policy_url(String preservation_policy_url) {
+        this.preservation_policy_url = preservation_policy_url;
+    }
 
-    public Boolean getVersion_control() {
-        return version_control;
-    }
+    public Boolean getVersion_control() {
+        return version_control;
+    }
 
-    public void setVersion_control(Boolean version_control) {
-        this.version_control = version_control;
-    }
+    public void setVersion_control(Boolean version_control) {
+        this.version_control = version_control;
+    }
 
-    public List getPersistent_identity_systems() {
-        return persistent_identity_systems;
-    }
+    public List getPersistent_identity_systems() {
+        return persistent_identity_systems;
+    }
 
-    public void setPersistent_identity_systems(List persistent_identity_systems) {
-        this.persistent_identity_systems = persistent_identity_systems;
-    }
+    public void setPersistent_identity_systems(List persistent_identity_systems) {
+        this.persistent_identity_systems = persistent_identity_systems;
+    }
 
-    public String getJurisdiction() {
-        return jurisdiction;
-    }
+    public String getJurisdiction() {
+        return jurisdiction;
+    }
 
-    public void setJurisdiction(String jurisdiction) {
-        this.jurisdiction = jurisdiction;
-    }
+    public void setJurisdiction(String jurisdiction) {
+        this.jurisdiction = jurisdiction;
+    }
 
-    public String getData_source_classification() {
-        return data_source_classification;
-    }
+    public String getData_source_classification() {
+        return data_source_classification;
+    }
 
-    public void setData_source_classification(String data_source_classification) {
-        this.data_source_classification = data_source_classification;
-    }
+    public void setData_source_classification(String data_source_classification) {
+        this.data_source_classification = data_source_classification;
+    }
 
-    public List getResearch_product_type() {
-        return research_product_type;
-    }
+    public List getResearch_product_type() {
+        return research_product_type;
+    }
 
-    public void setResearch_product_type(List research_product_type) {
-        this.research_product_type = research_product_type;
-    }
+    public void setResearch_product_type(List research_product_type) {
+        this.research_product_type = research_product_type;
+    }
 
-    public Boolean getThematic() {
-        return thematic;
-    }
+    public Boolean getThematic() {
+        return thematic;
+    }
 
-    public void setThematic(Boolean thematic) {
-        this.thematic = thematic;
-    }
+    public void setThematic(Boolean thematic) {
+        this.thematic = thematic;
+    }
 
-    public List getResearch_product_license() {
-        return research_product_license;
-    }
+    public List getResearch_product_license() {
+        return research_product_license;
+    }
 
-    public void setResearch_product_license(List research_product_license) {
-        this.research_product_license = research_product_license;
-    }
+    public void setResearch_product_license(List research_product_license) {
+        this.research_product_license = research_product_license;
+    }
 
-    public List getResearch_product_access_policy() {
-        return research_product_access_policy;
-    }
+    public List getResearch_product_access_policy() {
+        return research_product_access_policy;
+    }
 
-    public void setResearch_product_access_policy(List research_product_access_policy) {
-        this.research_product_access_policy = research_product_access_policy;
-    }
+    public void setResearch_product_access_policy(List research_product_access_policy) {
+        this.research_product_access_policy = research_product_access_policy;
+    }
 
-    public List getResearch_product_metadata_license() {
-        return research_product_metadata_license;
-    }
+    public List getResearch_product_metadata_license() {
+        return research_product_metadata_license;
+    }
 
-    public void setResearch_product_metadata_license(List research_product_metadata_license) {
-        this.research_product_metadata_license = research_product_metadata_license;
-    }
+    public void setResearch_product_metadata_license(List research_product_metadata_license) {
+        this.research_product_metadata_license = research_product_metadata_license;
+    }
 
-    public List getResearch_product_metadata_access_policy() {
-        return research_product_metadata_access_policy;
-    }
+    public List getResearch_product_metadata_access_policy() {
+        return research_product_metadata_access_policy;
+    }
 
-    public void setResearch_product_metadata_access_policy(List research_product_metadata_access_policy) {
-        this.research_product_metadata_access_policy = research_product_metadata_access_policy;
-    }
+    public void setResearch_product_metadata_access_policy(List research_product_metadata_access_policy) {
+        this.research_product_metadata_access_policy = research_product_metadata_access_policy;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Grant.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Grant.java
index 780437d..05abf66 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Grant.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Grant.java
@@ -1,153 +1,154 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
+import java.io.Serializable;
 import java.util.List;
 
 import org.codehaus.jackson.annotate.JsonProperty;
 
-import java.io.Serializable;
-
 /**
  * @author miriam.baglioni
  * @Date 22/02/24
  */
 public class Grant implements Serializable {
-    private String local_identifier;// id
-    private List identifiers;//.schema pid.qualifier.classid identifiers.value pid.value
-    //identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
-    //identifiers.value project.code
+    private String local_identifier;// id
+    private List identifiers;// .schema pid.qualifier.classid identifiers.value pid.value
+    // identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
+    // identifiers.value project.code
 
-    private String title;// title.value
-    @JsonProperty(value="abstract")
-    private String summary ;//summary.value
-    private String acronym; //acronym.value
-    private String funder ;//fundingtree to be used the xpath //funder/name
-    private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
-    private String currency;// currency.value
-    private Float funded_amount;//' fundedamount.value
-    private List keywords;// subject.value
-    private String start_date;// startdate.value
-    private String end_date;// enddate.value
-    private String website;// websiteurl.value
-    private List beneficiaries;// organization.id for the organizations in the relation with semantic class isParticipant produces the list of organization internal identifiers
-    private List contributors;//
+    private String title;// title.value
+    @JsonProperty(value = "abstract")
+    private String summary;// summary.value
+    private String acronym; // acronym.value
+    private String funder;// fundingtree to be used the xpath //funder/name
+    private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
+    private String currency;// currency.value
+    private Float funded_amount;// ' fundedamount.value
+    private List keywords;// subject.value
+    private String start_date;// startdate.value
+    private String end_date;// enddate.value
+    private String website;// websiteurl.value
+    private List beneficiaries;// organization.id for the organizations in the relation with semantic class
+    // isParticipant produces the list of organization internal identifiers
+    private List contributors;//
 
-    public String getLocal_identifier() {
-        return local_identifier;
-    }
+    public String getLocal_identifier() {
+        return local_identifier;
+    }
 
-    public void setLocal_identifier(String local_identifier) {
-        this.local_identifier = local_identifier;
-    }
+    public void setLocal_identifier(String local_identifier) {
+        this.local_identifier = local_identifier;
+    }
 
-    public List getIdentifiers() {
-        return identifiers;
-    }
+    public List getIdentifiers() {
+        return identifiers;
+    }
 
-    public void setIdentifiers(List identifiers) {
-        this.identifiers = identifiers;
-    }
+    public void setIdentifiers(List identifiers) {
+        this.identifiers = identifiers;
+    }
 
-    public String getTitle() {
-        return title;
-    }
+    public String getTitle() {
+        return title;
+    }
 
-    public void setTitle(String title) {
-        this.title = title;
-    }
+    public void setTitle(String title) {
+        this.title = title;
+    }
 
-    public String getSummary() {
-        return summary;
-    }
+    public String getSummary() {
+        return summary;
+    }
 
-    public void setSummary(String summary) {
-        this.summary = summary;
-    }
+    public void setSummary(String summary) {
+        this.summary = summary;
+    }
 
-    public String getAcronym() {
-        return acronym;
-    }
+    public String getAcronym() {
+        return acronym;
+    }
 
-    public void setAcronym(String acronym) {
-        this.acronym = acronym;
-    }
+    public void setAcronym(String acronym) {
+        this.acronym = acronym;
+    }
 
-    public String getFunder() {
-        return funder;
-    }
+    public String getFunder() {
+        return funder;
+    }
 
-    public void setFunder(String funder) {
-        this.funder = funder;
-    }
+    public void setFunder(String funder) {
+        this.funder = funder;
+    }
 
-    public String getFunding_stream() {
-        return funding_stream;
-    }
+    public String getFunding_stream() {
+        return funding_stream;
+    }
 
-    public void setFunding_stream(String funding_stream) {
-        this.funding_stream = funding_stream;
-    }
+    public void setFunding_stream(String funding_stream) {
+        this.funding_stream = funding_stream;
+    }
 
-    public String getCurrency() {
-        return currency;
-    }
+    public String getCurrency() {
+        return currency;
+    }
 
-    public void setCurrency(String currency) {
-        this.currency = currency;
-    }
+    public void setCurrency(String currency) {
+        this.currency = currency;
+    }
 
-    public Float getFunded_amount() {
-        return funded_amount;
-    }
+    public Float getFunded_amount() {
+        return funded_amount;
+    }
 
-    public void setFunded_amount(Float funded_amount) {
-        this.funded_amount = funded_amount;
-    }
+    public void setFunded_amount(Float funded_amount) {
+        this.funded_amount = funded_amount;
+    }
 
-    public List getKeywords() {
-        return keywords;
-    }
+    public List getKeywords() {
+        return keywords;
+    }
 
-    public void setKeywords(List keywords) {
-        this.keywords = keywords;
-    }
+    public void setKeywords(List keywords) {
+        this.keywords = keywords;
+    }
 
-    public String getStart_date() {
-        return start_date;
-    }
+    public String getStart_date() {
+        return start_date;
+    }
 
-    public void setStart_date(String start_date) {
-        this.start_date = start_date;
-    }
+    public void setStart_date(String start_date) {
+        this.start_date = start_date;
+    }
 
-    public String getEnd_date() {
-        return end_date;
-    }
+    public String getEnd_date() {
+        return end_date;
+    }
 
-    public void setEnd_date(String end_date) {
-        this.end_date = end_date;
-    }
+    public void setEnd_date(String end_date) {
+        this.end_date = end_date;
+    }
 
-    public String getWebsite() {
-        return website;
-    }
+    public String getWebsite() {
+        return website;
+    }
 
-    public void setWebsite(String website) {
-        this.website = website;
-    }
+    public void setWebsite(String website) {
+        this.website = website;
+    }
 
-    public List getBeneficiaries() {
-        return beneficiaries;
-    }
+    public List getBeneficiaries() {
+        return beneficiaries;
+    }
 
-    public void setBeneficiaries(List beneficiaries) {
-        this.beneficiaries = beneficiaries;
-    }
+    public void setBeneficiaries(List beneficiaries) {
+        this.beneficiaries = beneficiaries;
+    }
 
-    public List getContributors() {
-        return contributors;
-    }
+    public List getContributors() {
+        return contributors;
+    }
 
-    public void setContributors(List contributors) {
-        this.contributors = contributors;
-    }
+    public void setContributors(List contributors) {
+        this.contributors = contributors;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Licence.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Licence.java
index ad73b44..6aa8016 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Licence.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Licence.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Organization.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Organization.java
index f21e36f..c1ea3d7 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Organization.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Organization.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
@@ -8,77 +9,77 @@ import java.util.List;
  * @Date 21/02/24
  */
 public class Organization implements Serializable {
-    private String local_identifier; // id
-    private List identifiers; // pid.qualifier.classid; pid.value list
-    private String name ; //legalname.value
+    private String local_identifier; // id
+    private List identifiers; // pid.qualifier.classid; pid.value list
+    private String name; // legalname.value
 
-    private String short_name; // legalshortname.value
-    private List other_names;// alternative_names.value list
-    private String website ;//websiteurl.value
-    private String country; // country.classid
-    private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"
+    private String short_name; // legalshortname.value
+    private List other_names;// alternative_names.value list
+    private String website;// websiteurl.value
+    private String country; // country.classid
+    private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"
 
-    public String getLocal_identifier() {
-        return local_identifier;
-    }
+    public String getLocal_identifier() {
+        return local_identifier;
+    }
 
-    public void setLocal_identifier(String local_identifier) {
-        this.local_identifier = local_identifier;
-    }
+    public void setLocal_identifier(String local_identifier) {
+        this.local_identifier = local_identifier;
+    }
 
-    public List getIdentifiers() {
-        return identifiers;
-    }
+    public List getIdentifiers() {
+        return identifiers;
+    }
 
-    public void setIdentifiers(List identifiers) {
-        this.identifiers = identifiers;
-    }
+    public void setIdentifiers(List identifiers) {
+        this.identifiers = identifiers;
+    }
 
-    public String getName() {
-        return name;
-    }
+    public String getName() {
+        return name;
+    }
 
-    public void setName(String name) {
-        this.name = name;
-    }
+    public void setName(String name) {
+        this.name = name;
+    }
 
-    public String getShort_name() {
-        return short_name;
-    }
+    public String getShort_name() {
+        return short_name;
+    }
 
-    public void setShort_name(String short_name) {
-        this.short_name = short_name;
-    }
+    public void setShort_name(String short_name) {
+        this.short_name = short_name;
+    }
 
-    public List getOther_names() {
-        return other_names;
-    }
+    public List getOther_names() {
+        return other_names;
+    }
 
-    public void setOther_names(List other_names) {
-        this.other_names = other_names;
-    }
+    public void setOther_names(List other_names) {
+        this.other_names = other_names;
+    }
 
-    public String getWebsite() {
-        return website;
-    }
+    public String getWebsite() {
+        return website;
+    }
 
-    public void setWebsite(String website) {
-        this.website = website;
-    }
+    public void setWebsite(String website) {
+        this.website = website;
+    }
 
-    public String getCountry() {
-        return country;
-    }
+    public String getCountry() {
+        return country;
+    }
 
-    public void setCountry(String country) {
-        this.country = country;
-    }
+    public void setCountry(String country) {
+        this.country = country;
+    }
 
-    public String getType() {
-        return type;
-    }
+    public String getType() {
+        return type;
+    }
 
-    public void setType(String type) {
-        this.type = type;
-    }
+    public void setType(String type) {
+        this.type = type;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/OrganizationTypes.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/OrganizationTypes.java
index e0d1e37..221be99 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/OrganizationTypes.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/OrganizationTypes.java
@@ -1,20 +1,17 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 public enum OrganizationTypes {
-    ARCHIVE ("archive"),
+    ARCHIVE("archive"),
 
-    COMPANY("company"),
+    COMPANY("company"),
 
-    EDUCATION("education"),
-FACILITY("facility"),
-    GOVERNMENT("government"),
-    HEALTHCARE("healthcare"),
-NONPROFIT("nonprofit"),
-FUNDER("funder"),
-OTHER("other");
-    public final String label;
+    EDUCATION("education"), FACILITY("facility"), GOVERNMENT("government"), HEALTHCARE("healthcare"), NONPROFIT(
+        "nonprofit"), FUNDER("funder"), OTHER("other");
+
+    public final String label;
 
-    private OrganizationTypes(String label) {
-        this.label = label;
-    }
+    private OrganizationTypes(String label) {
+        this.label = label;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/PersistentIdentitySystems.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/PersistentIdentitySystems.java
index db80ed6..fe9d700 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/PersistentIdentitySystems.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/PersistentIdentitySystems.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Prefixes.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Prefixes.java
index ab7f159..d5e51bc 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Prefixes.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Prefixes.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.skgif.model;
 
+package eu.dnetlib.dhp.skgif.model;
 import java.io.Serializable;
@@ -8,24 +8,22 @@ import java.io.Serializable;
  * @Date 21/02/24
  */
 public enum Prefixes implements Serializable {
-    RESEARCH_PRODUCT("product_____::"),
+    RESEARCH_PRODUCT("product_____::"),
 
-    ORGANIZATION("organization::"),
+    ORGANIZATION("organization::"),
 
-    GRANT("grant_______::"),
+    GRANT("grant_______::"),
 
-    PERSON(
+    PERSON(
         "person______::"),
 
-    TEMPORARY_PERSON("temp_person_::"),
+    TEMPORARY_PERSON("temp_person_::"),
 
-    DATASOURCE("datasource__::"),
-    TOPIC("topic_______::"),
-    VENUE("venue_______::");
+    DATASOURCE("datasource__::"), TOPIC("topic_______::"), VENUE("venue_______::");
 
-    public final String label;
+    public final String label;
 
-    private Prefixes(String label) {
-        this.label = label;
-    }
-}
\ No newline at end of file
+    private Prefixes(String label) {
+        this.label = label;
+    }
+}
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/RelationType.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/RelationType.java
index 36d79b7..477bf77 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/RelationType.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/RelationType.java
@@ -8,15 +8,10 @@ import java.io.Serializable;
  * @Date 05/09/23
  */
 public enum RelationType implements Serializable {
-    RESULT_OUTCOME_FUNDING("isProducedBy"),
-    RESULT_AFFILIATIED_TO_ORGANIZATION("hasAuthorInstitution"),
-    ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"),
-    SUPPLEMENT("IsSupplementedBy"),
-    DOCUMENTS(
-        "IsDocumentedBy"),
-    PART("IsPartOf"),
-    VERSION("IsNewVersionOf"),
-    CITATION("Cites");
+    RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
+        "hasAuthorInstitution"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"), SUPPLEMENT(
+            "IsSupplementedBy"), DOCUMENTS(
+                "IsDocumentedBy"), PART("IsPartOf"), VERSION("IsNewVersionOf"), CITATION("Cites");
 
     public final String label;
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/ResearchProduct.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/ResearchProduct.java
index 6517667..3b52fd6 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/ResearchProduct.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/ResearchProduct.java
@@ -16,7 +16,7 @@ public class ResearchProduct implements Serializable {
     private String local_identifier;
     private List identifiers;
     private Map> titles;
-    private Map> abstracts;
+    private Map> abstracts;
     @JsonProperty("product_type")
     private String product_type;
     private List topics;
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Venue.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Venue.java
index 5b6b120..aac90c6 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Venue.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/Venue.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
@@ -8,95 +9,95 @@ import java.util.List;
  * @Date 27/02/24
  */
 public class Venue implements Serializable {
-    private String local_identifier;
-    private List identifiers;
-    private String name;
-    private String acronym;
-    private String type;
-    private String publisher;
-    private String series;
-    private Boolean is_currently_full_oa;
+    private String local_identifier;
+    private List identifiers;
+    private String name;
+    private String acronym;
+    private String type;
+    private String publisher;
+    private String series;
+    private Boolean is_currently_full_oa;
 
-    private String creation_date;
-    private List contributions;
+    private String creation_date;
+    private List contributions;
 
-    public String getLocal_identifier() {
-        return local_identifier;
-    }
+    public String getLocal_identifier() {
+        return local_identifier;
+    }
 
-    public void setLocal_identifier(String local_identifier) {
-        this.local_identifier = local_identifier;
-    }
+    public void setLocal_identifier(String local_identifier) {
+        this.local_identifier = local_identifier;
+    }
 
-    public List getIdentifiers() {
-        return identifiers;
-    }
+    public List getIdentifiers() {
+        return identifiers;
+    }
 
-    public void setIdentifiers(List identifiers) {
-        this.identifiers = identifiers;
-    }
+    public void setIdentifiers(List identifiers) {
+        this.identifiers = identifiers;
+    }
 
-    public String getName() {
-        return name;
-    }
+    public String getName() {
+        return name;
+    }
 
-    public void setName(String name) {
-        this.name = name;
-    }
+    public void setName(String name) {
+        this.name = name;
+    }
 
-    public String getAcronym() {
-        return acronym;
-    }
+    public String getAcronym() {
+        return acronym;
+    }
 
-    public void setAcronym(String acronym) {
-        this.acronym = acronym;
-    }
+    public void setAcronym(String acronym) {
+        this.acronym = acronym;
+    }
 
-    public String getType() {
-        return type;
-    }
+    public String getType() {
+        return type;
+    }
 
-    public void setType(String type) {
-        this.type = type;
-    }
+    public void setType(String type) {
+        this.type = type;
+    }
 
-    public String getPublisher() {
-        return publisher;
-    }
+    public String getPublisher() {
+        return publisher;
+    }
 
-    public void setPublisher(String publisher) {
-        this.publisher = publisher;
-    }
+    public void setPublisher(String publisher) {
+        this.publisher = publisher;
+    }
 
-    public String getSeries() {
-        return series;
-    }
+    public String getSeries() {
+        return series;
+    }
 
-    public void setSeries(String series) {
-        this.series = series;
-    }
+    public void setSeries(String series) {
+        this.series = series;
+    }
 
-    public Boolean getIs_currently_full_oa() {
-        return is_currently_full_oa;
-    }
+    public Boolean getIs_currently_full_oa() {
+        return is_currently_full_oa;
+    }
 
-    public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
-        this.is_currently_full_oa = is_currently_full_oa;
-    }
+    public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
+        this.is_currently_full_oa = is_currently_full_oa;
+    }
 
-    public String getCreation_date() {
-        return creation_date;
-    }
+    public String getCreation_date() {
+        return creation_date;
+    }
 
-    public void setCreation_date(String creation_date) {
-        this.creation_date = creation_date;
-    }
+    public void setCreation_date(String creation_date) {
+        this.creation_date = creation_date;
+    }
 
-    public List getContributions() {
-        return contributions;
-    }
+    public List getContributions() {
+        return contributions;
+    }
 
-    public void setContributions(List contributions) {
-        this.contributions = contributions;
-    }
+    public void setContributions(List contributions) {
+        this.contributions = contributions;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueContribution.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueContribution.java
index ee41147..ffeb8f0 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueContribution.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueContribution.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
@@ -8,23 +9,23 @@ import java.util.List;
  * @Date 27/02/24
  */
 public class VenueContribution implements Serializable {
-    private String person;
-    private List roles;
+    private String person;
+    private List roles;
 
-    public String getPerson() {
+    public String getPerson() {
 
-        return person;
-    }
+        return person;
+    }
 
-    public void setPerson(String person) {
-        this.person = person;
-    }
+    public void setPerson(String person) {
+        this.person = person;
+    }
 
-    public List getRoles() {
-        return roles;
-    }
+    public List getRoles() {
+        return roles;
+    }
 
-    public void setRoles(List roles) {
-        this.roles = roles;
-    }
+    public void setRoles(List roles) {
+        this.roles = roles;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueIdentifierType.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueIdentifierType.java
index 6583147..41e573d 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueIdentifierType.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueIdentifierType.java
@@ -1,22 +1,16 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
 
 public enum VenueIdentifierType implements Serializable {
-    EISSN("eissn"),
-    ISSN("issn"),
-    LISSN("lissn"),
-    ISBN("isbn"),
-    OPENDOAR(
-        "opendoar"),
-    R3DATA("re3data.org"),
-    FAIRSHARING("fairsharing");
+    EISSN("eissn"), ISSN("issn"), LISSN("lissn"), ISBN("isbn"), OPENDOAR(
+        "opendoar"), R3DATA("re3data.org"), FAIRSHARING("fairsharing");
 
+    public final String label;
 
-    public final String label;
-
-    private VenueIdentifierType(String label) {
-        this.label = label;
-    }
+    private VenueIdentifierType(String label) {
+        this.label = label;
+    }
 }
diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueType.java b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueType.java
index 65f36d3..2f396a9 100644
--- a/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueType.java
+++ b/dump-schema/src/main/java/eu/dnetlib/dhp/skgif/model/VenueType.java
@@ -1,21 +1,16 @@
+
 package eu.dnetlib.dhp.skgif.model;
 
 import java.io.Serializable;
 
 public enum VenueType implements Serializable {
-    REPOSITORY("repository"),
-    JOURNAL("journal"),
-    CONFERENCE("conference"),
-    BOOK("book"),
-    OTHER(
-        "other"),
-    UNKNOWN("unknown");
+    REPOSITORY("repository"), JOURNAL("journal"), CONFERENCE("conference"), BOOK("book"), OTHER(
+        "other"), UNKNOWN("unknown");
 
+    public final String label;
 
-    public final String label;
-
-    private VenueType(String label) {
-        this.label = label;
-    }
+    private VenueType(String label) {
+        this.label = label;
+    }
 }
diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/CardinalityTooHighException.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/CardinalityTooHighException.java
index c4cf084..0a4f25b 100644
--- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/CardinalityTooHighException.java
+++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/CardinalityTooHighException.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.oa.graph.dump.exceptions;
 
 public class CardinalityTooHighException extends Exception {
diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/NoAvailableEntityTypeException.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/NoAvailableEntityTypeException.java
index 3d614eb..f3fbf74 100644
--- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/NoAvailableEntityTypeException.java
+++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/exceptions/NoAvailableEntityTypeException.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.oa.graph.dump.exceptions;
 
 public class NoAvailableEntityTypeException extends Exception {
diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasource.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasource.java
index f2e2578..6081b33 100644
--- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasource.java
+++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasource.java
@@ -1,9 +1,12 @@
+
 package eu.dnetlib.dhp.oa.graph.dump.skgif;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Datasource;
-import eu.dnetlib.dhp.skgif.model.Identifier;
-import eu.dnetlib.dhp.skgif.model.Prefixes;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.stream.Collectors;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
@@ -15,133 +18,156 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.Serializable;
-import java.util.*;
-import java.util.stream.Collectors;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.skgif.model.Identifier;
+import eu.dnetlib.dhp.skgif.model.Prefixes;
 
 /**
  * @author miriam.baglioni
  * @Date 21/02/24
 */
 public class DumpDatasource implements Serializable {
-    private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);
+    private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);
 
-    public static void main(String[] args) throws Exception {
-        String jsonConfiguration = IOUtils
-            .toString(
-                DumpDatasource.class
-                    .getResourceAsStream(
-                        "/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
+    public static void main(String[] args) throws Exception {
+        String jsonConfiguration = IOUtils
+            .toString(
+                DumpDatasource.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
 
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
-        parser.parseArgument(args);
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+        parser.parseArgument(args);
 
-        Boolean isSparkSessionManaged = Optional
-            .ofNullable(parser.get("isSparkSessionManaged"))
-            .map(Boolean::valueOf)
-            .orElse(Boolean.TRUE);
+        Boolean isSparkSessionManaged = Optional
+            .ofNullable(parser.get("isSparkSessionManaged"))
+            .map(Boolean::valueOf)
+            .orElse(Boolean.TRUE);
 
-        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 
-        final String inputPath = parser.get("sourcePath");
-        log.info("inputPath: {}", inputPath);
+        final String inputPath = parser.get("sourcePath");
+        log.info("inputPath: {}", inputPath);
 
-        final String workingDir = parser.get("workingDir");
-        log.info("workingDir: {}", workingDir);
+        final String workingDir = parser.get("workingDir");
+        log.info("workingDir: {}", workingDir);
 
-        final String outputPath = parser.get("outputPath");
-        log.info("outputPath: {}", outputPath);
+        final String outputPath = parser.get("outputPath");
+        log.info("outputPath: {}", outputPath);
 
-        SparkConf conf = new SparkConf();
+        SparkConf conf = new SparkConf();
 
-        runWithSparkSession(
-            conf,
-            isSparkSessionManaged,
-            spark -> {
-                Utils.removeOutputDir(spark, outputPath + "Datasources");
+        runWithSparkSession(
+            conf,
+            isSparkSessionManaged,
+            spark -> {
+                Utils.removeOutputDir(spark, outputPath + "Datasources");
 
-                mapDatasource(spark, inputPath, outputPath);
-            });
-    }
+                mapDatasource(spark, inputPath, outputPath);
+            });
+    }
 
-    private static void mapDatasource(SparkSession spark, String inputPath, String outputPath) {
-        Utils.readPath(spark, inputPath + "datasource", Datasource.class)
-            .filter((FilterFunction) d -> !d.getDataInfo().getInvisible() && ! d.getDataInfo().getDeletedbyinference())
-            .map((MapFunction) d -> {
-                eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
-                datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
-                datasource.setIdentifiers(d.getPid()
-                    .stream()
-                    .map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
-                    .collect(Collectors.toList()));
+    private static void mapDatasource(SparkSession spark, String inputPath, String outputPath) {
+        Utils
+            .readPath(spark, inputPath + "datasource", Datasource.class)
+            .filter(
+                (FilterFunction) d -> !d.getDataInfo().getInvisible()
+                    && !d.getDataInfo().getDeletedbyinference())
+            .map((MapFunction) d -> {
+                eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
+                datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
+                datasource
+                    .setIdentifiers(
+                        d
+                            .getPid()
+                            .stream()
+                            .map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
+                            .collect(Collectors.toList()));
 
-                datasource.setName(d.getOfficialname().getValue());
-                datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
-                datasource.setJurisdiction(Optional.ofNullable(d.getJurisdiction())
-                    .map(v -> v.getClassid()).
-                    orElse(new String()));
-                datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
-                datasource.setVersion_control(d.getVersioncontrol());
+                datasource.setName(d.getOfficialname().getValue());
+                datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
+                datasource
+                    .setJurisdiction(
+                        Optional
+                            .ofNullable(d.getJurisdiction())
+                            .map(v -> v.getClassid())
+                            .orElse(new String()));
+                datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
+                datasource.setVersion_control(d.getVersioncontrol());
 
-                datasource.setData_source_classification(Optional.ofNullable(d.getEoscdatasourcetype())
-                    .map(v -> v.getClassname()).
-                    orElse(new String()));
-                datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
-                datasource.setThematic(d.getThematic());
-                datasource.setResearch_product_access_policy(Optional.ofNullable(d.getDatabaseaccesstype())
-                    .map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
-                    .orElse(new ArrayList<>()));
-                datasource.setResearch_product_metadata_access_policy(Optional.ofNullable(d.getResearchproductmetadataaccesspolicies())
-                    .map(v->getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
-                    .orElse(new ArrayList<>()));
-                return datasource;
-            }, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class) )
-            .write()
-            .mode(SaveMode.Overwrite)
-            .option("compression","gzip")
-            .json(outputPath + "Datasource");
-    }
+                datasource
+                    .setData_source_classification(
+                        Optional
+                            .ofNullable(d.getEoscdatasourcetype())
+                            .map(v -> v.getClassname())
+                            .orElse(new String()));
+                datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
+                datasource.setThematic(d.getThematic());
+                datasource
+                    .setResearch_product_access_policy(
+                        Optional
+                            .ofNullable(d.getDatabaseaccesstype())
+                            .map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
+                            .orElse(new ArrayList<>()));
+                datasource
+                    .setResearch_product_metadata_access_policy(
+                        Optional
+                            .ofNullable(d.getResearchproductmetadataaccesspolicies())
+                            .map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
+                            .orElse(new ArrayList<>()));
+                return datasource;
+            }, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
+            .write()
+            .mode(SaveMode.Overwrite)
+            .option("compression", "gzip")
+            .json(outputPath + "Datasource");
+    }
 
-    private static List getResearchProductAccessPolicy(List value) {
+    private static List getResearchProductAccessPolicy(List value) {
 
-        return value.stream().map(v -> getResearchProductAccessPolicy(v)).filter(Objects::nonNull)
-            .map(v -> v.get(0)).distinct().collect(Collectors.toList());
-    }
-    private static List getResearchProductAccessPolicy(String value) {
-        // "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
-        //if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
-        //if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
-        switch(value){
-            case "open"://(https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
-                return Arrays.asList("open access");
-            case "restricted"://(https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
-                return Arrays.asList("restricted access");
-            case "closed"://(https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
-                return Arrays.asList("metadata only access");
-            default:
-                return null;
-        }
-    }
+        return value
+            .stream()
+            .map(v -> getResearchProductAccessPolicy(v))
+            .filter(Objects::nonNull)
+            .map(v -> v.get(0))
+            .distinct()
+            .collect(Collectors.toList());
+    }
 
-    private static List getEoscProductType(List researchentitytypes) {
+    private static List getResearchProductAccessPolicy(String value) {
+        // "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
+        // if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
+        // if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
+        switch (value) {
+            case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
+                return Arrays.asList("open access");
+            case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
+                return Arrays.asList("restricted access");
+            case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
+                return Arrays.asList("metadata only access");
+            default:
+                return null;
        }
    }

-        List eoscProductType = new ArrayList<>();
-        if(researchentitytypes != null) {
+    private static List getEoscProductType(List researchentitytypes) {
 
-            if (researchentitytypes.contains("Software"))
-                eoscProductType.add("Research Software");
-            if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
-                eoscProductType.add("Research Literature");
-            if (researchentitytypes.contains("Research Data"))
-                eoscProductType.add("Research Data");
-            if (researchentitytypes.contains("Organization") ||
-                researchentitytypes.contains("Organizations") ||
-                researchentitytypes.contains("Services") ||
-                researchentitytypes.contains("Projects"))
-                eoscProductType.add("Other research product");
-        }
-        return eoscProductType;
-    }
+        List eoscProductType = new ArrayList<>();
+        if (researchentitytypes != null) {
+
+            if (researchentitytypes.contains("Software"))
+                eoscProductType.add("Research Software");
+            if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
+                eoscProductType.add("Research Literature");
+            if (researchentitytypes.contains("Research Data"))
+                eoscProductType.add("Research Data");
+            if (researchentitytypes.contains("Organization") ||
+                researchentitytypes.contains("Organizations") ||
+                researchentitytypes.contains("Services") ||
+                researchentitytypes.contains("Projects"))
+                eoscProductType.add("Other research product");
+        }
+        return eoscProductType;
+    }
 }
diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrant.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrant.java
index b63b68f..cc1c684 100644
--- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrant.java
+++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrant.java
@@ -1,13 +1,16 @@
+
 package eu.dnetlib.dhp.oa.graph.dump.skgif;
 
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Project;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.skgif.model.Grant;
-import eu.dnetlib.dhp.skgif.model.Identifier;
-import eu.dnetlib.dhp.skgif.model.Prefixes;
-import eu.dnetlib.dhp.skgif.model.RelationType;
+import java.io.Serializable;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import org.apache.avro.generic.GenericData;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
@@ -22,141 +25,185 @@ import org.dom4j.DocumentException;
 import org.dom4j.io.SAXReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.skgif.model.Grant;
+import eu.dnetlib.dhp.skgif.model.Identifier;
+import eu.dnetlib.dhp.skgif.model.Prefixes;
+import eu.dnetlib.dhp.skgif.model.RelationType;
 import scala.Tuple2;
-
-import java.io.Serializable; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** * @author miriam.baglioni * @Date 22/02/24 */ public class DumpGrant implements Serializable { - private static final Logger log = LoggerFactory.getLogger(DumpGrant.class); + private static final Logger log = LoggerFactory.getLogger(DumpGrant.class); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - DumpGrant.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + DumpGrant.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String workingDir = parser.get("workingDir"); - log.info("workingDir: {}", workingDir); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath + "Grant"); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath + "Grant"); - mapGrants(spark, inputPath, outputPath); - }); - } + mapGrants(spark, inputPath, outputPath); + }); + } - private static void mapGrants(SparkSession spark, String inputPath, String outputPath) { - Dataset projects = Utils.readPath(spark, inputPath + "project", Project.class) - .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && - !p.getDataInfo().getInvisible()); - Dataset relations = Utils.readPath(spark, inputPath + "relation", Relation.class) - .filter((FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - !r.getDataInfo().getInvisible() && - r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label)); - projects.joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left") - .groupByKey((MapFunction, String>) t2 -> t2._1().getId(), Encoders.STRING() ) - .mapGroups((MapGroupsFunction, Grant>) (k,v) ->{ - Grant g = new Grant(); - Tuple2 first = v.next(); - g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k)); - 
g.setIdentifiers(getProjectIdentifier(first._1())); - g.setTitle(first._1().getTitle().getValue()); - g.setSummary(Optional.ofNullable(first._1().getSummary()) - .map(value->value.getValue()).orElse(new String())); - g.setAcronym(Optional.ofNullable(first._1().getAcronym()) - .map(value->value.getValue()).orElse(new String())); - g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue())); - // * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n] - g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue())); - g.setCurrency(Optional.ofNullable(first._1().getCurrency()) - .map(value -> value.getValue()).orElse(new String())); - g.setFunded_amount(Optional.ofNullable(first._1().getFundedamount()) - .orElse(null)); - g.setKeywords(first._1().getSubjects() - .stream().map(s -> s.getValue()).collect(Collectors.toList())); - g.setStart_date(Optional.ofNullable(first._1().getStartdate()) - .map(value -> value.getValue()).orElse(new String())); - g.setEnd_date(Optional.ofNullable(first._1().getEnddate()) - .map(value -> value.getValue()).orElse(new String())); - g.setWebsite(Optional.ofNullable(first._1().getWebsiteurl()) - .map(value -> value.getValue()).orElse(new String())); - if(Optional.ofNullable(first._2()).isPresent()) { - List relevantOrganizatios = new ArrayList<>(); - relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource())); - v.forEachRemaining(t2 -> relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource()))); - g.setBeneficiaries(relevantOrganizatios); - } - return g; - } , Encoders.bean(Grant.class)) - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(outputPath + "Grant"); - } + private static void mapGrants(SparkSession spark, String inputPath, String outputPath) { + Dataset projects = Utils + .readPath(spark, inputPath + "project", Project.class) + .filter( + (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible()); + Dataset relations = Utils + .readPath(spark, inputPath + "relation", Relation.class) + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + !r.getDataInfo().getInvisible() && + r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label)); + projects + .joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left") + .groupByKey((MapFunction, String>) t2 -> t2._1().getId(), Encoders.STRING()) + .mapGroups((MapGroupsFunction, Grant>) (k, v) -> { + Grant g = new Grant(); + Tuple2 first = v.next(); + g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k)); + g.setIdentifiers(getProjectIdentifier(first._1())); + g.setTitle(first._1().getTitle().getValue()); + g + .setSummary( + Optional + .ofNullable(first._1().getSummary()) + .map(value -> value.getValue()) + .orElse(new String())); + g + .setAcronym( + Optional + .ofNullable(first._1().getAcronym()) + .map(value -> value.getValue()) + .orElse(new String())); + g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue())); + // * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n] + g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue())); + g + .setCurrency( + Optional + .ofNullable(first._1().getCurrency()) + .map(value -> value.getValue()) + .orElse(new String())); + g + .setFunded_amount( + Optional + .ofNullable(first._1().getFundedamount()) + 
.orElse(null));
+				g
+					.setKeywords(
+						first
+							._1()
+							.getSubjects()
+							.stream()
+							.map(s -> s.getValue())
+							.collect(Collectors.toList()));
+				g
+					.setStart_date(
+						Optional
+							.ofNullable(first._1().getStartdate())
+							.map(value -> value.getValue())
+							.orElse(new String()));
+				g
+					.setEnd_date(
+						Optional
+							.ofNullable(first._1().getEnddate())
+							.map(value -> value.getValue())
+							.orElse(new String()));
+				g
+					.setWebsite(
+						Optional
+							.ofNullable(first._1().getWebsiteurl())
+							.map(value -> value.getValue())
+							.orElse(new String()));
+				if (Optional.ofNullable(first._2()).isPresent()) {
+					List relevantOrganizations = new ArrayList<>();
+					relevantOrganizations.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource()));
+					v
+						.forEachRemaining(
+							t2 -> relevantOrganizations
+								.add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource())));
+					g.setBeneficiaries(relevantOrganizations);
+				}
+				return g;
+			}, Encoders.bean(Grant.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(outputPath + "Grant");
+	}

-	private static String getFundingStream(String fundingtree) throws DocumentException {
-		final Document doc;
+	private static String getFundingStream(String fundingtree) throws DocumentException {
+		final Document doc;

-		doc = new SAXReader().read(new StringReader(fundingtree));
-		if(Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
-			doc.selectNodes("//funding_level_0").size() > 0)
-			return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
-		return new String();
+		doc = new SAXReader().read(new StringReader(fundingtree));
+		if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
+			doc.selectNodes("//funding_level_0").size() > 0)
+			return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
+		return new String();

-	}
+	}

-	private static String getFunderName(String fundingtree) throws DocumentException {
-		final Document doc;
+	private static String getFunderName(String fundingtree) throws DocumentException {
+		final Document doc;

-		doc = new SAXReader().read(new StringReader(fundingtree));
-		// f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
-		return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
-		//f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
+		doc = new SAXReader().read(new StringReader(fundingtree));
+		// f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
+		return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
+		// f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());

-	}
+	}

-	private static List getProjectIdentifier(Project project) {
-		if (project.getPid().size() > 0 )
-			return project.getPid().stream().map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())).collect(Collectors.toList());
-		return new ArrayList<>();
-		// private List identifiers;//.schema pid.qualifier.classid identifiers.value pid.value
-		//identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
-		//identifiers.value project.code
+	private static List getProjectIdentifier(Project project) throws DocumentException {
+		List identifiers = new ArrayList<>();
+		if (project.getPid().size() > 0)
+			project
+				.getPid()
+				.stream()
+				.forEach(p ->
identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))); + identifiers + .add( + Identifier + .newInstance( + getFunderName(project.getFundingtree().get(0).getValue()), project.getCode().getValue())); + return identifiers; - - } -} \ No newline at end of file + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganization.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganization.java index b302f91..9f77bab 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganization.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganization.java @@ -1,10 +1,12 @@ + package eu.dnetlib.dhp.oa.graph.dump.skgif; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.skgif.model.Identifier; -import eu.dnetlib.dhp.skgif.model.OrganizationTypes; -import eu.dnetlib.dhp.skgif.model.Prefixes; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.Optional; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -16,95 +18,117 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.Serializable; -import java.util.Optional; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.skgif.model.Identifier; +import eu.dnetlib.dhp.skgif.model.OrganizationTypes; +import eu.dnetlib.dhp.skgif.model.Prefixes; /** * @author miriam.baglioni * @Date 21/02/24 */ public class DumpOrganization implements Serializable { - private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class); + private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - DumpOrganization.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/dump_organization_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + DumpOrganization.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/dump_organization_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = 
parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath + "Organization"); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath + "Organization"); - mapOrganization(spark, inputPath, outputPath); - }); - } + mapOrganization(spark, inputPath, outputPath); + }); + } - private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) { - Dataset organizations = Utils.readPath(spark, inputPath + "organization", Organization.class); - organizations.filter((FilterFunction) o -> !o.getDataInfo().getDeletedbyinference() && !o.getDataInfo().getInvisible()) - .map((MapFunction) o -> { - eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization(); - organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId())); - organization.setCountry(Optional.ofNullable(o.getCountry().getClassid()) - .orElse(new String())); - organization.setName(Optional.ofNullable(o.getLegalname().getValue()) - .orElse(new String())); - organization.setShort_name(Optional.ofNullable(o.getLegalshortname()) - .map(v-> v.getValue()) - .orElse(new String())); - organization.setIdentifiers(o.getPid() - .stream() - .map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())) - .collect(Collectors.toList())); - organization.setOther_names(o.getAlternativeNames().stream() - .map(a -> a.getValue()) - .collect(Collectors.toList())); - organization.setType(getOrganizationType(o)); - return organization; - } - , Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class)) - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(outputPath + "Organization"); - } + private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) { + Dataset organizations = Utils.readPath(spark, inputPath + "organization", Organization.class); + organizations + .filter( + (FilterFunction) o -> !o.getDataInfo().getDeletedbyinference() + && !o.getDataInfo().getInvisible()) + .map((MapFunction) o -> { + eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization(); + organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId())); + organization + .setCountry( + Optional + .ofNullable(o.getCountry().getClassid()) + .orElse(new String())); + organization + .setName( + Optional + .ofNullable(o.getLegalname().getValue()) + .orElse(new String())); + organization + .setShort_name( + Optional + .ofNullable(o.getLegalshortname()) + .map(v -> v.getValue()) + .orElse(new String())); + organization + .setIdentifiers( + o + .getPid() + .stream() + .map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())) + .collect(Collectors.toList())); + organization + .setOther_names( + o + .getAlternativeNames() + .stream() + .map(a -> a.getValue()) + .collect(Collectors.toList())); + organization.setType(getOrganizationType(o)); + return organization; + }, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "Organization"); + } - private static String getOrganizationType(Organization o) { - if(Optional.ofNullable(o.getEcenterprise()).isPresent() && 
o.getEcenterprise().getValue().equalsIgnoreCase("true")) - return OrganizationTypes.COMPANY.label; - if(Optional.ofNullable(o.getEchighereducation()).isPresent() && o.getEchighereducation().getValue().equalsIgnoreCase("true")) - return OrganizationTypes.EDUCATION.label; - if(Optional.ofNullable(o.getEcresearchorganization()).isPresent() && o.getEcresearchorganization().getValue().equalsIgnoreCase("true")) - return OrganizationTypes.EDUCATION.label; - if(Optional.ofNullable(o.getEcnonprofit()).isPresent() && o.getEcnonprofit().getValue().equalsIgnoreCase("true")) - return OrganizationTypes.NONPROFIT.label; + private static String getOrganizationType(Organization o) { + if (Optional.ofNullable(o.getEcenterprise()).isPresent() + && o.getEcenterprise().getValue().equalsIgnoreCase("true")) + return OrganizationTypes.COMPANY.label; + if (Optional.ofNullable(o.getEchighereducation()).isPresent() + && o.getEchighereducation().getValue().equalsIgnoreCase("true")) + return OrganizationTypes.EDUCATION.label; + if (Optional.ofNullable(o.getEcresearchorganization()).isPresent() + && o.getEcresearchorganization().getValue().equalsIgnoreCase("true")) + return OrganizationTypes.EDUCATION.label; + if (Optional.ofNullable(o.getEcnonprofit()).isPresent() + && o.getEcnonprofit().getValue().equalsIgnoreCase("true")) + return OrganizationTypes.NONPROFIT.label; - return OrganizationTypes.OTHER.label; + return OrganizationTypes.OTHER.label; - } + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResult.java index 3f8f4c4..e02bbe2 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResult.java @@ -6,11 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.*; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct; -import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.oaf.Datasource; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -22,9 +17,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; - +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct; +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct; +import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.skgif.model.*; import eu.dnetlib.dhp.skgif.model.AccessRight; import eu.dnetlib.dhp.utils.DHPUtils; @@ -106,15 +105,20 @@ public class DumpResult implements Serializable { Dataset datasource = Utils .readPath(spark, inputPath + "/datasource", Datasource.class) .filter( - (FilterFunction) d -> Optional.ofNullable(d.getEosctype()).isPresent() && + (FilterFunction) d -> Optional.ofNullable(d.getEoscdatasourcetype()).isPresent() && d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive")); - + Dataset man = Utils .readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class); - 
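The join that follows resolves each manifestation's hosting datasource through a nested bean field: with Encoders.bean, Spark lets dotted column paths such as instance.hostedby.key address nested properties. A self-contained sketch of the pattern, with invented Hosted/Manif beans standing in for EmitPerManifestation and Datasource:

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class NestedJoinSketch {
	public static class Hosted implements Serializable {
		private String key;
		public String getKey() { return key; }
		public void setKey(String key) { this.key = key; }
	}

	public static class Manif implements Serializable {
		private Hosted hostedby;
		public Hosted getHostedby() { return hostedby; }
		public void setHostedby(Hosted hostedby) { this.hostedby = hostedby; }
	}

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();
		Manif m = new Manif();
		Hosted h = new Hosted();
		h.setKey("ds1");
		m.setHostedby(h);
		Dataset<Manif> man = spark.createDataset(Arrays.asList(m), Encoders.bean(Manif.class));
		Dataset<Row> ds = spark.createDataset(Arrays.asList("ds1"), Encoders.STRING()).toDF("id");
		// the dotted path addresses the nested bean field; a left join keeps
		// manifestations whose datasource was filtered out (t2._2() == null)
		man.joinWith(ds, man.col("hostedby.key").equalTo(ds.col("id")), "left").show(false);
		spark.stop();
	}
}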
Dataset partialResearchProduct = man.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
-			.groupByKey((MapFunction, String>) t2 -> t2._1().getResultId(), Encoders.STRING())
-			.mapGroups((MapGroupsFunction, PartialResearchProduct>) (k, v) -> {
+			Dataset partialResearchProduct = man
+				.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
+				.groupByKey(
+					(MapFunction, String>) t2 -> t2._1().getResultId(),
+					Encoders.STRING())
+				.mapGroups(
+					(MapGroupsFunction, PartialResearchProduct>) (
+						k, v) -> {
 						PartialResearchProduct prp = new PartialResearchProduct();
 						prp.setResultId(k);
 						List manifestationList = new ArrayList<>();
@@ -124,10 +128,13 @@ public class DumpResult implements Serializable {
 						return prp;
 					}, Encoders.bean(PartialResearchProduct.class));
 			partialResearchProduct
-				.joinWith(aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")), "left")
-				.map((MapFunction, PartialResearchProduct>) t2 -> {
+				.joinWith(
+					aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")),
+					"left")
+				.map(
+					(MapFunction, PartialResearchProduct>) t2 -> {
 						PartialResearchProduct prp = t2._1();
-						if(Optional.ofNullable(t2._2()).isPresent()){
+						if (Optional.ofNullable(t2._2()).isPresent()) {
 							prp.setRelated_products(t2._2().getRelatedProduct());
 							prp.setRelevant_organizations(t2._2().getOrganizations());
 							prp.setFunding(t2._2().getFunding());
@@ -144,148 +151,83 @@ public class DumpResult implements Serializable {
 	private static Manifestation getManifestation(Tuple2 t2) {
-		// if the left side is present, we have the biblio and the venue
-		// if it is not, we only have the other values
-		EmitPerManifestation epm = t2._1();
-		Manifestation manifestation = new Manifestation();
-		manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
-		manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
-		if(Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
-			manifestation
-				.setDates(
-					Arrays
-						.asList(
-							Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
-		if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
-			switch (epm.getInstance().getRefereed().getClassid()) {
-				case "0000":
-					manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
-					break;
-				case "0001":
-					manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
-					break;
-				case "0002":
-					manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
-					break;
-			}
+		// if the left side is present, we have the biblio and the venue
+		// if it is not, we only have the other values
+		EmitPerManifestation epm = t2._1();
+		Manifestation manifestation = new Manifestation();
+		manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
+		manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
+		if (Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
+			manifestation
+				.setDates(
+					Arrays
+						.asList(
+							Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
+		if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
+			switch (epm.getInstance().getRefereed().getClassid()) {
+				case "0000":
+					manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
+					break;
+				case "0001":
+					manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
+					break;
+				case
"0002": + manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label); + break; + } - manifestation.setMetadata_curation("unavailable"); - if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent()) - switch (epm.getInstance().getAccessright().getClassid()) { - case "OPEN": - case "OPEN DATA": - case "OPEN SOURCE": - manifestation.setAccess_right(AccessRight.OPEN.label); - break; - case "CLOSED": - manifestation.setAccess_right(AccessRight.CLOSED.label); - break; - case "RESTRICTED": - manifestation.setAccess_right(AccessRight.RESTRICTED.label); - break; - case "EMBARGO": - case "12MONTHS": - case "6MONTHS": - manifestation.setAccess_right(AccessRight.EMBARGO.label); - break; - default: - manifestation.setAccess_right(AccessRight.UNAVAILABLE.label); + manifestation.setMetadata_curation("unavailable"); + if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent()) + switch (epm.getInstance().getAccessright().getClassid()) { + case "OPEN": + case "OPEN DATA": + case "OPEN SOURCE": + manifestation.setAccess_right(AccessRight.OPEN.label); + break; + case "CLOSED": + manifestation.setAccess_right(AccessRight.CLOSED.label); + break; + case "RESTRICTED": + manifestation.setAccess_right(AccessRight.RESTRICTED.label); + break; + case "EMBARGO": + case "12MONTHS": + case "6MONTHS": + manifestation.setAccess_right(AccessRight.EMBARGO.label); + break; + default: + manifestation.setAccess_right(AccessRight.UNAVAILABLE.label); - } - manifestation.setLicence(Optional.ofNullable(epm.getInstance().getLicense()) - .map(value -> value.getValue()) - .orElse(null)); - manifestation.setUrl(Optional.ofNullable(epm.getInstance().getUrl()) - .map(value -> value.get(0)) - .orElse(null)); + } + manifestation + .setLicence( + Optional + .ofNullable(epm.getInstance().getLicense()) + .map(value -> value.getValue()) + .orElse(null)); + manifestation + .setUrl( + Optional + .ofNullable(epm.getInstance().getUrl()) + .map(value -> value.get(0)) + .orElse(null)); - if (Optional.ofNullable(epm.getInstance().getPid()).isPresent()) { - manifestation.setPid(epm.getInstance().getPid().get(0).getValue()); - } - if (Optional.ofNullable(t2._2()).isPresent()) { - manifestation.setBiblio(getBiblio(epm)); - if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent()) - manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnPrinted())); - else if(Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent()) - manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline())); - } - manifestation - .setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE,epm.getInstance().getHostedby().getKey())); + if (Optional.ofNullable(epm.getInstance().getPid()).isPresent() && epm.getInstance().getPid().size() > 0) { + manifestation.setPid(epm.getInstance().getPid().get(0).getValue()); + } + if (Optional.ofNullable(t2._2()).isPresent()) { + manifestation.setBiblio(getBiblio(epm)); + if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent()) + manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnPrinted())); + else if (Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent()) + manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline())); + } + manifestation + .setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey())); - return manifestation; + return manifestation; } -// private 
static List getManifestationList(Dataset emitformanifestation,
-//		Dataset datasource) {
-//		return emitformanifestation
-//			.joinWith(
-//				datasource, emitformanifestation
-//					.col("hostedBy")
-//					.equalTo(datasource.col("id")),
-//				"left")
-//			.map((MapFunction, Manifestation>) t2 -> {
-//				// if the left side is present, we have the biblio and the venue
-//				// if it is not, we only have the other values
-//				EmitPerManifestation epm = t2._1();
-//				Manifestation manifestation = new Manifestation();
-//				manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getClassname());
-//				manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
-//				manifestation
-//					.setDates(
-//						Arrays
-//							.asList(
-//								Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
-//				if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
-//					switch (epm.getInstance().getRefereed().getClassid()) {
-//						case "0000":
-//							manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
-//							break;
-//						case "0001":
-//							manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
-//							break;
-//						case "0002":
-//							manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
-//							break;
-//					}
-//
-//				manifestation.setMetadata_curation("unavailable");
-//				if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
-//					switch (epm.getInstance().getAccessright().getClassid()) {
-//						case "OPEN":
-//						case "OPEN DATA":
-//						case "OPEN SOURCE":
-//							manifestation.setAccess_right(AccessRight.OPEN.label);
-//							break;
-//						case "CLOSED":
-//							manifestation.setAccess_right(AccessRight.CLOSED.label);
-//							break;
-//						case "RESTRICTED":
-//							manifestation.setAccess_right(AccessRight.RESTRICTED.label);
-//							break;
-//						case "EMBARGO":
-//						case "12MONTHS":
-//						case "6MONTHS":
-//							manifestation.setAccess_right(AccessRight.EMBARGO.label);
-//							break;
-//						default:
-//							manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
-//
-//					}
-//				manifestation.setLicence(epm.getInstance().getLicense().getValue());
-//				manifestation.setUrl(epm.getInstance().getUrl().get(0));
-//				if (Optional.ofNullable(epm.getInstance().getPid()).isPresent()) {
-//					manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
-//				}
-//				if (Optional.ofNullable(t2._2()).isPresent())
-//					manifestation.setBiblio(getBiblio(epm));
-//				manifestation.setVenue("venue_______::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
-//				manifestation
-//					.setHosting_datasource("datasource__::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
-//				return manifestation;
-//			}, Encoders.bean(Manifestation.class))
-//			.collectAsList();
-//	}
-
 	private static Biblio getBiblio(EmitPerManifestation epm) {
 		Biblio biblio = new Biblio();
 		biblio.setEdition(epm.getJournal().getEdition());
@@ -298,7 +240,7 @@ public class DumpResult implements Serializable {
 	}

 	private static void dumpResult(SparkSession spark, String inputPath, String workingDir,
-		String outputPath) {
+		String outputPath) {
 		ModelSupport.entityTypes
 			.keySet()
 			.parallelStream()
@@ -314,14 +256,14 @@ public class DumpResult implements Serializable {
 			.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
 			.map((MapFunction, ResearchProduct>) t2 -> {
 				ResearchProduct rp = ResultMapper.map(t2._1());
-				if(Optional.ofNullable(t2._2()).isPresent()) {
-					if(Optional.ofNullable(t2._2().getRelated_products()).isPresent())
+				if (Optional.ofNullable(t2._2()).isPresent()) {
+					if
(Optional.ofNullable(t2._2().getRelated_products()).isPresent()) rp.setRelated_products(t2._2().getRelated_products()); - if(Optional.ofNullable(t2._2().getFunding()).isPresent()) + if (Optional.ofNullable(t2._2().getFunding()).isPresent()) rp.setFunding(t2._2().getFunding()); - if(Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent()) + if (Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent()) rp.setRelevant_organizations(t2._2().getRelevant_organizations()); - if(Optional.ofNullable(t2._2().getManifestations()).isPresent()) + if (Optional.ofNullable(t2._2().getManifestations()).isPresent()) rp.setManifestations(t2._2().getManifestations()); } return rp; @@ -333,30 +275,37 @@ public class DumpResult implements Serializable { }); Dataset researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class)); - for(EntityType e : ModelSupport.entityTypes.keySet()) { - if(ModelSupport.isResult(e)) - researchProducts = researchProducts.union(Utils.readPath(spark,workingDir + e.name() + "/researchproduct", ResearchProduct.class)); - } + for (EntityType e : ModelSupport.entityTypes.keySet()) { + if (ModelSupport.isResult(e)) + researchProducts = researchProducts + .union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class)); + } researchProducts - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(outputPath + "ResearchProduct"); + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "ResearchProduct"); } private static void selectRelations(SparkSession spark, String inputPath, String workingDir) { - Dataset relation = Utils.readPath(spark, - inputPath + "relation", Relation.class) - .filter((FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - !r.getDataInfo().getInvisible()) - .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) || - r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) || - r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) || - r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label)|| - r.getRelClass().equalsIgnoreCase(RelationType.PART.label) || - r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) || - r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label)); + Dataset relation = Utils + .readPath( + spark, + inputPath + "relation", Relation.class) + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + !r.getDataInfo().getInvisible()) + .filter( + (FilterFunction) r -> r + .getRelClass() + .equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) || + r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) || + r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) || + r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label) || + r.getRelClass().equalsIgnoreCase(RelationType.PART.label) || + r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) || + r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label)); relation .groupByKey((MapFunction) r -> r.getSource(), Encoders.STRING()) @@ -373,12 +322,14 @@ public class DumpResult implements Serializable { rpp.getOrganizations().add(Utils.getIdentifier(Prefixes.ORGANIZATION, target)); break; case "isproducedby": - rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT ,target)); + rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT, target)); break; default: if 
(!remainignRelations.keySet().contains(relClass)) remainignRelations.put(relClass, new ArrayList<>()); - remainignRelations.get(relClass).add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target)); + remainignRelations + .get(relClass) + .add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target)); } } for (String key : remainignRelations.keySet()) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpVenue.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpVenue.java index 648b0a1..f06a840 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpVenue.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpVenue.java @@ -1,156 +1,179 @@ + package eu.dnetlib.dhp.oa.graph.dump.skgif; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Journal; -import eu.dnetlib.dhp.skgif.model.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.skgif.model.*; +import scala.Tuple2; /** * @author miriam.baglioni * @Date 21/02/24 */ public class DumpVenue implements Serializable { - private static final Logger log = LoggerFactory.getLogger(DumpVenue.class); + private static final Logger log = LoggerFactory.getLogger(DumpVenue.class); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - DumpVenue.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + DumpVenue.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", 
inputPath); - final String workingDir = parser.get("workingDir"); - log.info("workingDir: {}", workingDir); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath + "Venue"); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath + "Venue"); - mapDatasource(spark, inputPath, outputPath, workingDir); - }); - } + mapVenue(spark, inputPath, outputPath, workingDir); + }); + } - private static void mapDatasource(SparkSession spark, String inputPath, String outputPath, String workingDir) { - Utils.readPath(spark, inputPath + "datasource", Datasource.class) - .filter((FilterFunction) d -> !d.getDataInfo().getInvisible() && ! d.getDataInfo().getDeletedbyinference() - && d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive")) - .map((MapFunction) d -> { - Venue venue = new Venue(); - if(Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent()) - venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted())); - else if(Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent()) - venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline())); - venue.setIdentifiers(getVenueIdentifier(d.getJournal())); - venue.setName(d.getOfficialname().getValue()); - venue.setType(VenueType.JOURNAL.label); - //todo add map for publisher. Get from results? 
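In the refactored mapVenue below, the hard-coded publisher placeholder is dropped: venues are left-joined with the datasourcePublisher records that EmitFromResults now writes, and duplicates are collapsed by grouping on the venue identifier and keeping one representative per group. A minimal sketch of that groupByKey/mapGroups dedup idiom on plain strings (dataset contents are illustrative):

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class DedupByKeySketch {
	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("dedup").getOrCreate();
		Dataset<String> venues = spark
			.createDataset(Arrays.asList("0001-1111", "0001-1111", "0002-2222"), Encoders.STRING());
		// group on the identifier and keep the first element of each group, mirroring
		// the Venues and datasourcePublisher de-duplication elsewhere in this patch
		Dataset<String> deduped = venues
			.groupByKey((MapFunction<String, String>) v -> v, Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, String, String>) (k, it) -> it.next(), Encoders.STRING());
		deduped.show(); // two rows remain; which duplicate survives is not deterministic
		spark.stop();
	}
}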
- venue.setPublisher("find it from result"); - venue.setAcronym(null); - venue.setSeries(null); - venue.setIs_currently_full_oa(null); - venue.setCreation_date(null); - venue.setContributions(null); - return venue; - }, Encoders.bean(Venue.class) ) - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(workingDir + "Venues"); + private static void mapVenue(SparkSession spark, String inputPath, String outputPath, String workingDir) { + Dataset manifestationDataset = Utils + .readPath(spark, workingDir + "datasourcePublisher", EmitPerManifestation.class); + Dataset datasourceDataset = Utils + .readPath(spark, inputPath + "datasource", Datasource.class) + .filter( + (FilterFunction) d -> !d.getDataInfo().getInvisible() + && !d.getDataInfo().getDeletedbyinference() + && d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive")); + datasourceDataset + .joinWith( + manifestationDataset, datasourceDataset.col("id").equalTo(manifestationDataset.col("hostedby.key")), + "left") + .map((MapFunction, Venue>) t2 -> { + Venue venue = new Venue(); + Datasource d = t2._1(); + if (Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent()) + venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted())); + else if (Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent()) + venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline())); + venue.setIdentifiers(getVenueIdentifier(d.getJournal())); + venue.setName(d.getOfficialname().getValue()); + venue.setType(VenueType.JOURNAL.label); + if (Optional.ofNullable(t2._2()).isPresent()) + venue.setPublisher(t2._2().getPublisher()); + venue.setAcronym(null); + venue.setSeries(null); + venue.setIs_currently_full_oa(null); + venue.setCreation_date(null); + venue.setContributions(null); + return venue; + }, Encoders.bean(Venue.class)) - Utils.readPath(spark, workingDir + "Venues", Venue.class) - .groupByKey((MapFunction)v -> v.getLocal_identifier() , Encoders.STRING()) - .mapGroups((MapGroupsFunction) (k,v) -> v.next(), Encoders.bean(Venue.class) ) - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(outputPath + "Venues"); - } + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingDir + "Venues"); - private static List getVenueIdentifier(Journal journal) { - List identifiers = new ArrayList<>(); - if (Optional.ofNullable((journal.getIssnOnline())).isPresent()) - identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline())); - if(Optional.ofNullable(journal.getIssnPrinted()).isPresent()) - identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted())); - if (Optional.ofNullable(journal.getIssnLinking()).isPresent()) - identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking())); - return identifiers; - } + Utils + .readPath(spark, workingDir + "Venues", Venue.class) + .groupByKey((MapFunction) v -> v.getLocal_identifier(), Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, v) -> v.next(), Encoders.bean(Venue.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "Venues"); + } - private static List getResearchProductAccessPolicy(List value) { + private static List getVenueIdentifier(Journal journal) { + List identifiers = new ArrayList<>(); + if (Optional.ofNullable((journal.getIssnOnline())).isPresent()) + 
identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline())); + if (Optional.ofNullable(journal.getIssnPrinted()).isPresent()) + identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted())); + if (Optional.ofNullable(journal.getIssnLinking()).isPresent()) + identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking())); + return identifiers; + } - return value.stream().map(v -> getResearchProductAccessPolicy(v)).filter(Objects::nonNull) - .map(v -> v.get(0)).distinct().collect(Collectors.toList()); - } - private static List getResearchProductAccessPolicy(String value) { - // "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/) - //if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/) - //if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " - switch(value){ - case "open"://(https://vocabularies.coar-repositories.org/access_rights/c_abf2/) - return Arrays.asList("open access"); - case "restricted"://(https://vocabularies.coar-repositories.org/access_rights/c_16ec/) - return Arrays.asList("restricted access"); - case "closed"://(https://vocabularies.coar-repositories.org/access_rights/c_14cb/) - return Arrays.asList("metadata only access"); - default: - return null; - } - } + private static List getResearchProductAccessPolicy(List value) { - private static List getEoscProductType(List researchentitytypes) { + return value + .stream() + .map(v -> getResearchProductAccessPolicy(v)) + .filter(Objects::nonNull) + .map(v -> v.get(0)) + .distinct() + .collect(Collectors.toList()); + } - List eoscProductType = new ArrayList<>(); - if(researchentitytypes != null) { + private static List getResearchProductAccessPolicy(String value) { + // "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/) + // if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/) + // if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " + switch (value) { + case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/) + return Arrays.asList("open access"); + case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/) + return Arrays.asList("restricted access"); + case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) + return Arrays.asList("metadata only access"); + default: + return null; + } + } - if (researchentitytypes.contains("Software")) - eoscProductType.add("Research Software"); - if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature")) - eoscProductType.add("Research Literature"); - if (researchentitytypes.contains("Research Data")) - eoscProductType.add("Research Data"); - if (researchentitytypes.contains("Organization") || - researchentitytypes.contains("Organizations") || - researchentitytypes.contains("Services") || - researchentitytypes.contains("Projects")) - eoscProductType.add("Other research product"); - } - return eoscProductType; - } + private static List getEoscProductType(List researchentitytypes) { + + List eoscProductType = new ArrayList<>(); + if (researchentitytypes != null) { + + if (researchentitytypes.contains("Software")) + eoscProductType.add("Research Software"); + 
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature")) + eoscProductType.add("Research Literature"); + if (researchentitytypes.contains("Research Data")) + eoscProductType.add("Research Data"); + if (researchentitytypes.contains("Organization") || + researchentitytypes.contains("Organizations") || + researchentitytypes.contains("Services") || + researchentitytypes.contains("Projects")) + eoscProductType.add("Other research product"); + } + return eoscProductType; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResults.java index 1cf1c1a..3bd4624 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResults.java @@ -7,8 +7,6 @@ import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; -import eu.dnetlib.dhp.schema.oaf.Datasource; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -23,10 +21,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; - +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.skgif.model.*; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; @@ -42,7 +41,7 @@ public class EmitFromResults implements Serializable { public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( - EmitFromResults.class + EmitFromResults.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/dump/emit_biblio_parameters.json")); @@ -91,17 +90,21 @@ public class EmitFromResults implements Serializable { Class resultClazz = ModelSupport.entityTypes.get(e); Utils .readPath(spark, inputPath + e.name(), resultClazz) - .filter((FilterFunction) r -> Optional.of(r.getSubject()).isPresent()) + .filter((FilterFunction) r -> Optional.ofNullable(r.getSubject()).isPresent()) .flatMap( (FlatMapFunction) r -> r .getSubject() .stream() - .filter(s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") || s.getQualifier().getClassid().equalsIgnoreCase("sdg")) + .filter( + s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") + || s.getQualifier().getClassid().equalsIgnoreCase("sdg")) .map(s -> { Topic t = new Topic(); t .setLocal_identifier( - Utils.getIdentifier(Prefixes.TOPIC ,s.getQualifier().getClassid() + s.getValue())); + Utils + .getIdentifier( + Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue())); t .setIdentifiers( Arrays @@ -154,7 +157,8 @@ public class EmitFromResults implements Serializable { p.setGiven_name(a.getName()); String identifier = new String(); if (Optional.ofNullable(a.getPid()).isPresent()) { - Tuple2 orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils.getOrcid(a.getPid()); + Tuple2 orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils + .getOrcid(a.getPid()); if (orcid != null) { identifier = Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2()); if (orcid._2()) @@ -164,12 +168,15 @@ public class EmitFromResults implements Serializable { else p .setIdentifiers( - 
Arrays.asList(Identifier.newInstance("inferred_orcid", orcid._1()))); + Arrays + .asList(Identifier.newInstance("inferred_orcid", orcid._1()))); } else { if (Optional.ofNullable(a.getRank()).isPresent()) { - identifier = Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,r.getId() + a.getRank()); + identifier = Utils + .getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + a.getRank()); } else { - identifier = Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,r.getId() + count); + identifier = Utils + .getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + count); } } @@ -243,6 +250,32 @@ public class EmitFromResults implements Serializable { } }); + + Dataset emitPerManifestationDataset = Utils + .readPath( + spark, workingDir + "software/manifestation", EmitPerManifestation.class) + .union( + Utils + .readPath( + spark, workingDir + "dataset/manifestation", EmitPerManifestation.class)) + .union( + Utils + .readPath( + spark, workingDir + "publication/manifestation", EmitPerManifestation.class)) + .union( + Utils + .readPath( + spark, workingDir + "otherresearchproduct/manifestation", EmitPerManifestation.class)); + + emitPerManifestationDataset + .groupByKey((MapFunction) p -> p.getHostedBy(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) (k, v) -> v.next(), + Encoders.bean(EmitPerManifestation.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingDir + "/datasourcePublisher"); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/ResultMapper.java index 3d358f2..3062779 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/ResultMapper.java @@ -5,10 +5,9 @@ import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; - -import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoAllowedTypeException; import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoTitleFoundException; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.skgif.model.*; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; @@ -54,20 +53,21 @@ public class ResultMapper implements Serializable { for (Author a : input.getAuthor()) { count += 1; Contribution contribution = new Contribution(); - Tuple2 orcid = Utils.getOrcid(a.getPid()); - if (orcid != null) { - contribution.setPerson(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2())); + Tuple2 orcid = Utils.getOrcid(a.getPid()); + if (orcid != null) { + contribution.setPerson(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2())); + } else { + if (Optional.ofNullable(a.getRank()).isPresent()) { + contribution + .setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + a.getRank())); } else { - if (Optional.ofNullable(a.getRank()).isPresent()) { - contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,input.getId() + a.getRank())); - } else { - contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,input.getId() + count)); - } + contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + count)); + } - } - if(Optional.ofNullable(a.getRank()).isPresent()){ - contribution.setRank(a.getRank()); - } + } + if (Optional.ofNullable(a.getRank()).isPresent()) { + contribution.setRank(a.getRank()); + } contributionList.add(contribution); } @@ -83,12 +83,15 @@ public class ResultMapper implements Serializable { 
input .getSubject() .stream() - .filter(s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") || + .filter( + s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") || s.getQualifier().getClassid().equalsIgnoreCase("sdg")) .map(s -> { ResultTopic topic = new ResultTopic(); - topic.setTopic(Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue())); - if(Optional.ofNullable(s.getDataInfo()).isPresent()){ + topic + .setTopic( + Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue())); + if (Optional.ofNullable(s.getDataInfo()).isPresent()) { Provenance provenance = new Provenance(); provenance.setTrust(Double.valueOf(s.getDataInfo().getTrust())); provenance.setType(s.getDataInfo().getInferenceprovenance()); @@ -101,7 +104,6 @@ public class ResultMapper implements Serializable { } } - private static void mapType(ResearchProduct out, E input) throws NoAllowedTypeException { switch (input.getResulttype().getClassid()) { case "publication": @@ -148,7 +150,7 @@ public class ResultMapper implements Serializable { .filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title")) .collect(Collectors.toList()); if (!iTitle.isEmpty()) { - out.setTitles(Collections.singletonMap("none",Arrays.asList(iTitle.get(0).getValue()))); + out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue()))); return; } @@ -158,7 +160,7 @@ public class ResultMapper implements Serializable { .filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle")) .collect(Collectors.toList()); if (!iTitle.isEmpty()) { - out.setTitles(Collections.singletonMap("none",Arrays.asList(iTitle.get(0).getValue()))); + out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue()))); } } @@ -169,6 +171,6 @@ public class ResultMapper implements Serializable { Optional .ofNullable(input.getDescription()) .ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue()))); - out.setAbstracts(Collections.singletonMap("none",descriptionList)); + out.setAbstracts(Collections.singletonMap("none", descriptionList)); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/Utils.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/Utils.java index 8990760..d578a2c 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/Utils.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/Utils.java @@ -5,16 +5,18 @@ import java.io.Serializable; import java.util.List; import java.util.Optional; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; + import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.skgif.model.Prefixes; import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; import scala.Tuple2; /** @@ -32,11 +34,11 @@ public class Utils implements Serializable { } public static Dataset readPath( - SparkSession spark, String inputPath, Class clazz) { + SparkSession spark, String inputPath, Class clazz) { return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + 
.read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } public static Tuple2 getOrcid(List pid) { @@ -57,7 +59,7 @@ public class Utils implements Serializable { return null; } - public static String getIdentifier(Prefixes entity, String id){ + public static String getIdentifier(Prefixes entity, String id) { return entity.label + DHPUtils.md5(id); } diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/config-default.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/config-default.xml new file mode 100644 index 0000000..d262cb6 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/workflow.xml new file mode 100644 index 0000000..de19f37 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/skgif/oozie_app/workflow.xml @@ -0,0 +1,216 @@ + + + + sourcePath + the source path + + + + outputPath + the output path + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + yarn + cluster + Extraction + eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResults + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --workingDir${workingDir}/ + --outputPath${outputPath} + + + + + + + yarn + cluster + Dump table project + eu.dnetlib.dhp.oa.graph.dump.skgif.DumpResult + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${outputPath} + --workingDir${workingDir}/ + + + + + + + yarn + cluster + Dump table project + eu.dnetlib.dhp.oa.graph.dump.skgif.DumpDatasource + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${outputPath} + --workingDir${workingDir}/ + + + + + + + yarn + cluster + Dump table project + eu.dnetlib.dhp.oa.graph.dump.skgif.DumpVenue + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${outputPath} + --workingDir${workingDir}/ + + + + + + + + yarn + cluster + Dump table project + eu.dnetlib.dhp.oa.graph.dump.skgif.DumpOrganization + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${outputPath} + --workingDir${workingDir}/ + + + + + + + yarn + cluster + Dump table project + eu.dnetlib.dhp.oa.graph.dump.skgif.DumpGrant + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${outputPath} + --workingDir${workingDir}/ + + + + + + + \ No newline at end of file diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java index d18f36a..6d276ec 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; -import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest; import org.apache.hadoop.conf.Configuration; import 
diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java
index d18f36a..6d276ec 100644
--- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java
+++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/ZenodoUploadTest.java
@@ -7,7 +7,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;

-import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
 import org.junit.jupiter.api.Assertions;
@@ -17,10 +16,10 @@ import org.junit.jupiter.api.Test;

 import com.google.gson.Gson;

+import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest;
 import eu.dnetlib.dhp.oa.zenodoapi.MissingConceptDoiException;
 import eu.dnetlib.dhp.oa.zenodoapi.ZenodoAPIClient;
-
 @Disabled
 public class ZenodoUploadTest {
@@ -162,8 +161,6 @@ public class ZenodoUploadTest {

 	}

-
-
 	@Test
 	void depositBigFile() throws MissingConceptDoiException, IOException {
 		ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasourceTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasourceTest.java
index 79bebdd..c9c8f22 100644
--- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasourceTest.java
+++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpDatasourceTest.java
@@ -1,8 +1,11 @@
+
 package eu.dnetlib.dhp.oa.graph.dump.skgif;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.skgif.model.Datasource;
-import eu.dnetlib.dhp.skgif.model.Organization;
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
@@ -18,76 +21,76 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.file.Files;
-import java.nio.file.Path;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.skgif.model.Datasource;
+import eu.dnetlib.dhp.skgif.model.Organization;

 /**
  * @author miriam.baglioni
  * @Date 22/02/24
  */
 public class DumpDatasourceTest implements Serializable {
-	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

-	private static SparkSession spark;
+	private static SparkSession spark;

-	private static Path workingDir;
+	private static Path workingDir;

-	private static final Logger log = LoggerFactory.getLogger(DumpDatasourceTest.class);
+	private static final Logger log = LoggerFactory.getLogger(DumpDatasourceTest.class);

-	@BeforeAll
-	public static void beforeAll() throws IOException {
-		workingDir = Files.createTempDirectory(DumpDatasourceTest.class.getSimpleName());
-		log.info("using work dir {}", workingDir);
+	@BeforeAll
+	public static void beforeAll() throws IOException {
+		workingDir = Files.createTempDirectory(DumpDatasourceTest.class.getSimpleName());
+		log.info("using work dir {}", workingDir);

-		SparkConf conf = new SparkConf();
-		conf.setAppName(DumpDatasourceTest.class.getSimpleName());
+		SparkConf conf = new SparkConf();
+		conf.setAppName(DumpDatasourceTest.class.getSimpleName());

-		conf.setMaster("local[*]");
-		conf.set("spark.driver.host", "localhost");
-		conf.set("hive.metastore.local", "true");
-		conf.set("spark.ui.enabled", "false");
-		conf.set("spark.sql.warehouse.dir", workingDir.toString());
-		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+		conf.setMaster("local[*]");
+		conf.set("spark.driver.host", "localhost");
+		conf.set("hive.metastore.local", "true");
+		conf.set("spark.ui.enabled", "false");
+		conf.set("spark.sql.warehouse.dir", workingDir.toString());
+		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

-		spark = 
SparkSession - .builder() - .appName(DumpDatasourceTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(DumpDatasourceTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testDumpDatasource() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") - .getPath(); + @Test + public void testDumpDatasource() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") + .getPath(); + DumpDatasource + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-outputPath", workingDir.toString() + "/" - DumpDatasource.main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-outputPath", workingDir.toString() + "/" + }); - }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD datasource = sc + .textFile(workingDir.toString() + "/Datasource") + .map(item -> OBJECT_MAPPER.readValue(item, Datasource.class)); - JavaRDD datasource = sc - .textFile(workingDir.toString() + "/Datasource") - .map(item -> OBJECT_MAPPER.readValue(item, Datasource.class)); - - Dataset datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class)); -Assertions.assertEquals(5,datasourceDataset.count()); - datasourceDataset.show(false); + Dataset datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class)); + Assertions.assertEquals(5, datasourceDataset.count()); + datasourceDataset.show(false); // Assertions.assertEquals(7, relationDataset.count()); // RelationPerProduct temp = relationDataset.filter((FilterFunction) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first(); // Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size()); @@ -115,32 +118,33 @@ Assertions.assertEquals(5,datasourceDataset.count()); // // - } + } - @Test - public void testDumpDatasourceComplete() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/") - .getPath(); + @Test + public void testDumpDatasourceComplete() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/") + .getPath(); + DumpDatasource + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-outputPath", workingDir.toString() + "/" - DumpDatasource.main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-outputPath", workingDir.toString() + "/" + }); - }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD datasource = sc + .textFile(workingDir.toString() + "/Datasource") + .map(item -> OBJECT_MAPPER.readValue(item, 
Datasource.class)); - JavaRDD datasource = sc - .textFile(workingDir.toString() + "/Datasource") - .map(item -> OBJECT_MAPPER.readValue(item, Datasource.class)); + Dataset datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class)); - Dataset datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class)); - - datasourceDataset.foreach((ForeachFunction) d -> System.out.println(OBJECT_MAPPER.writeValueAsString(d))); + datasourceDataset + .foreach((ForeachFunction) d -> System.out.println(OBJECT_MAPPER.writeValueAsString(d))); // Assertions.assertEquals(7, relationDataset.count()); // RelationPerProduct temp = relationDataset.filter((FilterFunction) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first(); // Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size()); @@ -168,5 +172,5 @@ Assertions.assertEquals(5,datasourceDataset.count()); // // - } + } } diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrantTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrantTest.java index 7bee51b..ff70a51 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrantTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrantTest.java @@ -1,8 +1,11 @@ + package eu.dnetlib.dhp.oa.graph.dump.skgif; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.skgif.model.Datasource; -import eu.dnetlib.dhp.skgif.model.Grant; +import java.io.IOException; +import java.io.Serializable; +import java.nio.file.Files; +import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -17,76 +20,76 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.io.Serializable; -import java.nio.file.Files; -import java.nio.file.Path; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.skgif.model.Datasource; +import eu.dnetlib.dhp.skgif.model.Grant; /** * @author miriam.baglioni * @Date 22/02/24 */ public class DumpGrantTest implements Serializable { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(DumpGrantTest.class); + private static final Logger log = LoggerFactory.getLogger(DumpGrantTest.class); - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(DumpGrantTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(DumpGrantTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(DumpGrantTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(DumpGrantTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - 
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(DumpGrantTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(DumpGrantTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testDumpGrant() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") - .getPath(); + @Test + public void testDumpGrant() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") + .getPath(); + DumpGrant + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-outputPath", workingDir.toString() + "/" - DumpGrant.main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-outputPath", workingDir.toString() + "/" + }); - }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD grant = sc + .textFile(workingDir.toString() + "/Grant") + .map(item -> OBJECT_MAPPER.readValue(item, Grant.class)); - JavaRDD grant = sc - .textFile(workingDir.toString() + "/Grant") - .map(item -> OBJECT_MAPPER.readValue(item, Grant.class)); - - Dataset grantDataset = spark.createDataset(grant.rdd(), Encoders.bean(Grant.class)); -Assertions.assertEquals(12,grantDataset.count()); - grantDataset.show(false); + Dataset grantDataset = spark.createDataset(grant.rdd(), Encoders.bean(Grant.class)); + Assertions.assertEquals(12, grantDataset.count()); + grantDataset.show(false); // Assertions.assertEquals(7, relationDataset.count()); // RelationPerProduct temp = relationDataset.filter((FilterFunction) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first(); // Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size()); @@ -114,5 +117,5 @@ Assertions.assertEquals(12,grantDataset.count()); // // - } + } } diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganizationTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganizationTest.java index 4abcb66..3e835f8 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganizationTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpOrganizationTest.java @@ -1,10 +1,11 @@ + package eu.dnetlib.dhp.oa.graph.dump.skgif; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct; -import eu.dnetlib.dhp.skgif.model.Organization; -import eu.dnetlib.dhp.skgif.model.Prefixes; -import eu.dnetlib.dhp.skgif.model.ResearchProduct; +import java.io.IOException; +import 
java.io.Serializable; +import java.nio.file.Files; +import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -20,77 +21,79 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.io.Serializable; -import java.nio.file.Files; -import java.nio.file.Path; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct; +import eu.dnetlib.dhp.skgif.model.Organization; +import eu.dnetlib.dhp.skgif.model.Prefixes; +import eu.dnetlib.dhp.skgif.model.ResearchProduct; /** * @author miriam.baglioni * @Date 22/02/24 */ public class DumpOrganizationTest implements Serializable { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(DumpOrganizationTest.class); + private static final Logger log = LoggerFactory.getLogger(DumpOrganizationTest.class); - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(DumpOrganizationTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(DumpOrganizationTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(DumpOrganizationTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(DumpOrganizationTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(DumpOrganizationTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(DumpOrganizationTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testDumpOrganization() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") - .getPath(); + @Test + public void testDumpOrganization() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") + .getPath(); + DumpOrganization + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-outputPath", workingDir.toString() + "/" - 
DumpOrganization - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-outputPath", workingDir.toString() + "/" + }); - }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD organization = sc + .textFile(workingDir.toString() + "/Organization") + .map(item -> OBJECT_MAPPER.readValue(item, Organization.class)); - JavaRDD organization = sc - .textFile(workingDir.toString() + "/Organization") - .map(item -> OBJECT_MAPPER.readValue(item, Organization.class)); - - Dataset organizationDataset = spark.createDataset(organization.rdd(), Encoders.bean(Organization.class)); -Assertions.assertEquals(34-19,organizationDataset.count()); - organizationDataset.show(false); + Dataset organizationDataset = spark + .createDataset(organization.rdd(), Encoders.bean(Organization.class)); + Assertions.assertEquals(34 - 19, organizationDataset.count()); + organizationDataset.show(false); // Assertions.assertEquals(7, relationDataset.count()); // RelationPerProduct temp = relationDataset.filter((FilterFunction) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first(); // Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size()); @@ -118,5 +121,5 @@ Assertions.assertEquals(34-19,organizationDataset.count()); // // - } + } } diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResultTest.java index e64dd33..473d128 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResultTest.java @@ -1,10 +1,13 @@ + package eu.dnetlib.dhp.oa.graph.dump.skgif; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; -import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct; -import eu.dnetlib.dhp.skgif.model.*; -import eu.dnetlib.dhp.utils.DHPUtils; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.stream.Collectors; + +import javax.validation.constraints.AssertTrue; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -21,261 +24,456 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.validation.constraints.AssertTrue; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.stream.Collectors; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; +import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct; +import eu.dnetlib.dhp.skgif.model.*; +import eu.dnetlib.dhp.utils.DHPUtils; /** * @author miriam.baglioni * @Date 20/02/24 */ public class DumpResultTest { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(DumpResultTest.class); + private static final Logger log = 
LoggerFactory.getLogger(DumpResultTest.class); - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(DumpResultTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(DumpResultTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(DumpResultTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(DumpResultTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(DumpResultTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(DumpResultTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testEmitFromResult() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") - .getPath(); + @Test + public void testEmitFromResult() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/") + .getPath(); - final String workingDir = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/") - .getPath(); + final String workingDir = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/") + .getPath(); - DumpResult - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-workingDir", workingDir + DumpResult + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-workingDir", workingDir - }); + }); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD relation = sc - .textFile(workingDir + "/aggrelation") - .map(item -> OBJECT_MAPPER.readValue(item, RelationPerProduct.class)); + JavaRDD relation = sc + .textFile(workingDir + "/aggrelation") + .map(item -> OBJECT_MAPPER.readValue(item, RelationPerProduct.class)); - Dataset relationDataset = spark.createDataset(relation.rdd(), Encoders.bean(RelationPerProduct.class)); + Dataset relationDataset = spark + .createDataset(relation.rdd(), Encoders.bean(RelationPerProduct.class)); - relationDataset.show(false); - Assertions.assertEquals(7, relationDataset.count()); - RelationPerProduct temp = relationDataset.filter((FilterFunction) r -> 
r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first(); - Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size()); - Assertions.assertEquals(1, temp.getFunding().size()); - Assertions.assertEquals(2, temp.getRelatedProduct().size()); - Assertions.assertEquals(1, temp.getRelatedProduct().stream().filter(rp -> rp.getRelation_type().equalsIgnoreCase("issupplementedby")).count()); - Assertions.assertEquals(1, temp.getRelatedProduct().stream().filter(rp -> rp.getRelation_type().equalsIgnoreCase("isdocumentedby")).count()); + relationDataset.show(false); + Assertions.assertEquals(7, relationDataset.count()); + RelationPerProduct temp = relationDataset + .filter( + (FilterFunction) r -> r + .getResultId() + .equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")) + .first(); + Assertions + .assertEquals( + 3, temp.getFunding().size() + temp.getRelatedProduct().size() + temp.getOrganizations().size()); + Assertions.assertEquals(1, temp.getFunding().size()); + Assertions.assertEquals(2, temp.getRelatedProduct().size()); + Assertions + .assertEquals( + 1, + temp + .getRelatedProduct() + .stream() + .filter(rp -> rp.getRelation_type().equalsIgnoreCase("issupplementedby")) + .count()); + Assertions + .assertEquals( + 1, + temp + .getRelatedProduct() + .stream() + .filter(rp -> rp.getRelation_type().equalsIgnoreCase("isdocumentedby")) + .count()); - JavaRDD researchProduct = sc - .textFile(workingDir.toString() + "/publication/researchproduct") - .map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class)); + JavaRDD researchProduct = sc + .textFile(workingDir.toString() + "/publication/researchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class)); - org.apache.spark.sql.Dataset researchProductDataset = spark - .createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class)); + org.apache.spark.sql.Dataset researchProductDataset = spark + .createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class)); - Assertions.assertEquals(1, researchProductDataset.filter((FilterFunction) p -> p.getLocal_identifier().equalsIgnoreCase(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))).count()); - ResearchProduct product = researchProductDataset.filter((FilterFunction) p -> p.getLocal_identifier().equalsIgnoreCase(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))).first(); - Assertions.assertEquals(2, product.getRelevant_organizations().size()); - Assertions.assertEquals(1, product.getFunding().size()); - Assertions.assertEquals(0, product.getRelated_products().size()); - Assertions.assertEquals(1, product.getContributions().size()); - Assertions.assertEquals(2, product.getManifestations().size()); + Assertions + .assertEquals( + 1, + researchProductDataset + .filter( + (FilterFunction) p -> p + .getLocal_identifier() + .equalsIgnoreCase( + Utils + .getIdentifier( + Prefixes.RESEARCH_PRODUCT, + "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))) + .count()); + ResearchProduct product = researchProductDataset + .filter( + (FilterFunction) p -> p + .getLocal_identifier() + .equalsIgnoreCase( + Utils + .getIdentifier( + Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))) + .first(); + Assertions.assertEquals(2, product.getRelevant_organizations().size()); + Assertions.assertEquals(1, product.getFunding().size()); + 
Assertions.assertEquals(0, product.getRelated_products().size()); + Assertions.assertEquals(1, product.getContributions().size()); + Assertions.assertEquals(2, product.getManifestations().size()); - researchProductDataset.show(false); + researchProductDataset.show(false); + } + @Test + public void testEmitFromDedupedResult() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/") + .getPath(); - } + final String workingDir = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir_complete_entities/") + .getPath(); - @Test - public void testEmitFromDedupedResult() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/") - .getPath(); + DumpResult + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-workingDir", workingDir, + "-outputPath", workingDir - final String workingDir = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir_complete_entities/") - .getPath(); + }); - DumpResult - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-workingDir", workingDir, - "-outputPath", workingDir + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - }); + JavaRDD researchProduct = sc + .textFile(workingDir.toString() + "ResearchProduct") + .map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class)); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + org.apache.spark.sql.Dataset researchProductDataset = spark + .createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class)); - JavaRDD researchProduct = sc - .textFile(workingDir.toString() + "ResearchProduct") - .map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class)); + Assertions.assertEquals(1, researchProductDataset.count()); - org.apache.spark.sql.Dataset researchProductDataset = spark - .createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class)); + ResearchProduct rp = researchProductDataset.first(); - Assertions.assertEquals(1, researchProductDataset.count()); + // check the local identifier + Assertions.assertEquals("product_____::e22a152ab43b9215d14ece613f76ec84", rp.getLocal_identifier()); - ResearchProduct rp = researchProductDataset.first(); + // check the pids of the result + Assertions.assertEquals(3, rp.getIdentifiers().size()); + Assertions + .assertEquals(1, rp.getIdentifiers().stream().filter(p -> p.getScheme().equalsIgnoreCase("doi")).count()); + Assertions + .assertEquals( + "10.1007/s40199-021-00403-x", + rp + .getIdentifiers() + .stream() + .filter(p -> p.getScheme().equalsIgnoreCase("doi")) + .collect(Collectors.toList()) + .get(0) + .getValue()); + Assertions + .assertEquals(1, rp.getIdentifiers().stream().filter(p -> p.getScheme().equalsIgnoreCase("pmid")).count()); + Assertions + .assertEquals( + "34327650", + rp + .getIdentifiers() + .stream() + .filter(p -> p.getScheme().equalsIgnoreCase("pmid")) + .collect(Collectors.toList()) + .get(0) + .getValue()); + Assertions + .assertEquals(1, rp.getIdentifiers().stream().filter(p -> p.getScheme().equalsIgnoreCase("pmc")).count()); + Assertions + .assertEquals( + "PMC8602609", + rp + .getIdentifiers() + .stream() + .filter(p -> p.getScheme().equalsIgnoreCase("pmc")) + .collect(Collectors.toList()) + .get(0) + .getValue()); - //check the local identifier 
- Assertions.assertEquals("product_____::e22a152ab43b9215d14ece613f76ec84", rp.getLocal_identifier()); + // check the title + Assertions.assertEquals(1, rp.getTitles().keySet().size()); + Assertions.assertTrue(rp.getTitles().keySet().contains("none")); + Assertions.assertEquals(1, rp.getTitles().get("none").size()); - //check the pids of the result - Assertions.assertEquals(3,rp.getIdentifiers().size()); - Assertions.assertEquals(1, rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("doi")).count()); - Assertions.assertEquals("10.1007/s40199-021-00403-x", rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("doi")).collect(Collectors.toList()).get(0).getValue()); - Assertions.assertEquals(1, rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmid")).count()); - Assertions.assertEquals("34327650", rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmid")).collect(Collectors.toList()).get(0).getValue()); - Assertions.assertEquals(1, rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmc")).count()); - Assertions.assertEquals("PMC8602609", rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmc")).collect(Collectors.toList()).get(0).getValue()); + // check abstract + Assertions.assertEquals(1, rp.getAbstracts().keySet().size()); + Assertions.assertTrue(rp.getAbstracts().keySet().contains("none")); + Assertions.assertEquals(1, rp.getAbstracts().get("none").size()); - //check the title - Assertions.assertEquals(1, rp.getTitles().keySet().size()); - Assertions.assertTrue(rp.getTitles().keySet().contains("none")); - Assertions.assertEquals(1, rp.getTitles().get("none").size()); + // check type + Assertions.assertEquals("literature", rp.getProduct_type()); - //check abstract - Assertions.assertEquals(1, rp.getAbstracts().keySet().size()); - Assertions.assertTrue(rp.getAbstracts().keySet().contains("none")); - Assertions.assertEquals(1, rp.getAbstracts().get("none").size()); + // check topics + Assertions.assertEquals(3, rp.getTopics().size()); + Assertions + .assertTrue( + rp + .getTopics() + .stream() + .anyMatch( + t -> t + .getTopic() + .equalsIgnoreCase(Prefixes.TOPIC.label + DHPUtils.md5("FOSSustained delivery")))); - //check type - Assertions.assertEquals("literature", rp.getProduct_type()); + // check contributions + Assertions.assertEquals(4, rp.getContributions().size()); + Assertions + .assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("person")).count()); + Assertions + .assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("temp")).count()); + rp.getContributions().forEach(c -> Assertions.assertTrue(c.getDeclared_affiliation() == null)); + Assertions + .assertEquals( + 1, + rp + .getContributions() + .stream() + .filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-8284-6269true"))) + .collect(Collectors.toList()) + .get(0) + .getRank()); + Assertions + .assertEquals( + 2, + rp + .getContributions() + .stream() + .filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0002-0940-893xtrue"))) + .collect(Collectors.toList()) + .get(0) + .getRank()); + Assertions + .assertEquals( + 3, + rp + .getContributions() + .stream() + .filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-5291-577xtrue"))) + .collect(Collectors.toList()) + .get(0) + .getRank()); + Assertions + .assertEquals( + 4, + rp + .getContributions() + .stream() + .filter( + c 
-> c + .getPerson() + .equals( + Utils + .getIdentifier( + Prefixes.TEMPORARY_PERSON, + "50|doi_dedup___::0000661be7c602727bae9690778b16514"))) + .collect(Collectors.toList()) + .get(0) + .getRank()); + researchProductDataset.show(10, 100, true); - //check topics - Assertions.assertEquals(3, rp.getTopics().size()); - Assertions.assertTrue(rp.getTopics().stream().anyMatch(t -> t.getTopic().equalsIgnoreCase(Prefixes.TOPIC.label + DHPUtils.md5("FOSSustained delivery")))); + // check manifestation 1 + Assertions.assertEquals(3, rp.getManifestations().size()); + Manifestation manifestation = rp + .getManifestations() + .stream() + .filter( + m -> m + .getHosting_datasource() + .equals( + Utils.getIdentifier(Prefixes.DATASOURCE, "10|doajarticles::6107489403b31fc7cf37cb7fda35f7f1"))) + .collect(Collectors.toList()) + .get(0); + Assertions.assertEquals("Article", manifestation.getProduct_local_type()); + Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema()); + Assertions.assertEquals(1, manifestation.getDates().size()); + Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue()); + Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType()); + Assertions.assertEquals(PeerReview.PEER_REVIEWED.label, manifestation.getPeer_review()); + Assertions.assertEquals("unavailable", manifestation.getMetadata_curation()); + Assertions.assertEquals(AccessRight.CLOSED.label, manifestation.getAccess_right()); + Assertions.assertEquals("Springer Nature TDM", manifestation.getLicence()); + Assertions.assertEquals("https://doi.org/10.1007/s40199-021-00403-x", manifestation.getUrl()); + Assertions.assertEquals("10.1007/s40199-021-00403-x", manifestation.getPid()); + Assertions.assertTrue(manifestation.getBiblio() != null); + Biblio biblio = manifestation.getBiblio(); + Assertions.assertTrue(biblio.getEdition() == null); + Assertions.assertTrue(biblio.getIssue() == null); + Assertions.assertEquals("Springer Science and Business Media LLC", biblio.getPublisher()); + Assertions.assertEquals("29", biblio.getVolume()); + Assertions.assertEquals("415", biblio.getStart_page()); + Assertions.assertEquals("438", biblio.getEnd_page()); - //check contributions - Assertions.assertEquals(4, rp.getContributions().size()); - Assertions.assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("person")).count()); - Assertions.assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("temp")).count()); - rp.getContributions().forEach(c -> Assertions.assertTrue(c.getDeclared_affiliation()==null)); - Assertions.assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-8284-6269true"))) - .collect(Collectors.toList()).get(0).getRank()); - Assertions.assertEquals(2, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0002-0940-893xtrue"))) - .collect(Collectors.toList()).get(0).getRank()); - Assertions.assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-5291-577xtrue"))) - .collect(Collectors.toList()).get(0).getRank()); - Assertions.assertEquals(4, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, "50|doi_dedup___::0000661be7c602727bae9690778b16514"))) - .collect(Collectors.toList()).get(0).getRank()); - 
researchProductDataset.show(10,100,true); + // check manifestation 2 + manifestation = rp + .getManifestations() + .stream() + .filter( + m -> m + .getHosting_datasource() + .equals( + Utils.getIdentifier(Prefixes.DATASOURCE, "10|openaire____::55045bd2a65019fd8e6741a755395c8c"))) + .collect(Collectors.toList()) + .get(0); + Assertions.assertEquals("Article", manifestation.getProduct_local_type()); + Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema()); + Assertions.assertEquals(1, manifestation.getDates().size()); + Assertions.assertEquals("2020-01-03", manifestation.getDates().get(0).getValue()); + Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType()); + Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review()); + Assertions.assertEquals("unavailable", manifestation.getMetadata_curation()); + Assertions.assertEquals(AccessRight.UNAVAILABLE.label, manifestation.getAccess_right()); + Assertions.assertEquals(null, manifestation.getLicence()); + Assertions.assertEquals("https://pubmed.ncbi.nlm.nih.gov/34327650", manifestation.getUrl()); + Assertions.assertEquals("34327650", manifestation.getPid()); + Assertions.assertTrue(manifestation.getBiblio() == null); - //check manifestation 1 - Assertions.assertEquals(3, rp.getManifestations().size()); - Manifestation manifestation = rp.getManifestations().stream().filter(m -> m.getHosting_datasource().equals(Utils.getIdentifier(Prefixes.DATASOURCE , "10|doajarticles::6107489403b31fc7cf37cb7fda35f7f1"))) - .collect(Collectors.toList()).get(0); - Assertions.assertEquals("Article" , manifestation.getProduct_local_type()); - Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema()); - Assertions.assertEquals(1, manifestation.getDates().size()); - Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue()); - Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType()); - Assertions.assertEquals(PeerReview.PEER_REVIEWED.label, manifestation.getPeer_review()); - Assertions.assertEquals("unavailable", manifestation.getMetadata_curation()); - Assertions.assertEquals(AccessRight.CLOSED.label, manifestation.getAccess_right()); - Assertions.assertEquals("Springer Nature TDM", manifestation.getLicence()); - Assertions.assertEquals("https://doi.org/10.1007/s40199-021-00403-x", manifestation.getUrl()); - Assertions.assertEquals("10.1007/s40199-021-00403-x", manifestation.getPid()); - Assertions.assertTrue(manifestation.getBiblio() != null); - Biblio biblio = manifestation.getBiblio(); - Assertions.assertTrue(biblio.getEdition() == null); - Assertions.assertTrue(biblio.getIssue() == null); - Assertions.assertEquals("Springer Science and Business Media LLC",biblio.getPublisher() ); - Assertions.assertEquals("29", biblio.getVolume()); - Assertions.assertEquals("415", biblio.getStart_page()); - Assertions.assertEquals("438", biblio.getEnd_page()); + // check manifestation 3 + manifestation = rp + .getManifestations() + .stream() + .filter( + m -> m + .getHosting_datasource() + .equals( + Utils.getIdentifier(Prefixes.DATASOURCE, "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"))) + .collect(Collectors.toList()) + .get(0); + Assertions.assertEquals("Other literature type", manifestation.getProduct_local_type()); + Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema()); + Assertions.assertEquals(1, manifestation.getDates().size()); + 
Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue()); + Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType()); + Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review()); + Assertions.assertEquals("unavailable", manifestation.getMetadata_curation()); + Assertions.assertEquals(AccessRight.OPEN.label, manifestation.getAccess_right()); + Assertions.assertEquals(null, manifestation.getLicence()); + Assertions.assertEquals("https://europepmc.org/articles/PMC8602609/", manifestation.getUrl()); + Assertions.assertEquals("PMC8602609", manifestation.getPid()); + Assertions.assertTrue(manifestation.getBiblio() == null); - //check manifestation 2 - manifestation = rp.getManifestations().stream().filter(m -> m.getHosting_datasource().equals(Utils.getIdentifier(Prefixes.DATASOURCE , "10|openaire____::55045bd2a65019fd8e6741a755395c8c"))) - .collect(Collectors.toList()).get(0); - Assertions.assertEquals("Article" , manifestation.getProduct_local_type()); - Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema()); - Assertions.assertEquals(1, manifestation.getDates().size()); - Assertions.assertEquals("2020-01-03", manifestation.getDates().get(0).getValue()); - Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType()); - Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review()); - Assertions.assertEquals("unavailable", manifestation.getMetadata_curation()); - Assertions.assertEquals(AccessRight.UNAVAILABLE.label, manifestation.getAccess_right()); - Assertions.assertEquals(null, manifestation.getLicence()); - Assertions.assertEquals("https://pubmed.ncbi.nlm.nih.gov/34327650", manifestation.getUrl()); - Assertions.assertEquals("34327650", manifestation.getPid()); - Assertions.assertTrue(manifestation.getBiblio() == null); + // check relevant organization + Assertions.assertEquals(1, rp.getRelevant_organizations().size()); + Assertions + .assertEquals( + Prefixes.ORGANIZATION.label + "601e510b1fda7cc6cb03329531502171", + rp.getRelevant_organizations().get(0)); - //check manifestation 3 - manifestation = rp.getManifestations().stream().filter(m -> m.getHosting_datasource().equals(Utils.getIdentifier(Prefixes.DATASOURCE , "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c"))) - .collect(Collectors.toList()).get(0); - Assertions.assertEquals("Other literature type" , manifestation.getProduct_local_type()); - Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema()); - Assertions.assertEquals(1, manifestation.getDates().size()); - Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue()); - Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType()); - Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review()); - Assertions.assertEquals("unavailable", manifestation.getMetadata_curation()); - Assertions.assertEquals(AccessRight.OPEN.label, manifestation.getAccess_right()); - Assertions.assertEquals(null, manifestation.getLicence()); - Assertions.assertEquals("https://europepmc.org/articles/PMC8602609/", manifestation.getUrl()); - Assertions.assertEquals("PMC8602609", manifestation.getPid()); - Assertions.assertTrue(manifestation.getBiblio() == null); + // check funding + Assertions.assertEquals(1, rp.getFunding().size()); + Assertions.assertEquals(Prefixes.GRANT.label + "a7795022763d413f5de59036ebbd0c52", 
rp.getFunding().get(0)); - //check relevant organization - Assertions.assertEquals(1,rp.getRelevant_organizations().size()); - Assertions.assertEquals(Prefixes.ORGANIZATION.label + "601e510b1fda7cc6cb03329531502171", rp.getRelevant_organizations().get(0)); + // check related products + Assertions.assertEquals(5, rp.getRelated_products().size()); + Assertions + .assertEquals( + 4, + rp + .getRelated_products() + .stream() + .filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.CITATION.label)) + .collect(Collectors.toList()) + .get(0) + .getProduct_list() + .size()); + Assertions + .assertEquals( + 1, + rp + .getRelated_products() + .stream() + .filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.DOCUMENTS.label)) + .collect(Collectors.toList()) + .get(0) + .getProduct_list() + .size()); + Assertions + .assertEquals( + 1, + rp + .getRelated_products() + .stream() + .filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.PART.label)) + .collect(Collectors.toList()) + .get(0) + .getProduct_list() + .size()); + Assertions + .assertEquals( + 1, + rp + .getRelated_products() + .stream() + .filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.SUPPLEMENT.label)) + .collect(Collectors.toList()) + .get(0) + .getProduct_list() + .size()); + Assertions + .assertEquals( + 1, + rp + .getRelated_products() + .stream() + .filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.VERSION.label)) + .collect(Collectors.toList()) + .get(0) + .getProduct_list() + .size()); - - //check funding - Assertions.assertEquals(1,rp.getFunding().size()); - Assertions.assertEquals(Prefixes.GRANT.label + "a7795022763d413f5de59036ebbd0c52", rp.getFunding().get(0)); - - //check related products - Assertions.assertEquals(5, rp.getRelated_products().size()); - Assertions.assertEquals(4, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.CITATION.label)).collect(Collectors.toList()).get(0).getProduct_list().size()); - Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.DOCUMENTS.label)).collect(Collectors.toList()).get(0).getProduct_list().size()); - Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.PART.label)).collect(Collectors.toList()).get(0).getProduct_list().size()); - Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.SUPPLEMENT.label)).collect(Collectors.toList()).get(0).getProduct_list().size()); - Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.VERSION.label)).collect(Collectors.toList()).get(0).getProduct_list().size()); - - - } + } } diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResultJobTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResultJobTest.java index 006cbe5..1ac3cfc 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResultJobTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/skgif/EmitFromResultJobTest.java @@ -5,7 +5,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import eu.dnetlib.dhp.skgif.model.Topic; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -24,9 +23,9 @@ import org.slf4j.LoggerFactory; import 
com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation; import eu.dnetlib.dhp.skgif.model.Persons; +import eu.dnetlib.dhp.skgif.model.Topic; //@Disabled public class EmitFromResultJobTest { @@ -101,17 +100,54 @@ public class EmitFromResultJobTest { .createDataset(persons.rdd(), Encoders.bean(Persons.class)); personsDataset.show(false); - Persons claudiaBorer = personsDataset.filter((FilterFunction) p -> p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db")) - .first(); + Persons claudiaBorer = personsDataset + .filter( + (FilterFunction) p -> p + .getLocal_identifier() + .equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db")) + .first(); - Assertions.assertEquals(2, personsDataset.filter((FilterFunction) p -> p.getGiven_name().equalsIgnoreCase("claudia") && p.getFamily_name().equalsIgnoreCase("borer")).count()); - Assertions.assertEquals(1, personsDataset.filter((FilterFunction) p -> p.getGiven_name().equalsIgnoreCase("claudia") && p.getFamily_name().equalsIgnoreCase("borer") && !p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db")).count()); + Assertions + .assertEquals( + 2, + personsDataset + .filter( + (FilterFunction) p -> p.getGiven_name().equalsIgnoreCase("claudia") + && p.getFamily_name().equalsIgnoreCase("borer")) + .count()); + Assertions + .assertEquals( + 1, + personsDataset + .filter( + (FilterFunction) p -> p.getGiven_name().equalsIgnoreCase("claudia") + && p.getFamily_name().equalsIgnoreCase("borer") + && !p + .getLocal_identifier() + .equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db")) + .count()); Assertions.assertEquals("claudia", claudiaBorer.getGiven_name().toLowerCase()); Assertions.assertEquals("borer", claudiaBorer.getFamily_name().toLowerCase()); - Assertions.assertEquals(2, personsDataset.filter((FilterFunction) p -> p.getLocal_identifier().startsWith("person")).count()); - Assertions.assertEquals(1, personsDataset.filter((FilterFunction) p -> p.getLocal_identifier().startsWith("person") && p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916")).count()); - Persons orcidPerson = personsDataset.filter((FilterFunction) p -> p.getLocal_identifier().startsWith("person") && p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916")).first(); + Assertions + .assertEquals( + 2, + personsDataset + .filter((FilterFunction) p -> p.getLocal_identifier().startsWith("person")) + .count()); + Assertions + .assertEquals( + 1, + personsDataset + .filter( + (FilterFunction) p -> p.getLocal_identifier().startsWith("person") + && p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916")) + .count()); + Persons orcidPerson = personsDataset + .filter( + (FilterFunction) p -> p.getLocal_identifier().startsWith("person") + && p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916")) + .first(); Assertions.assertEquals("M.", orcidPerson.getGiven_name()); Assertions.assertEquals("Kooi", orcidPerson.getFamily_name()); Assertions.assertEquals(1, orcidPerson.getIdentifiers().size()); @@ -119,58 +155,57 @@ public class EmitFromResultJobTest { Assertions.assertEquals("0000-0002-5597-4916", orcidPerson.getIdentifiers().get(0).getValue()); Dataset manifestationDataset = spark - .createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class)); + .createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class)); manifestationDataset.show(false); Assertions.assertEquals(4, 
manifestationDataset.count()); Dataset topicDataset = spark - .createDataset(topics.rdd(), Encoders.bean(Topic.class)); + .createDataset(topics.rdd(), Encoders.bean(Topic.class)); Assertions.assertEquals(0, topicDataset.count()); } - @Test public void testEmitFromResultComplete() throws Exception { final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/") - .getPath(); + .getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/") + .getPath(); EmitFromResults - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", sourcePath, - "-outputPath", workingDir.toString() + "/result/", - "-workingDir", workingDir.toString() + "/" + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-outputPath", workingDir.toString() + "/result/", + "-workingDir", workingDir.toString() + "/" - }); + }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD persons = sc - .textFile(workingDir.toString() + "/result/Persons") - .map(item -> OBJECT_MAPPER.readValue(item, Persons.class)); + .textFile(workingDir.toString() + "/result/Persons") + .map(item -> OBJECT_MAPPER.readValue(item, Persons.class)); org.apache.spark.sql.Dataset personsDataset = spark - .createDataset(persons.rdd(), Encoders.bean(Persons.class)); + .createDataset(persons.rdd(), Encoders.bean(Persons.class)); personsDataset.foreach((ForeachFunction) p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); JavaRDD topics = sc - .textFile(workingDir.toString() + "/result/Topic") - .map(item -> OBJECT_MAPPER.readValue(item, Topic.class)); - Dataset topicDataset = spark - .createDataset(topics.rdd(), Encoders.bean(Topic.class)); + .textFile(workingDir.toString() + "/result/Topic") + .map(item -> OBJECT_MAPPER.readValue(item, Topic.class)); + Dataset topicDataset = spark + .createDataset(topics.rdd(), Encoders.bean(Topic.class)); Assertions.assertEquals(3, topicDataset.count()); topicDataset.foreach((ForeachFunction) t -> System.out.println(OBJECT_MAPPER.writeValueAsString(t))); JavaRDD manifestation = sc - .textFile(workingDir.toString() + "/publication/manifestation") - .map(item -> OBJECT_MAPPER.readValue(item, EmitPerManifestation.class)); + .textFile(workingDir.toString() + "/publication/manifestation") + .map(item -> OBJECT_MAPPER.readValue(item, EmitPerManifestation.class)); Dataset manifestationDataset = spark - .createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class)); + .createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class)); manifestationDataset.show(false); // Persons claudiaBorer = personsDataset.filter((FilterFunction) p -> p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db")) @@ -194,6 +229,5 @@ public class EmitFromResultJobTest { // Assertions.assertEquals(4, manifestationDataset.count()); // - } }
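For reference, the identifier scheme these assertions exercise is the one implemented by Utils.getIdentifier in the Utils.java hunk above: a stable SKG-IF local identifier is the entity-type prefix concatenated with the MD5 of the OpenAIRE identifier.

// Restated from the Utils.java hunk above:
public static String getIdentifier(Prefixes entity, String id) {
	return entity.label + DHPUtils.md5(id);
}

// Judging from DumpResultTest, Prefixes.RESEARCH_PRODUCT.label is "product_____::",
// so a merged publication id maps to a stable value such as
//   Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")
// while ORCID-backed person ids hash the ORCID plus a flag, as in
//   Utils.getIdentifier(Prefixes.PERSON, "0000-0001-8284-6269true")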