[SKG-IF] refactoring and fixing issues

Miriam Baglioni 2024-03-01 09:35:15 +01:00
parent 0c887ca015
commit 752fd896e4
32 changed files with 2082 additions and 1496 deletions


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
@ -7,8 +8,9 @@ import java.io.Serializable;
* @Date 22/02/24
*/
public class Contributor implements Serializable {
private String person; //I would not map it because we have only information regarding the person (if any) associated to the leading organization
private String organization ; //contributors.person
private String person; // I would not map it because we have only information regarding the person (if any)
// associated to the leading organization
private String organization; // contributors.person
private String role ;//private
private String role;// private
}


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
@ -8,143 +9,146 @@ import java.util.List;
* @Date 21/02/24
*/
public class Datasource implements Serializable {
private String local_identifier ;//id
private List<Identifier> identifiers; //.schema pid.qualifier.classid;identifiers.value pid.value
private String name; //officialname.value
private String submission_policy_url;// submissionpolicyurl
private String preservation_policy_url;// preservationpolicyurl
private Boolean version_control;// versioncontrol bool
private List<PersistentIdentitySystems> persistent_identity_systems;//. product_type researchentitytype list type to be remapped to the eosc types
//persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
private String jurisdiction;// jurisdiction.classname
private String data_source_classification;// eoscdatasourcetype.classname
private List<String> research_product_type;// researchentitytype list type to be remapped to the eosc types
private Boolean thematic ;//thematic bool
private List<Licence> research_product_license; //.name not mappable listresearch_product_license.url not mappable
private List<String> research_product_access_policy;// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
//if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
//if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
private List<Licence> research_product_metadata_license; //.name not mappable list
//research_product_metadata_license.url not mappable
private List<String>research_product_metadata_access_policy ;//researchproductmetadataccesspolicies list with the same mapping of research_product_access_policy
private String local_identifier;// id
private List<Identifier> identifiers; // .schema pid.qualifier.classid;identifiers.value pid.value
private String name; // officialname.value
private String submission_policy_url;// submissionpolicyurl
private String preservation_policy_url;// preservationpolicyurl
private Boolean version_control;// versioncontrol bool
private List<PersistentIdentitySystems> persistent_identity_systems;// . product_type researchentitytype list type
// to be remapped to the eosc types
// persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
private String jurisdiction;// jurisdiction.classname
private String data_source_classification;// eoscdatasourcetype.classname
private List<String> research_product_type;// researchentitytype list type to be remapped to the eosc types
private Boolean thematic;// thematic bool
private List<Licence> research_product_license; // .name not mappable listresearch_product_license.url not mappable
private List<String> research_product_access_policy;// "databaseaccesstype if open => open access
// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
private List<Licence> research_product_metadata_license; // .name not mappable list
// research_product_metadata_license.url not mappable
private List<String> research_product_metadata_access_policy;// researchproductmetadataccesspolicies list with the
// same mapping of research_product_access_policy
public String getLocal_identifier() {
return local_identifier;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public void setName(String name) {
this.name = name;
}
public String getSubmission_policy_url() {
return submission_policy_url;
}
public String getSubmission_policy_url() {
return submission_policy_url;
}
public void setSubmission_policy_url(String submission_policy_url) {
this.submission_policy_url = submission_policy_url;
}
public void setSubmission_policy_url(String submission_policy_url) {
this.submission_policy_url = submission_policy_url;
}
public String getPreservation_policy_url() {
return preservation_policy_url;
}
public String getPreservation_policy_url() {
return preservation_policy_url;
}
public void setPreservation_policy_url(String preservation_policy_url) {
this.preservation_policy_url = preservation_policy_url;
}
public void setPreservation_policy_url(String preservation_policy_url) {
this.preservation_policy_url = preservation_policy_url;
}
public Boolean getVersion_control() {
return version_control;
}
public Boolean getVersion_control() {
return version_control;
}
public void setVersion_control(Boolean version_control) {
this.version_control = version_control;
}
public void setVersion_control(Boolean version_control) {
this.version_control = version_control;
}
public List<PersistentIdentitySystems> getPersistent_identity_systems() {
return persistent_identity_systems;
}
public List<PersistentIdentitySystems> getPersistent_identity_systems() {
return persistent_identity_systems;
}
public void setPersistent_identity_systems(List<PersistentIdentitySystems> persistent_identity_systems) {
this.persistent_identity_systems = persistent_identity_systems;
}
public void setPersistent_identity_systems(List<PersistentIdentitySystems> persistent_identity_systems) {
this.persistent_identity_systems = persistent_identity_systems;
}
public String getJurisdiction() {
return jurisdiction;
}
public String getJurisdiction() {
return jurisdiction;
}
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public String getData_source_classification() {
return data_source_classification;
}
public String getData_source_classification() {
return data_source_classification;
}
public void setData_source_classification(String data_source_classification) {
this.data_source_classification = data_source_classification;
}
public void setData_source_classification(String data_source_classification) {
this.data_source_classification = data_source_classification;
}
public List<String> getResearch_product_type() {
return research_product_type;
}
public List<String> getResearch_product_type() {
return research_product_type;
}
public void setResearch_product_type(List<String> research_product_type) {
this.research_product_type = research_product_type;
}
public void setResearch_product_type(List<String> research_product_type) {
this.research_product_type = research_product_type;
}
public Boolean getThematic() {
return thematic;
}
public Boolean getThematic() {
return thematic;
}
public void setThematic(Boolean thematic) {
this.thematic = thematic;
}
public void setThematic(Boolean thematic) {
this.thematic = thematic;
}
public List<Licence> getResearch_product_license() {
return research_product_license;
}
public List<Licence> getResearch_product_license() {
return research_product_license;
}
public void setResearch_product_license(List<Licence> research_product_license) {
this.research_product_license = research_product_license;
}
public void setResearch_product_license(List<Licence> research_product_license) {
this.research_product_license = research_product_license;
}
public List<String> getResearch_product_access_policy() {
return research_product_access_policy;
}
public List<String> getResearch_product_access_policy() {
return research_product_access_policy;
}
public void setResearch_product_access_policy(List<String> research_product_access_policy) {
this.research_product_access_policy = research_product_access_policy;
}
public void setResearch_product_access_policy(List<String> research_product_access_policy) {
this.research_product_access_policy = research_product_access_policy;
}
public List<Licence> getResearch_product_metadata_license() {
return research_product_metadata_license;
}
public List<Licence> getResearch_product_metadata_license() {
return research_product_metadata_license;
}
public void setResearch_product_metadata_license(List<Licence> research_product_metadata_license) {
this.research_product_metadata_license = research_product_metadata_license;
}
public void setResearch_product_metadata_license(List<Licence> research_product_metadata_license) {
this.research_product_metadata_license = research_product_metadata_license;
}
public List<String> getResearch_product_metadata_access_policy() {
return research_product_metadata_access_policy;
}
public List<String> getResearch_product_metadata_access_policy() {
return research_product_metadata_access_policy;
}
public void setResearch_product_metadata_access_policy(List<String> research_product_metadata_access_policy) {
this.research_product_metadata_access_policy = research_product_metadata_access_policy;
}
public void setResearch_product_metadata_access_policy(List<String> research_product_metadata_access_policy) {
this.research_product_metadata_access_policy = research_product_metadata_access_policy;
}
}
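To make the access-policy comments above concrete: the databaseaccesstype terms map onto the COAR access-rights vocabulary. Below is a minimal sketch of that lookup as a plain helper class; the authoritative logic is getResearchProductAccessPolicy in DumpDatasource, further down in this commit.

public class AccessPolicySketch {
	// Maps an OpenAIRE databaseaccesstype term to the COAR access-rights label used in the SKG-IF dump.
	public static String toCoarAccessRight(String databaseAccessType) {
		switch (databaseAccessType) {
			case "open":
				return "open access"; // https://vocabularies.coar-repositories.org/access_rights/c_abf2/
			case "restricted":
				return "restricted access"; // https://vocabularies.coar-repositories.org/access_rights/c_16ec/
			case "closed":
				return "metadata only access"; // https://vocabularies.coar-repositories.org/access_rights/c_14cb/
			default:
				return null; // unknown terms are dropped by the dump
		}
	}
}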


@ -1,153 +1,154 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import org.codehaus.jackson.annotate.JsonProperty;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class Grant implements Serializable {
private String local_identifier;// id
private List<Identifier> identifiers;//.schema pid.qualifier.classid identifiers.value pid.value
//identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
//identifiers.value project.code
private String local_identifier;// id
private List<Identifier> identifiers;// .schema pid.qualifier.classid identifiers.value pid.value
// identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
// identifiers.value project.code
private String title;// title.value
@JsonProperty(value="abstract")
private String summary ;//summary.value
private String acronym; //acronym.value
private String funder ;//fundingtree to be used the xpath //funder/name
private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
private String currency;// currency.value
private Float funded_amount;//' fundedamount.value
private List<String> keywords;// subject.value
private String start_date;// startdate.value
private String end_date;// enddate.value
private String website;// websiteurl.value
private List<String> beneficiaries;// organization.id for the organizations in the relation with semantic class isParticipant produces the list of organization internal identifiers
private List<Contributor> contributors;//
private String title;// title.value
@JsonProperty(value = "abstract")
private String summary;// summary.value
private String acronym; // acronym.value
private String funder;// fundingtree to be used the xpath //funder/name
private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
private String currency;// currency.value
private Float funded_amount;// ' fundedamount.value
private List<String> keywords;// subject.value
private String start_date;// startdate.value
private String end_date;// enddate.value
private String website;// websiteurl.value
private List<String> beneficiaries;// organization.id for the organizations in the relation with semantic class
// isParticipant produces the list of organization internal identifiers
private List<Contributor> contributors;//
public String getLocal_identifier() {
return local_identifier;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getTitle() {
return title;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSummary() {
return summary;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public String getAcronym() {
return acronym;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getFunder() {
return funder;
}
public String getFunder() {
return funder;
}
public void setFunder(String funder) {
this.funder = funder;
}
public void setFunder(String funder) {
this.funder = funder;
}
public String getFunding_stream() {
return funding_stream;
}
public String getFunding_stream() {
return funding_stream;
}
public void setFunding_stream(String funding_stream) {
this.funding_stream = funding_stream;
}
public void setFunding_stream(String funding_stream) {
this.funding_stream = funding_stream;
}
public String getCurrency() {
return currency;
}
public String getCurrency() {
return currency;
}
public void setCurrency(String currency) {
this.currency = currency;
}
public void setCurrency(String currency) {
this.currency = currency;
}
public Float getFunded_amount() {
return funded_amount;
}
public Float getFunded_amount() {
return funded_amount;
}
public void setFunded_amount(Float funded_amount) {
this.funded_amount = funded_amount;
}
public void setFunded_amount(Float funded_amount) {
this.funded_amount = funded_amount;
}
public List<String> getKeywords() {
return keywords;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getStart_date() {
return start_date;
}
public String getStart_date() {
return start_date;
}
public void setStart_date(String start_date) {
this.start_date = start_date;
}
public void setStart_date(String start_date) {
this.start_date = start_date;
}
public String getEnd_date() {
return end_date;
}
public String getEnd_date() {
return end_date;
}
public void setEnd_date(String end_date) {
this.end_date = end_date;
}
public void setEnd_date(String end_date) {
this.end_date = end_date;
}
public String getWebsite() {
return website;
}
public String getWebsite() {
return website;
}
public void setWebsite(String website) {
this.website = website;
}
public void setWebsite(String website) {
this.website = website;
}
public List<String> getBeneficiaries() {
return beneficiaries;
}
public List<String> getBeneficiaries() {
return beneficiaries;
}
public void setBeneficiaries(List<String> beneficiaries) {
this.beneficiaries = beneficiaries;
}
public void setBeneficiaries(List<String> beneficiaries) {
this.beneficiaries = beneficiaries;
}
public List<Contributor> getContributors() {
return contributors;
}
public List<Contributor> getContributors() {
return contributors;
}
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
}


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
@ -8,77 +9,77 @@ import java.util.List;
* @Date 21/02/24
*/
public class Organization implements Serializable {
private String local_identifier; // id
private List<Identifier> identifiers; // pid.qualifier.classid; pid.value list
private String name ; //legalname.value
private String local_identifier; // id
private List<Identifier> identifiers; // pid.qualifier.classid; pid.value list
private String name; // legalname.value
private String short_name; // legalshortname.value
private List<String> other_names;// alternative_names.value list
private String website ;//websiteurl.value
private String country; // country.classid
private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"
private String short_name; // legalshortname.value
private List<String> other_names;// alternative_names.value list
private String website;// websiteurl.value
private String country; // country.classid
private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"
public String getLocal_identifier() {
return local_identifier;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public void setName(String name) {
this.name = name;
}
public String getShort_name() {
return short_name;
}
public String getShort_name() {
return short_name;
}
public void setShort_name(String short_name) {
this.short_name = short_name;
}
public void setShort_name(String short_name) {
this.short_name = short_name;
}
public List<String> getOther_names() {
return other_names;
}
public List<String> getOther_names() {
return other_names;
}
public void setOther_names(List<String> other_names) {
this.other_names = other_names;
}
public void setOther_names(List<String> other_names) {
this.other_names = other_names;
}
public String getWebsite() {
return website;
}
public String getWebsite() {
return website;
}
public void setWebsite(String website) {
this.website = website;
}
public void setWebsite(String website) {
this.website = website;
}
public String getCountry() {
return country;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
public void setCountry(String country) {
this.country = country;
}
public String getType() {
return type;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public void setType(String type) {
this.type = type;
}
}


@ -1,20 +1,17 @@
package eu.dnetlib.dhp.skgif.model;
public enum OrganizationTypes {
ARCHIVE ("archive"),
ARCHIVE("archive"),
COMPANY("company"),
COMPANY("company"),
EDUCATION("education"),
FACILITY("facility"),
GOVERNMENT("government"),
HEALTHCARE("healthcare"),
NONPROFIT("nonprofit"),
FUNDER("funder"),
OTHER("other");
public final String label;
EDUCATION("education"), FACILITY("facility"), GOVERNMENT("government"), HEALTHCARE("healthcare"), NONPROFIT(
"nonprofit"), FUNDER("funder"), OTHER("other");
private OrganizationTypes(String label) {
this.label = label;
}
public final String label;
private OrganizationTypes(String label) {
this.label = label;
}
}


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;


@ -1,5 +1,5 @@
package eu.dnetlib.dhp.skgif.model;
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
@ -8,24 +8,22 @@ import java.io.Serializable;
* @Date 21/02/24
*/
public enum Prefixes implements Serializable {
RESEARCH_PRODUCT("product_____::"),
RESEARCH_PRODUCT("product_____::"),
ORGANIZATION("organization::"),
ORGANIZATION("organization::"),
GRANT("grant_______::"),
GRANT("grant_______::"),
PERSON(
PERSON(
"person______::"),
TEMPORARY_PERSON("temp_person_::"),
TEMPORARY_PERSON("temp_person_::"),
DATASOURCE("datasource__::"),
TOPIC("topic_______::"),
VENUE("venue_______::");
DATASOURCE("datasource__::"), TOPIC("topic_______::"), VENUE("venue_______::");
public final String label;
public final String label;
private Prefixes(String label) {
this.label = label;
}
}
private Prefixes(String label) {
this.label = label;
}
}
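The labels above are prepended to a per-entity hash to build the local_identifier values used throughout the dump (via Utils.getIdentifier, which is referenced in DumpDatasource, DumpGrant and DumpOrganization below but not changed by this commit). The sketch below only illustrates the assumed composition; the MD5 step and the example id are assumptions, not taken from this diff.

import org.apache.commons.codec.digest.DigestUtils;

import eu.dnetlib.dhp.skgif.model.Prefixes;

public class LocalIdentifierSketch {
	// Assumed shape of Utils.getIdentifier: prefix label + hash of the OpenAIRE id (hash algorithm is a guess).
	public static String getIdentifier(Prefixes prefix, String openaireId) {
		return prefix.label + DigestUtils.md5Hex(openaireId);
	}

	public static void main(String[] args) {
		// hypothetical OpenAIRE project id; prints something like "grant_______::<32-char hex>"
		System.out.println(getIdentifier(Prefixes.GRANT, "40|corda__h2020::0000"));
	}
}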


@ -8,15 +8,10 @@ import java.io.Serializable;
* @Date 05/09/23
*/
public enum RelationType implements Serializable {
RESULT_OUTCOME_FUNDING("isProducedBy"),
RESULT_AFFILIATIED_TO_ORGANIZATION("hasAuthorInstitution"),
ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"),
SUPPLEMENT("IsSupplementedBy"),
DOCUMENTS(
"IsDocumentedBy"),
PART("IsPartOf"),
VERSION("IsNewVersionOf"),
CITATION("Cites");
RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
"hasAuthorInstitution"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"), SUPPLEMENT(
"IsSupplementedBy"), DOCUMENTS(
"IsDocumentedBy"), PART("IsPartOf"), VERSION("IsNewVersionOf"), CITATION("Cites");
public final String label;


@ -16,7 +16,7 @@ public class ResearchProduct implements Serializable {
private String local_identifier;
private List<Identifier> identifiers;
private Map<String, List<String>> titles;
private Map<String,List<String>> abstracts;
private Map<String, List<String>> abstracts;
@JsonProperty("product_type")
private String product_type;
private List<ResultTopic> topics;


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
@ -8,95 +9,95 @@ import java.util.List;
* @Date 27/02/24
*/
public class Venue implements Serializable {
private String local_identifier;
private List<Identifier> identifiers;
private String name;
private String acronym;
private String type;
private String publisher;
private String series;
private Boolean is_currently_full_oa;
private String local_identifier;
private List<Identifier> identifiers;
private String name;
private String acronym;
private String type;
private String publisher;
private String series;
private Boolean is_currently_full_oa;
private String creation_date;
private List<VenueContribution> contributions;
private String creation_date;
private List<VenueContribution> contributions;
public String getLocal_identifier() {
return local_identifier;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public void setName(String name) {
this.name = name;
}
public String getAcronym() {
return acronym;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getType() {
return type;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public void setType(String type) {
this.type = type;
}
public String getPublisher() {
return publisher;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getSeries() {
return series;
}
public String getSeries() {
return series;
}
public void setSeries(String series) {
this.series = series;
}
public void setSeries(String series) {
this.series = series;
}
public Boolean getIs_currently_full_oa() {
return is_currently_full_oa;
}
public Boolean getIs_currently_full_oa() {
return is_currently_full_oa;
}
public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
this.is_currently_full_oa = is_currently_full_oa;
}
public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
this.is_currently_full_oa = is_currently_full_oa;
}
public String getCreation_date() {
return creation_date;
}
public String getCreation_date() {
return creation_date;
}
public void setCreation_date(String creation_date) {
this.creation_date = creation_date;
}
public void setCreation_date(String creation_date) {
this.creation_date = creation_date;
}
public List<VenueContribution> getContributions() {
return contributions;
}
public List<VenueContribution> getContributions() {
return contributions;
}
public void setContributions(List<VenueContribution> contributions) {
this.contributions = contributions;
}
public void setContributions(List<VenueContribution> contributions) {
this.contributions = contributions;
}
}


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
@ -8,23 +9,23 @@ import java.util.List;
* @Date 27/02/24
*/
public class VenueContribution implements Serializable {
private String person;
private List<String> roles;
private String person;
private List<String> roles;
public String getPerson() {
public String getPerson() {
return person;
}
return person;
}
public void setPerson(String person) {
this.person = person;
}
public void setPerson(String person) {
this.person = person;
}
public List<String> getRoles() {
return roles;
}
public List<String> getRoles() {
return roles;
}
public void setRoles(List<String> roles) {
this.roles = roles;
}
public void setRoles(List<String> roles) {
this.roles = roles;
}
}


@ -1,22 +1,16 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
public enum VenueIdentifierType implements Serializable {
EISSN("eissn"),
ISSN("issn"),
LISSN("lissn"),
ISBN("isbn"),
OPENDOAR(
"opendoar"),
R3DATA("re3data.org"),
FAIRSHARING("fairsharing");
EISSN("eissn"), ISSN("issn"), LISSN("lissn"), ISBN("isbn"), OPENDOAR(
"opendoar"), R3DATA("re3data.org"), FAIRSHARING("fairsharing");
public final String label;
public final String label;
private VenueIdentifierType(String label) {
this.label = label;
}
private VenueIdentifierType(String label) {
this.label = label;
}
}


@ -1,21 +1,16 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
public enum VenueType implements Serializable {
REPOSITORY("repository"),
JOURNAL("journal"),
CONFERENCE("conference"),
BOOK("book"),
OTHER(
"other"),
UNKNOWN("unknown");
REPOSITORY("repository"), JOURNAL("journal"), CONFERENCE("conference"), BOOK("book"), OTHER(
"other"), UNKNOWN("unknown");
public final String label;
public final String label;
private VenueType(String label) {
this.label = label;
}
private VenueType(String label) {
this.label = label;
}
}


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.exceptions;
public class CardinalityTooHighException extends Exception {


@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.exceptions;
public class NoAvailableEntityTypeException extends Exception {


@ -1,9 +1,12 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -15,133 +18,156 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class DumpDatasource implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);
private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpDatasource.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpDatasource.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Datasources");
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Datasources");
mapDatasource(spark, inputPath, outputPath);
});
}
mapDatasource(spark, inputPath, outputPath);
});
}
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath) {
Utils.readPath(spark, inputPath + "datasource", Datasource.class)
.filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible() && ! d.getDataInfo().getDeletedbyinference())
.map((MapFunction<Datasource, eu.dnetlib.dhp.skgif.model.Datasource>) d -> {
eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
datasource.setIdentifiers(d.getPid()
.stream()
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()));
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
&& !d.getDataInfo().getDeletedbyinference())
.map((MapFunction<Datasource, eu.dnetlib.dhp.skgif.model.Datasource>) d -> {
eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
datasource
.setIdentifiers(
d
.getPid()
.stream()
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()));
datasource.setName(d.getOfficialname().getValue());
datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
datasource.setJurisdiction(Optional.ofNullable(d.getJurisdiction())
.map(v -> v.getClassid()).
orElse(new String()));
datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
datasource.setVersion_control(d.getVersioncontrol());
datasource.setName(d.getOfficialname().getValue());
datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
datasource
.setJurisdiction(
Optional
.ofNullable(d.getJurisdiction())
.map(v -> v.getClassid())
.orElse(new String()));
datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
datasource.setVersion_control(d.getVersioncontrol());
datasource.setData_source_classification(Optional.ofNullable(d.getEoscdatasourcetype())
.map(v -> v.getClassname()).
orElse(new String()));
datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
datasource.setThematic(d.getThematic());
datasource.setResearch_product_access_policy(Optional.ofNullable(d.getDatabaseaccesstype())
.map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
.orElse(new ArrayList<>()));
datasource.setResearch_product_metadata_access_policy(Optional.ofNullable(d.getResearchproductmetadataaccesspolicies())
.map(v->getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
.orElse(new ArrayList<>()));
return datasource;
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class) )
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath + "Datasource");
}
datasource
.setData_source_classification(
Optional
.ofNullable(d.getEoscdatasourcetype())
.map(v -> v.getClassname())
.orElse(new String()));
datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
datasource.setThematic(d.getThematic());
datasource
.setResearch_product_access_policy(
Optional
.ofNullable(d.getDatabaseaccesstype())
.map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
.orElse(new ArrayList<>()));
datasource
.setResearch_product_metadata_access_policy(
Optional
.ofNullable(d.getResearchproductmetadataaccesspolicies())
.map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
.orElse(new ArrayList<>()));
return datasource;
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "Datasource");
}
private static List<String> getResearchProductAccessPolicy(List<String> value) {
private static List<String> getResearchProductAccessPolicy(List<String> value) {
return value.stream().map(v -> getResearchProductAccessPolicy(v)).filter(Objects::nonNull)
.map(v -> v.get(0)).distinct().collect(Collectors.toList());
}
private static List<String> getResearchProductAccessPolicy(String value) {
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
//if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
//if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
switch(value){
case "open"://(https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
return Arrays.asList("open access");
case "restricted"://(https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
return Arrays.asList("restricted access");
case "closed"://(https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
return Arrays.asList("metadata only access");
default:
return null;
}
}
return value
.stream()
.map(v -> getResearchProductAccessPolicy(v))
.filter(Objects::nonNull)
.map(v -> v.get(0))
.distinct()
.collect(Collectors.toList());
}
private static List<String> getEoscProductType(List<String> researchentitytypes) {
private static List<String> getResearchProductAccessPolicy(String value) {
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
switch (value) {
case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
return Arrays.asList("open access");
case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
return Arrays.asList("restricted access");
case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
return Arrays.asList("metadata only access");
default:
return null;
}
}
List<String> eoscProductType = new ArrayList<>();
if(researchentitytypes != null) {
private static List<String> getEoscProductType(List<String> researchentitytypes) {
if (researchentitytypes.contains("Software"))
eoscProductType.add("Research Software");
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
eoscProductType.add("Research Literature");
if (researchentitytypes.contains("Research Data"))
eoscProductType.add("Research Data");
if (researchentitytypes.contains("Organization") ||
researchentitytypes.contains("Organizations") ||
researchentitytypes.contains("Services") ||
researchentitytypes.contains("Projects"))
eoscProductType.add("Other research product");
}
return eoscProductType;
}
List<String> eoscProductType = new ArrayList<>();
if (researchentitytypes != null) {
if (researchentitytypes.contains("Software"))
eoscProductType.add("Research Software");
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
eoscProductType.add("Research Literature");
if (researchentitytypes.contains("Research Data"))
eoscProductType.add("Research Data");
if (researchentitytypes.contains("Organization") ||
researchentitytypes.contains("Organizations") ||
researchentitytypes.contains("Services") ||
researchentitytypes.contains("Projects"))
eoscProductType.add("Other research product");
}
return eoscProductType;
}
}


@ -1,13 +1,16 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.skgif.model.Grant;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.RelationType;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericData;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -22,141 +25,185 @@ import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.skgif.model.Grant;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.RelationType;
import scala.Tuple2;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class DumpGrant implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);
private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpGrant.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json"));
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpGrant.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Grant");
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Grant");
mapGrants(spark, inputPath, outputPath);
});
}
mapGrants(spark, inputPath, outputPath);
});
}
private static void mapGrants(SparkSession spark, String inputPath, String outputPath) {
Dataset<Project> projects = Utils.readPath(spark, inputPath + "project", Project.class)
.filter((FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible());
Dataset<Relation> relations = Utils.readPath(spark, inputPath + "relation", Relation.class)
.filter((FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible() &&
r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label));
projects.joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left")
.groupByKey((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._1().getId(), Encoders.STRING() )
.mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, Grant>) (k,v) ->{
Grant g = new Grant();
Tuple2<Project, Relation> first = v.next();
g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k));
g.setIdentifiers(getProjectIdentifier(first._1()));
g.setTitle(first._1().getTitle().getValue());
g.setSummary(Optional.ofNullable(first._1().getSummary())
.map(value->value.getValue()).orElse(new String()));
g.setAcronym(Optional.ofNullable(first._1().getAcronym())
.map(value->value.getValue()).orElse(new String()));
g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue()));
// * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue()));
g.setCurrency(Optional.ofNullable(first._1().getCurrency())
.map(value -> value.getValue()).orElse(new String()));
g.setFunded_amount(Optional.ofNullable(first._1().getFundedamount())
.orElse(null));
g.setKeywords(first._1().getSubjects()
.stream().map(s -> s.getValue()).collect(Collectors.toList()));
g.setStart_date(Optional.ofNullable(first._1().getStartdate())
.map(value -> value.getValue()).orElse(new String()));
g.setEnd_date(Optional.ofNullable(first._1().getEnddate())
.map(value -> value.getValue()).orElse(new String()));
g.setWebsite(Optional.ofNullable(first._1().getWebsiteurl())
.map(value -> value.getValue()).orElse(new String()));
if(Optional.ofNullable(first._2()).isPresent()) {
List<String> relevantOrganizatios = new ArrayList<>();
relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource()));
v.forEachRemaining(t2 -> relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource())));
g.setBeneficiaries(relevantOrganizatios);
}
return g;
} , Encoders.bean(Grant.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath + "Grant");
}
private static void mapGrants(SparkSession spark, String inputPath, String outputPath) {
Dataset<Project> projects = Utils
.readPath(spark, inputPath + "project", Project.class)
.filter(
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible());
Dataset<Relation> relations = Utils
.readPath(spark, inputPath + "relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible() &&
r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label));
projects
.joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left")
.groupByKey((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._1().getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, Grant>) (k, v) -> {
Grant g = new Grant();
Tuple2<Project, Relation> first = v.next();
g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k));
g.setIdentifiers(getProjectIdentifier(first._1()));
g.setTitle(first._1().getTitle().getValue());
g
.setSummary(
Optional
.ofNullable(first._1().getSummary())
.map(value -> value.getValue())
.orElse(new String()));
g
.setAcronym(
Optional
.ofNullable(first._1().getAcronym())
.map(value -> value.getValue())
.orElse(new String()));
g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue()));
// * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue()));
g
.setCurrency(
Optional
.ofNullable(first._1().getCurrency())
.map(value -> value.getValue())
.orElse(new String()));
g
.setFunded_amount(
Optional
.ofNullable(first._1().getFundedamount())
.orElse(null));
g
.setKeywords(
first
._1()
.getSubjects()
.stream()
.map(s -> s.getValue())
.collect(Collectors.toList()));
g
.setStart_date(
Optional
.ofNullable(first._1().getStartdate())
.map(value -> value.getValue())
.orElse(new String()));
g
.setEnd_date(
Optional
.ofNullable(first._1().getEnddate())
.map(value -> value.getValue())
.orElse(new String()));
g
.setWebsite(
Optional
.ofNullable(first._1().getWebsiteurl())
.map(value -> value.getValue())
.orElse(new String()));
if (Optional.ofNullable(first._2()).isPresent()) {
List<String> relevantOrganizatios = new ArrayList<>();
relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource()));
v
.forEachRemaining(
t2 -> relevantOrganizatios
.add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource())));
g.setBeneficiaries(relevantOrganizatios);
}
return g;
}, Encoders.bean(Grant.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "Grant");
}
private static String getFundingStream(String fundingtree) throws DocumentException {
final Document doc;
private static String getFundingStream(String fundingtree) throws DocumentException {
final Document doc;
doc = new SAXReader().read(new StringReader(fundingtree));
if(Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
doc.selectNodes("//funding_level_0").size() > 0)
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
return new String();
doc = new SAXReader().read(new StringReader(fundingtree));
if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
doc.selectNodes("//funding_level_0").size() > 0)
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
return new String();
}
}
private static String getFunderName(String fundingtree) throws DocumentException {
final Document doc;
private static String getFunderName(String fundingtree) throws DocumentException {
final Document doc;
doc = new SAXReader().read(new StringReader(fundingtree));
// f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
//f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
doc = new SAXReader().read(new StringReader(fundingtree));
// f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
// f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
}
}
private static List<Identifier> getProjectIdentifier(Project project) {
if (project.getPid().size() > 0 )
return project.getPid().stream().map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())).collect(Collectors.toList());
return new ArrayList<>();
// private List<Identifier> identifiers;//.schema pid.qualifier.classid identifiers.value pid.value
//identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
//identifiers.value project.code
private static List<Identifier> getProjectIdentifier(Project project) throws DocumentException {
List<Identifier> identifiers = new ArrayList<>();
if (project.getPid().size() > 0)
project
.getPid()
.stream()
.forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())));
identifiers
.add(
Identifier
.newInstance(
getFunderName(project.getFundingtree().get(0).getValue()), project.getCode().getValue()));
return identifiers;
}
}
}
}
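getFunderName and getFundingStream above evaluate XPath expressions against the project's fundingtree XML. The fragment below is hypothetical (values invented, and the real tree may nest further elements under funding_level_0); only the //funder/name and //funding_level_0 paths and the dom4j calls are taken from the code above.

import java.io.StringReader;

import org.dom4j.Document;
import org.dom4j.io.SAXReader;

public class FundingTreeSketch {
	public static void main(String[] args) throws Exception {
		// Hypothetical fundingtree fragment, flattened for the sketch.
		String fundingtree = "<fundingtree>"
			+ "<funder><shortname>EC</shortname><name>European Commission</name></funder>"
			+ "<funding_level_0>H2020</funding_level_0>"
			+ "</fundingtree>";
		Document doc = new SAXReader().read(new StringReader(fundingtree));
		// mirrors getFunderName: //funder/name -> "European Commission"
		System.out.println(((org.dom4j.Node) doc.selectNodes("//funder/name").get(0)).getText());
		// mirrors getFundingStream: //funding_level_0 -> "H2020" for this flattened example
		System.out.println(((org.dom4j.Node) doc.selectNodes("//funding_level_0").get(0)).getText());
	}
}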


@ -1,10 +1,12 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -16,95 +18,117 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
import eu.dnetlib.dhp.skgif.model.Prefixes;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class DumpOrganization implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class);
private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpOrganization.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_organization_parameters.json"));
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpOrganization.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_organization_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Organization");
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Organization");
mapOrganization(spark, inputPath, outputPath);
});
}
mapOrganization(spark, inputPath, outputPath);
});
}
private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
organizations.filter((FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() && !o.getDataInfo().getInvisible())
.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
organization.setCountry(Optional.ofNullable(o.getCountry().getClassid())
.orElse(new String()));
organization.setName(Optional.ofNullable(o.getLegalname().getValue())
.orElse(new String()));
organization.setShort_name(Optional.ofNullable(o.getLegalshortname())
.map(v-> v.getValue())
.orElse(new String()));
organization.setIdentifiers(o.getPid()
.stream()
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()));
organization.setOther_names(o.getAlternativeNames().stream()
.map(a -> a.getValue())
.collect(Collectors.toList()));
organization.setType(getOrganizationType(o));
return organization;
}
, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath + "Organization");
}
private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
organizations
.filter(
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
&& !o.getDataInfo().getInvisible())
.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
organization
.setCountry(
Optional
.ofNullable(o.getCountry().getClassid())
.orElse(new String()));
organization
.setName(
Optional
.ofNullable(o.getLegalname().getValue())
.orElse(new String()));
organization
.setShort_name(
Optional
.ofNullable(o.getLegalshortname())
.map(v -> v.getValue())
.orElse(new String()));
organization
.setIdentifiers(
o
.getPid()
.stream()
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()));
organization
.setOther_names(
o
.getAlternativeNames()
.stream()
.map(a -> a.getValue())
.collect(Collectors.toList()));
organization.setType(getOrganizationType(o));
return organization;
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "Organization");
}
private static String getOrganizationType(Organization o) {
if(Optional.ofNullable(o.getEcenterprise()).isPresent() && o.getEcenterprise().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.COMPANY.label;
if(Optional.ofNullable(o.getEchighereducation()).isPresent() && o.getEchighereducation().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.EDUCATION.label;
if(Optional.ofNullable(o.getEcresearchorganization()).isPresent() && o.getEcresearchorganization().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.EDUCATION.label;
if(Optional.ofNullable(o.getEcnonprofit()).isPresent() && o.getEcnonprofit().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.NONPROFIT.label;
private static String getOrganizationType(Organization o) {
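// maps the EC organization flags to SKG-IF organization types; the first flag set to "true" determines the type, defaulting to OTHER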
if (Optional.ofNullable(o.getEcenterprise()).isPresent()
&& o.getEcenterprise().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.COMPANY.label;
if (Optional.ofNullable(o.getEchighereducation()).isPresent()
&& o.getEchighereducation().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.EDUCATION.label;
if (Optional.ofNullable(o.getEcresearchorganization()).isPresent()
&& o.getEcresearchorganization().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.EDUCATION.label;
if (Optional.ofNullable(o.getEcnonprofit()).isPresent()
&& o.getEcnonprofit().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.NONPROFIT.label;
return OrganizationTypes.OTHER.label;
return OrganizationTypes.OTHER.label;
}
}
}

View File

@ -6,11 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -22,9 +17,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import eu.dnetlib.dhp.utils.DHPUtils;
@ -106,15 +105,20 @@ public class DumpResult implements Serializable {
Dataset<Datasource> datasource = Utils
.readPath(spark, inputPath + "/datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEosctype()).isPresent() &&
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEoscdatasourcetype()).isPresent() &&
d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
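// journal-archive datasources are used to resolve the venue and the bibliographic info of each manifestation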
Dataset<EmitPerManifestation> man = Utils
.readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);
Dataset<PartialResearchProduct> partialResearchProduct = man.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
.groupByKey((MapFunction<Tuple2<EmitPerManifestation, Datasource>, String>) t2 -> t2._1().getResultId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<EmitPerManifestation, Datasource>, PartialResearchProduct>) (k, v) -> {
Dataset<PartialResearchProduct> partialResearchProduct = man
.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
.groupByKey(
(MapFunction<Tuple2<EmitPerManifestation, Datasource>, String>) t2 -> t2._1().getResultId(),
Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<EmitPerManifestation, Datasource>, PartialResearchProduct>) (
k, v) -> {
PartialResearchProduct prp = new PartialResearchProduct();
prp.setResultId(k);
List<Manifestation> manifestationList = new ArrayList<>();
@ -124,10 +128,13 @@ public class DumpResult implements Serializable {
return prp;
}, Encoders.bean(PartialResearchProduct.class));
partialResearchProduct
.joinWith(aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")), "left")
.map((MapFunction<Tuple2<PartialResearchProduct, RelationPerProduct>, PartialResearchProduct>) t2 -> {
.joinWith(
aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")),
"left")
.map(
(MapFunction<Tuple2<PartialResearchProduct, RelationPerProduct>, PartialResearchProduct>) t2 -> {
PartialResearchProduct prp = t2._1();
if(Optional.ofNullable(t2._2()).isPresent()){
if (Optional.ofNullable(t2._2()).isPresent()) {
prp.setRelated_products(t2._2().getRelatedProduct());
prp.setRelevant_organizations(t2._2().getOrganizations());
prp.setFunding(t2._2().getFunding());
@ -144,148 +151,83 @@ public class DumpResult implements Serializable {
private static Manifestation getManifestation(Tuple2<EmitPerManifestation, Datasource> t2) {
// if the left side of the join is present, the bibliographic info and the venue are available
// otherwise only the remaining values are set
EmitPerManifestation epm = t2._1();
Manifestation manifestation = new Manifestation();
manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
if(Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
manifestation
.setDates(
Arrays
.asList(
Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
switch (epm.getInstance().getRefereed().getClassid()) {
case "0000":
manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
break;
case "0001":
manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
break;
case "0002":
manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
break;
}
// if the left side of the join is present, the bibliographic info and the venue are available
// otherwise only the remaining values are set
EmitPerManifestation epm = t2._1();
Manifestation manifestation = new Manifestation();
manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
if (Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
manifestation
.setDates(
Arrays
.asList(
Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
switch (epm.getInstance().getRefereed().getClassid()) {
case "0000":
manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
break;
case "0001":
manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
break;
case "0002":
manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
break;
}
manifestation.setMetadata_curation("unavailable");
if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
switch (epm.getInstance().getAccessright().getClassid()) {
case "OPEN":
case "OPEN DATA":
case "OPEN SOURCE":
manifestation.setAccess_right(AccessRight.OPEN.label);
break;
case "CLOSED":
manifestation.setAccess_right(AccessRight.CLOSED.label);
break;
case "RESTRICTED":
manifestation.setAccess_right(AccessRight.RESTRICTED.label);
break;
case "EMBARGO":
case "12MONTHS":
case "6MONTHS":
manifestation.setAccess_right(AccessRight.EMBARGO.label);
break;
default:
manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
manifestation.setMetadata_curation("unavailable");
if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
switch (epm.getInstance().getAccessright().getClassid()) {
case "OPEN":
case "OPEN DATA":
case "OPEN SOURCE":
manifestation.setAccess_right(AccessRight.OPEN.label);
break;
case "CLOSED":
manifestation.setAccess_right(AccessRight.CLOSED.label);
break;
case "RESTRICTED":
manifestation.setAccess_right(AccessRight.RESTRICTED.label);
break;
case "EMBARGO":
case "12MONTHS":
case "6MONTHS":
manifestation.setAccess_right(AccessRight.EMBARGO.label);
break;
default:
manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
}
manifestation.setLicence(Optional.ofNullable(epm.getInstance().getLicense())
.map(value -> value.getValue())
.orElse(null));
manifestation.setUrl(Optional.ofNullable(epm.getInstance().getUrl())
.map(value -> value.get(0))
.orElse(null));
}
manifestation
.setLicence(
Optional
.ofNullable(epm.getInstance().getLicense())
.map(value -> value.getValue())
.orElse(null));
manifestation
.setUrl(
Optional
.ofNullable(epm.getInstance().getUrl())
.map(value -> value.get(0))
.orElse(null));
if (Optional.ofNullable(epm.getInstance().getPid()).isPresent()) {
manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
}
if (Optional.ofNullable(t2._2()).isPresent()) {
manifestation.setBiblio(getBiblio(epm));
if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent())
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnPrinted()));
else if(Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent())
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline()));
}
manifestation
.setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE,epm.getInstance().getHostedby().getKey()));
if (Optional.ofNullable(epm.getInstance().getPid()).isPresent() && epm.getInstance().getPid().size() > 0) {
manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
}
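// when the join matched a journal-archive datasource, add the bibliographic record and resolve the venue identifier,
// preferring the printed ISSN over the online one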
if (Optional.ofNullable(t2._2()).isPresent()) {
manifestation.setBiblio(getBiblio(epm));
if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent())
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnPrinted()));
else if (Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent())
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline()));
}
manifestation
.setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()));
return manifestation;
return manifestation;
}
// private static List<Manifestation> getManifestationList(Dataset<EmitPerManifestation> emitformanifestation,
// Dataset<Datasource> datasource) {
// return emitformanifestation
// .joinWith(
// datasource, emitformanifestation
// .col("hostedBy")
// .equalTo(datasource.col("id")),
// "left")
// .map((MapFunction<Tuple2<EmitPerManifestation, Datasource>, Manifestation>) t2 -> {
// // if the left side of the join is present, the bibliographic info and the venue are available
// // otherwise only the remaining values are set
// EmitPerManifestation epm = t2._1();
// Manifestation manifestation = new Manifestation();
// manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getClassname());
// manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
// manifestation
// .setDates(
// Arrays
// .asList(
// Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
// if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
// switch (epm.getInstance().getRefereed().getClassid()) {
// case "0000":
// manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
// break;
// case "0001":
// manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
// break;
// case "0002":
// manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
// break;
// }
//
// manifestation.setMetadata_curation("unavailable");
// if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
// switch (epm.getInstance().getAccessright().getClassid()) {
// case "OPEN":
// case "OPEN DATA":
// case "OPEN SOURCE":
// manifestation.setAccess_right(AccessRight.OPEN.label);
// break;
// case "CLOSED":
// manifestation.setAccess_right(AccessRight.CLOSED.label);
// break;
// case "RESTRICTED":
// manifestation.setAccess_right(AccessRight.RESTRICTED.label);
// break;
// case "EMBARGO":
// case "12MONTHS":
// case "6MONTHS":
// manifestation.setAccess_right(AccessRight.EMBARGO.label);
// break;
// default:
// manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
//
// }
// manifestation.setLicence(epm.getInstance().getLicense().getValue());
// manifestation.setUrl(epm.getInstance().getUrl().get(0));
// if (Optional.ofNullable(epm.getInstance().getPid()).isPresent()) {
// manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
// }
// if (Optional.ofNullable(t2._2()).isPresent())
// manifestation.setBiblio(getBiblio(epm));
// manifestation.setVenue("venue_______::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
// manifestation
// .setHosting_datasource("datasource__::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
// return manifestation;
// }, Encoders.bean(Manifestation.class))
// .collectAsList();
// }
private static Biblio getBiblio(EmitPerManifestation epm) {
Biblio biblio = new Biblio();
biblio.setEdition(epm.getJournal().getEdition());
@ -298,7 +240,7 @@ public class DumpResult implements Serializable {
}
private static <R extends Result> void dumpResult(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
String outputPath) {
ModelSupport.entityTypes
.keySet()
.parallelStream()
@ -314,14 +256,14 @@ public class DumpResult implements Serializable {
.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
.map((MapFunction<Tuple2<R, PartialResearchProduct>, ResearchProduct>) t2 -> {
ResearchProduct rp = ResultMapper.map(t2._1());
if(Optional.ofNullable(t2._2()).isPresent()) {
if(Optional.ofNullable(t2._2().getRelated_products()).isPresent())
if (Optional.ofNullable(t2._2()).isPresent()) {
if (Optional.ofNullable(t2._2().getRelated_products()).isPresent())
rp.setRelated_products(t2._2().getRelated_products());
if(Optional.ofNullable(t2._2().getFunding()).isPresent())
if (Optional.ofNullable(t2._2().getFunding()).isPresent())
rp.setFunding(t2._2().getFunding());
if(Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent())
if (Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent())
rp.setRelevant_organizations(t2._2().getRelevant_organizations());
if(Optional.ofNullable(t2._2().getManifestations()).isPresent())
if (Optional.ofNullable(t2._2().getManifestations()).isPresent())
rp.setManifestations(t2._2().getManifestations());
}
return rp;
@ -333,30 +275,37 @@ public class DumpResult implements Serializable {
});
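// merge the research products dumped for every result entity type into a single output dataset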
Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
for(EntityType e : ModelSupport.entityTypes.keySet()) {
if(ModelSupport.isResult(e))
researchProducts = researchProducts.union(Utils.readPath(spark,workingDir + e.name() + "/researchproduct", ResearchProduct.class));
}
for (EntityType e : ModelSupport.entityTypes.keySet()) {
if (ModelSupport.isResult(e))
researchProducts = researchProducts
.union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class));
}
researchProducts
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath + "ResearchProduct");
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "ResearchProduct");
}
private static void selectRelations(SparkSession spark, String inputPath, String workingDir) {
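// keep only the relation types needed by the SKG-IF dump (affiliation, funding, supplement, documents, part, version, citation),
// discarding relations that are deleted by inference or invisible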
Dataset<Relation> relation = Utils.readPath(spark,
inputPath + "relation", Relation.class)
.filter((FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible())
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label)||
r.getRelClass().equalsIgnoreCase(RelationType.PART.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label));
Dataset<Relation> relation = Utils
.readPath(
spark,
inputPath + "relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible())
.filter(
(FilterFunction<Relation>) r -> r
.getRelClass()
.equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.PART.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label));
relation
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
@ -373,12 +322,14 @@ public class DumpResult implements Serializable {
rpp.getOrganizations().add(Utils.getIdentifier(Prefixes.ORGANIZATION, target));
break;
case "isproducedby":
rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT ,target));
rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT, target));
break;
default:
if (!remainignRelations.keySet().contains(relClass))
remainignRelations.put(relClass, new ArrayList<>());
remainignRelations.get(relClass).add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target));
remainignRelations
.get(relClass)
.add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target));
}
}
for (String key : remainignRelations.keySet())

View File

@ -1,156 +1,179 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.skgif.model.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class DumpVenue implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpVenue.class);
private static final Logger log = LoggerFactory.getLogger(DumpVenue.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpVenue.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpVenue.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Venue");
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "Venue");
mapDatasource(spark, inputPath, outputPath, workingDir);
});
}
mapVenue(spark, inputPath, outputPath, workingDir);
});
}
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath, String workingDir) {
Utils.readPath(spark, inputPath + "datasource", Datasource.class)
.filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible() && ! d.getDataInfo().getDeletedbyinference()
&& d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"))
.map((MapFunction<Datasource, eu.dnetlib.dhp.skgif.model.Venue>) d -> {
Venue venue = new Venue();
if(Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent())
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted()));
else if(Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent())
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline()));
venue.setIdentifiers(getVenueIdentifier(d.getJournal()));
venue.setName(d.getOfficialname().getValue());
venue.setType(VenueType.JOURNAL.label);
//todo add map for publisher. Get from results?
venue.setPublisher("find it from result");
venue.setAcronym(null);
venue.setSeries(null);
venue.setIs_currently_full_oa(null);
venue.setCreation_date(null);
venue.setContributions(null);
return venue;
}, Encoders.bean(Venue.class) )
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(workingDir + "Venues");
private static void mapVenue(SparkSession spark, String inputPath, String outputPath, String workingDir) {
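// venues are built from journal-archive datasources; the publisher is taken from the manifestations
// grouped per hosting datasource (datasourcePublisher) produced by EmitFromResults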
Dataset<EmitPerManifestation> manifestationDataset = Utils
.readPath(spark, workingDir + "datasourcePublisher", EmitPerManifestation.class);
Dataset<Datasource> datasourceDataset = Utils
.readPath(spark, inputPath + "datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
&& !d.getDataInfo().getDeletedbyinference()
&& d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
datasourceDataset
.joinWith(
manifestationDataset, datasourceDataset.col("id").equalTo(manifestationDataset.col("hostedby.key")),
"left")
.map((MapFunction<Tuple2<Datasource, EmitPerManifestation>, Venue>) t2 -> {
Venue venue = new Venue();
Datasource d = t2._1();
if (Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent())
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted()));
else if (Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent())
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline()));
venue.setIdentifiers(getVenueIdentifier(d.getJournal()));
venue.setName(d.getOfficialname().getValue());
venue.setType(VenueType.JOURNAL.label);
if (Optional.ofNullable(t2._2()).isPresent())
venue.setPublisher(t2._2().getPublisher());
venue.setAcronym(null);
venue.setSeries(null);
venue.setIs_currently_full_oa(null);
venue.setCreation_date(null);
venue.setContributions(null);
return venue;
}, Encoders.bean(Venue.class))
Utils.readPath(spark, workingDir + "Venues", Venue.class)
.groupByKey((MapFunction<Venue, String>)v -> v.getLocal_identifier() , Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Venue, Venue>) (k,v) -> v.next(), Encoders.bean(Venue.class) )
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath + "Venues");
}
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "Venues");
private static List<Identifier> getVenueIdentifier(Journal journal) {
List<Identifier> identifiers = new ArrayList<>();
if (Optional.ofNullable((journal.getIssnOnline())).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline()));
if(Optional.ofNullable(journal.getIssnPrinted()).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted()));
if (Optional.ofNullable(journal.getIssnLinking()).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking()));
return identifiers;
}
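// deduplicate venues on their local identifier before writing the final dump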
Utils
.readPath(spark, workingDir + "Venues", Venue.class)
.groupByKey((MapFunction<Venue, String>) v -> v.getLocal_identifier(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Venue, Venue>) (k, v) -> v.next(), Encoders.bean(Venue.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "Venues");
}
private static List<String> getResearchProductAccessPolicy(List<String> value) {
private static List<Identifier> getVenueIdentifier(Journal journal) {
List<Identifier> identifiers = new ArrayList<>();
if (Optional.ofNullable((journal.getIssnOnline())).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline()));
if (Optional.ofNullable(journal.getIssnPrinted()).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted()));
if (Optional.ofNullable(journal.getIssnLinking()).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking()));
return identifiers;
}
return value.stream().map(v -> getResearchProductAccessPolicy(v)).filter(Objects::nonNull)
.map(v -> v.get(0)).distinct().collect(Collectors.toList());
}
private static List<String> getResearchProductAccessPolicy(String value) {
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
//if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
//if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
switch(value){
case "open"://(https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
return Arrays.asList("open access");
case "restricted"://(https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
return Arrays.asList("restricted access");
case "closed"://(https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
return Arrays.asList("metadata only access");
default:
return null;
}
}
private static List<String> getResearchProductAccessPolicy(List<String> value) {
private static List<String> getEoscProductType(List<String> researchentitytypes) {
return value
.stream()
.map(v -> getResearchProductAccessPolicy(v))
.filter(Objects::nonNull)
.map(v -> v.get(0))
.distinct()
.collect(Collectors.toList());
}
List<String> eoscProductType = new ArrayList<>();
if(researchentitytypes != null) {
private static List<String> getResearchProductAccessPolicy(String value) {
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
switch (value) {
case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
return Arrays.asList("open access");
case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
return Arrays.asList("restricted access");
case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
return Arrays.asList("metadata only access");
default:
return null;
}
}
if (researchentitytypes.contains("Software"))
eoscProductType.add("Research Software");
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
eoscProductType.add("Research Literature");
if (researchentitytypes.contains("Research Data"))
eoscProductType.add("Research Data");
if (researchentitytypes.contains("Organization") ||
researchentitytypes.contains("Organizations") ||
researchentitytypes.contains("Services") ||
researchentitytypes.contains("Projects"))
eoscProductType.add("Other research product");
}
return eoscProductType;
}
private static List<String> getEoscProductType(List<String> researchentitytypes) {
List<String> eoscProductType = new ArrayList<>();
if (researchentitytypes != null) {
if (researchentitytypes.contains("Software"))
eoscProductType.add("Research Software");
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
eoscProductType.add("Research Literature");
if (researchentitytypes.contains("Research Data"))
eoscProductType.add("Research Data");
if (researchentitytypes.contains("Organization") ||
researchentitytypes.contains("Organizations") ||
researchentitytypes.contains("Services") ||
researchentitytypes.contains("Projects"))
eoscProductType.add("Other research product");
}
return eoscProductType;
}
}

View File

@ -7,8 +7,6 @@ import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -23,10 +21,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
@ -42,7 +41,7 @@ public class EmitFromResults implements Serializable {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
EmitFromResults.class
EmitFromResults.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/emit_biblio_parameters.json"));
@ -91,17 +90,21 @@ public class EmitFromResults implements Serializable {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
Utils
.readPath(spark, inputPath + e.name(), resultClazz)
.filter((FilterFunction<R>) r -> Optional.of(r.getSubject()).isPresent())
.filter((FilterFunction<R>) r -> Optional.ofNullable(r.getSubject()).isPresent())
.flatMap(
(FlatMapFunction<R, Topic>) r -> r
.getSubject()
.stream()
.filter(s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") || s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
.filter(
s -> s.getQualifier().getClassid().equalsIgnoreCase("fos")
|| s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
.map(s -> {
Topic t = new Topic();
t
.setLocal_identifier(
Utils.getIdentifier(Prefixes.TOPIC ,s.getQualifier().getClassid() + s.getValue()));
Utils
.getIdentifier(
Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
t
.setIdentifiers(
Arrays
@ -154,7 +157,8 @@ public class EmitFromResults implements Serializable {
p.setGiven_name(a.getName());
String identifier = new String();
if (Optional.ofNullable(a.getPid()).isPresent()) {
Tuple2<String, Boolean> orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils.getOrcid(a.getPid());
Tuple2<String, Boolean> orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils
.getOrcid(a.getPid());
if (orcid != null) {
identifier = Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2());
if (orcid._2())
@ -164,12 +168,15 @@ public class EmitFromResults implements Serializable {
else
p
.setIdentifiers(
Arrays.asList(Identifier.newInstance("inferred_orcid", orcid._1())));
Arrays
.asList(Identifier.newInstance("inferred_orcid", orcid._1())));
} else {
if (Optional.ofNullable(a.getRank()).isPresent()) {
identifier = Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,r.getId() + a.getRank());
identifier = Utils
.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + a.getRank());
} else {
identifier = Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,r.getId() + count);
identifier = Utils
.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + count);
}
}
@ -243,6 +250,32 @@ public class EmitFromResults implements Serializable {
}
});
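// union the manifestations emitted for every result type and keep one per hosting datasource,
// so that DumpVenue can later read the publisher from workingDir/datasourcePublisher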
Dataset<EmitPerManifestation> emitPerManifestationDataset = Utils
.readPath(
spark, workingDir + "software/manifestation", EmitPerManifestation.class)
.union(
Utils
.readPath(
spark, workingDir + "dataset/manifestation", EmitPerManifestation.class))
.union(
Utils
.readPath(
spark, workingDir + "publication/manifestation", EmitPerManifestation.class))
.union(
Utils
.readPath(
spark, workingDir + "otherresearchproduct/manifestation", EmitPerManifestation.class));
emitPerManifestationDataset
.groupByKey((MapFunction<EmitPerManifestation, String>) p -> p.getHostedBy(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, EmitPerManifestation, EmitPerManifestation>) (k, v) -> v.next(),
Encoders.bean(EmitPerManifestation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "/datasourcePublisher");
}
}

View File

@ -5,10 +5,9 @@ import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoAllowedTypeException;
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoTitleFoundException;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
@ -54,20 +53,21 @@ public class ResultMapper implements Serializable {
for (Author a : input.getAuthor()) {
count += 1;
Contribution contribution = new Contribution();
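// resolve the contributing person: use the ORCID-based identifier when available, otherwise mint a temporary
// person identifier from the result id and the author rank (or the author position when the rank is missing)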
Tuple2<String, Boolean> orcid = Utils.getOrcid(a.getPid());
if (orcid != null) {
contribution.setPerson(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2()));
Tuple2<String, Boolean> orcid = Utils.getOrcid(a.getPid());
if (orcid != null) {
contribution.setPerson(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2()));
} else {
if (Optional.ofNullable(a.getRank()).isPresent()) {
contribution
.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + a.getRank()));
} else {
if (Optional.ofNullable(a.getRank()).isPresent()) {
contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,input.getId() + a.getRank()));
} else {
contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON,input.getId() + count));
}
contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + count));
}
}
if(Optional.ofNullable(a.getRank()).isPresent()){
contribution.setRank(a.getRank());
}
}
if (Optional.ofNullable(a.getRank()).isPresent()) {
contribution.setRank(a.getRank());
}
contributionList.add(contribution);
}
@ -83,12 +83,15 @@ public class ResultMapper implements Serializable {
input
.getSubject()
.stream()
.filter(s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") ||
.filter(
s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") ||
s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
.map(s -> {
ResultTopic topic = new ResultTopic();
topic.setTopic(Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
if(Optional.ofNullable(s.getDataInfo()).isPresent()){
topic
.setTopic(
Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
if (Optional.ofNullable(s.getDataInfo()).isPresent()) {
Provenance provenance = new Provenance();
provenance.setTrust(Double.valueOf(s.getDataInfo().getTrust()));
provenance.setType(s.getDataInfo().getInferenceprovenance());
@ -101,7 +104,6 @@ public class ResultMapper implements Serializable {
}
}
private static <E extends Result> void mapType(ResearchProduct out, E input) throws NoAllowedTypeException {
switch (input.getResulttype().getClassid()) {
case "publication":
@ -148,7 +150,7 @@ public class ResultMapper implements Serializable {
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setTitles(Collections.singletonMap("none",Arrays.asList(iTitle.get(0).getValue())));
out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
return;
}
@ -158,7 +160,7 @@ public class ResultMapper implements Serializable {
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setTitles(Collections.singletonMap("none",Arrays.asList(iTitle.get(0).getValue())));
out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
}
}
@ -169,6 +171,6 @@ public class ResultMapper implements Serializable {
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setAbstracts(Collections.singletonMap("none",descriptionList));
out.setAbstracts(Collections.singletonMap("none", descriptionList));
}
}

View File

@ -5,16 +5,18 @@ import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
/**
@ -32,11 +34,11 @@ public class Utils implements Serializable {
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
@ -57,7 +59,7 @@ public class Utils implements Serializable {
return null;
}
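// builds a SKG-IF local identifier by prefixing the md5 of the given id with the entity prefix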
public static String getIdentifier(Prefixes entity, String id){
public static String getIdentifier(Prefixes entity, String id) {
return entity.label + DHPUtils.md5(id);
}

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,216 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="emit_from_result"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="emit_from_result">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Emit from results</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResults</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="dump_result"/>
<error to="Kill"/>
</action>
<action name="dump_result">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump result</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpResult</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_datasource"/>
<error to="Kill"/>
</action>
<action name="dump_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump datasource</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpDatasource</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_venue"/>
<error to="Kill"/>
</action>
<action name="dump_venue">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump venue</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpVenue</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_organization"/>
<error to="Kill"/>
</action>
<action name="dump_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump organization</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpOrganization</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_grant"/>
<error to="Kill"/>
</action>
<action name="dump_grant">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump grant</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpGrant</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -7,7 +7,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.jupiter.api.Assertions;
@ -17,10 +16,10 @@ import org.junit.jupiter.api.Test;
import com.google.gson.Gson;
import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest;
import eu.dnetlib.dhp.oa.zenodoapi.MissingConceptDoiException;
import eu.dnetlib.dhp.oa.zenodoapi.ZenodoAPIClient;
@Disabled
public class ZenodoUploadTest {
@ -162,8 +161,6 @@ public class ZenodoUploadTest {
}
@Test
void depositBigFile() throws MissingConceptDoiException, IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,

View File

@ -1,8 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.skgif.model.Datasource;
import eu.dnetlib.dhp.skgif.model.Organization;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -18,76 +21,76 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.skgif.model.Datasource;
import eu.dnetlib.dhp.skgif.model.Organization;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class DumpDatasourceTest implements Serializable {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static SparkSession spark;
private static Path workingDir;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpDatasourceTest.class);
private static final Logger log = LoggerFactory.getLogger(DumpDatasourceTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpDatasourceTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpDatasourceTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpDatasourceTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DumpDatasourceTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpDatasourceTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
spark = SparkSession
.builder()
.appName(DumpDatasourceTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpDatasource() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
@Test
public void testDumpDatasource() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
DumpDatasource
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
DumpDatasource.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
});
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Datasource> datasource = sc
.textFile(workingDir.toString() + "/Datasource")
.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
JavaRDD<Datasource> datasource = sc
.textFile(workingDir.toString() + "/Datasource")
.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
Dataset<Datasource> datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class));
Assertions.assertEquals(5,datasourceDataset.count());
datasourceDataset.show(false);
Dataset<Datasource> datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class));
Assertions.assertEquals(5, datasourceDataset.count());
datasourceDataset.show(false);
// Assertions.assertEquals(7, relationDataset.count());
// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());
@ -115,32 +118,33 @@ Assertions.assertEquals(5,datasourceDataset.count());
//
//
}
}
@Test
public void testDumpDatasourceComplete() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
.getPath();
@Test
public void testDumpDatasourceComplete() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
.getPath();
DumpDatasource
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
DumpDatasource.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
});
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Datasource> datasource = sc
.textFile(workingDir.toString() + "/Datasource")
.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
JavaRDD<Datasource> datasource = sc
.textFile(workingDir.toString() + "/Datasource")
.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
Dataset<Datasource> datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class));
Dataset<Datasource> datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class));
datasourceDataset.foreach((ForeachFunction<Datasource>) d -> System.out.println(OBJECT_MAPPER.writeValueAsString(d)));
datasourceDataset
.foreach((ForeachFunction<Datasource>) d -> System.out.println(OBJECT_MAPPER.writeValueAsString(d)));
// Assertions.assertEquals(7, relationDataset.count());
// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());
@ -168,5 +172,5 @@ Assertions.assertEquals(5,datasourceDataset.count());
//
//
}
}
}

View File

@ -1,8 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.skgif.model.Datasource;
import eu.dnetlib.dhp.skgif.model.Grant;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -17,76 +20,76 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.skgif.model.Datasource;
import eu.dnetlib.dhp.skgif.model.Grant;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class DumpGrantTest implements Serializable {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static SparkSession spark;
private static Path workingDir;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpGrantTest.class);
private static final Logger log = LoggerFactory.getLogger(DumpGrantTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpGrantTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpGrantTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpGrantTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DumpGrantTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpGrantTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
spark = SparkSession
.builder()
.appName(DumpGrantTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpGrant() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
@Test
public void testDumpGrant() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
DumpGrant
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
DumpGrant.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
});
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Grant> grant = sc
.textFile(workingDir.toString() + "/Grant")
.map(item -> OBJECT_MAPPER.readValue(item, Grant.class));
JavaRDD<Grant> grant = sc
.textFile(workingDir.toString() + "/Grant")
.map(item -> OBJECT_MAPPER.readValue(item, Grant.class));
Dataset<Grant> grantDataset = spark.createDataset(grant.rdd(), Encoders.bean(Grant.class));
Assertions.assertEquals(12,grantDataset.count());
grantDataset.show(false);
Dataset<Grant> grantDataset = spark.createDataset(grant.rdd(), Encoders.bean(Grant.class));
Assertions.assertEquals(12, grantDataset.count());
grantDataset.show(false);
// Assertions.assertEquals(7, relationDataset.count());
// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());
@ -114,5 +117,5 @@ Assertions.assertEquals(12,grantDataset.count());
//
//
}
}
}

View File

@ -1,10 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.skgif.model.Organization;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.ResearchProduct;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -20,77 +21,79 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.skgif.model.Organization;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.ResearchProduct;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class DumpOrganizationTest implements Serializable {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static SparkSession spark;
private static Path workingDir;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpOrganizationTest.class);
private static final Logger log = LoggerFactory.getLogger(DumpOrganizationTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpOrganizationTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpOrganizationTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpOrganizationTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DumpOrganizationTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpOrganizationTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
spark = SparkSession
.builder()
.appName(DumpOrganizationTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpOrganization() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
@Test
public void testDumpOrganization() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
DumpOrganization
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
DumpOrganization
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/"
});
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Organization> organization = sc
.textFile(workingDir.toString() + "/Organization")
.map(item -> OBJECT_MAPPER.readValue(item, Organization.class));
JavaRDD<Organization> organization = sc
.textFile(workingDir.toString() + "/Organization")
.map(item -> OBJECT_MAPPER.readValue(item, Organization.class));
Dataset<Organization> organizationDataset = spark.createDataset(organization.rdd(), Encoders.bean(Organization.class));
Assertions.assertEquals(34-19,organizationDataset.count());
organizationDataset.show(false);
Dataset<Organization> organizationDataset = spark
.createDataset(organization.rdd(), Encoders.bean(Organization.class));
Assertions.assertEquals(34 - 19, organizationDataset.count());
organizationDataset.show(false);
// Assertions.assertEquals(7, relationDataset.count());
// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());
@ -118,5 +121,5 @@ Assertions.assertEquals(34-19,organizationDataset.count());
//
//
}
}
}
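
Each of these test classes repeats the same local SparkSession setup and teardown in its @BeforeAll/@AfterAll methods. A minimal sketch of how that boilerplate could be shared, assuming a hypothetical base class; this is an illustration, not something the commit introduces.

package eu.dnetlib.dhp.oa.graph.dump.skgif;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;

/** Hypothetical shared harness: local-mode Spark with a throwaway working directory. */
public abstract class AbstractLocalSparkTest {

	protected static SparkSession spark;
	protected static Path workingDir;

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(AbstractLocalSparkTest.class.getSimpleName());

		SparkConf conf = new SparkConf();
		conf.setAppName(AbstractLocalSparkTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());

		spark = SparkSession.builder().config(conf).getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}
}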

View File

@ -1,10 +1,13 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import javax.validation.constraints.AssertTrue;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -21,261 +24,456 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.validation.constraints.AssertTrue;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* @author miriam.baglioni
* @Date 20/02/24
*/
public class DumpResultTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static SparkSession spark;
private static Path workingDir;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(DumpResultTest.class);
private static final Logger log = LoggerFactory.getLogger(DumpResultTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpResultTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpResultTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpResultTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DumpResultTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpResultTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
spark = SparkSession
.builder()
.appName(DumpResultTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testEmitFromResult() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
@Test
public void testEmitFromResult() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
.getPath();
final String workingDir = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/")
.getPath();
final String workingDir = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/")
.getPath();
DumpResult
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-workingDir", workingDir
DumpResult
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-workingDir", workingDir
});
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<RelationPerProduct> relation = sc
.textFile(workingDir + "/aggrelation")
.map(item -> OBJECT_MAPPER.readValue(item, RelationPerProduct.class));
JavaRDD<RelationPerProduct> relation = sc
.textFile(workingDir + "/aggrelation")
.map(item -> OBJECT_MAPPER.readValue(item, RelationPerProduct.class));
Dataset<RelationPerProduct> relationDataset = spark.createDataset(relation.rdd(), Encoders.bean(RelationPerProduct.class));
Dataset<RelationPerProduct> relationDataset = spark
.createDataset(relation.rdd(), Encoders.bean(RelationPerProduct.class));
relationDataset.show(false);
Assertions.assertEquals(7, relationDataset.count());
RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());
Assertions.assertEquals(1, temp.getFunding().size());
Assertions.assertEquals(2, temp.getRelatedProduct().size());
Assertions.assertEquals(1, temp.getRelatedProduct().stream().filter(rp -> rp.getRelation_type().equalsIgnoreCase("issupplementedby")).count());
Assertions.assertEquals(1, temp.getRelatedProduct().stream().filter(rp -> rp.getRelation_type().equalsIgnoreCase("isdocumentedby")).count());
relationDataset.show(false);
Assertions.assertEquals(7, relationDataset.count());
RelationPerProduct temp = relationDataset
.filter(
(FilterFunction<RelationPerProduct>) r -> r
.getResultId()
.equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
.first();
Assertions
.assertEquals(
3, temp.getFunding().size() + temp.getRelatedProduct().size() + temp.getOrganizations().size());
Assertions.assertEquals(1, temp.getFunding().size());
Assertions.assertEquals(2, temp.getRelatedProduct().size());
Assertions
.assertEquals(
1,
temp
.getRelatedProduct()
.stream()
.filter(rp -> rp.getRelation_type().equalsIgnoreCase("issupplementedby"))
.count());
Assertions
.assertEquals(
1,
temp
.getRelatedProduct()
.stream()
.filter(rp -> rp.getRelation_type().equalsIgnoreCase("isdocumentedby"))
.count());
JavaRDD<ResearchProduct> researchProduct = sc
.textFile(workingDir.toString() + "/publication/researchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));
JavaRDD<ResearchProduct> researchProduct = sc
.textFile(workingDir.toString() + "/publication/researchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));
org.apache.spark.sql.Dataset<ResearchProduct> researchProductDataset = spark
.createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class));
org.apache.spark.sql.Dataset<ResearchProduct> researchProductDataset = spark
.createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class));
Assertions.assertEquals(1, researchProductDataset.filter((FilterFunction<ResearchProduct>) p -> p.getLocal_identifier().equalsIgnoreCase(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))).count());
ResearchProduct product = researchProductDataset.filter((FilterFunction<ResearchProduct>) p -> p.getLocal_identifier().equalsIgnoreCase(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))).first();
Assertions.assertEquals(2, product.getRelevant_organizations().size());
Assertions.assertEquals(1, product.getFunding().size());
Assertions.assertEquals(0, product.getRelated_products().size());
Assertions.assertEquals(1, product.getContributions().size());
Assertions.assertEquals(2, product.getManifestations().size());
Assertions
.assertEquals(
1,
researchProductDataset
.filter(
(FilterFunction<ResearchProduct>) p -> p
.getLocal_identifier()
.equalsIgnoreCase(
Utils
.getIdentifier(
Prefixes.RESEARCH_PRODUCT,
"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")))
.count());
ResearchProduct product = researchProductDataset
.filter(
(FilterFunction<ResearchProduct>) p -> p
.getLocal_identifier()
.equalsIgnoreCase(
Utils
.getIdentifier(
Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")))
.first();
Assertions.assertEquals(2, product.getRelevant_organizations().size());
Assertions.assertEquals(1, product.getFunding().size());
Assertions.assertEquals(0, product.getRelated_products().size());
Assertions.assertEquals(1, product.getContributions().size());
Assertions.assertEquals(2, product.getManifestations().size());
researchProductDataset.show(false);
researchProductDataset.show(false);
}
@Test
public void testEmitFromDedupedResult() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
.getPath();
}
final String workingDir = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir_complete_entities/")
.getPath();
@Test
public void testEmitFromDedupedResult() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
.getPath();
DumpResult
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-workingDir", workingDir,
"-outputPath", workingDir
final String workingDir = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir_complete_entities/")
.getPath();
});
DumpResult
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-workingDir", workingDir,
"-outputPath", workingDir
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
});
JavaRDD<ResearchProduct> researchProduct = sc
.textFile(workingDir.toString() + "ResearchProduct")
.map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
org.apache.spark.sql.Dataset<ResearchProduct> researchProductDataset = spark
.createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class));
JavaRDD<ResearchProduct> researchProduct = sc
.textFile(workingDir.toString() + "ResearchProduct")
.map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));
Assertions.assertEquals(1, researchProductDataset.count());
org.apache.spark.sql.Dataset<ResearchProduct> researchProductDataset = spark
.createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class));
ResearchProduct rp = researchProductDataset.first();
Assertions.assertEquals(1, researchProductDataset.count());
// check the local identifier
Assertions.assertEquals("product_____::e22a152ab43b9215d14ece613f76ec84", rp.getLocal_identifier());
ResearchProduct rp = researchProductDataset.first();
// check the pids of the result
Assertions.assertEquals(3, rp.getIdentifiers().size());
Assertions
.assertEquals(1, rp.getIdentifiers().stream().filter(p -> p.getScheme().equalsIgnoreCase("doi")).count());
Assertions
.assertEquals(
"10.1007/s40199-021-00403-x",
rp
.getIdentifiers()
.stream()
.filter(p -> p.getScheme().equalsIgnoreCase("doi"))
.collect(Collectors.toList())
.get(0)
.getValue());
Assertions
.assertEquals(1, rp.getIdentifiers().stream().filter(p -> p.getScheme().equalsIgnoreCase("pmid")).count());
Assertions
.assertEquals(
"34327650",
rp
.getIdentifiers()
.stream()
.filter(p -> p.getScheme().equalsIgnoreCase("pmid"))
.collect(Collectors.toList())
.get(0)
.getValue());
Assertions
.assertEquals(1, rp.getIdentifiers().stream().filter(p -> p.getScheme().equalsIgnoreCase("pmc")).count());
Assertions
.assertEquals(
"PMC8602609",
rp
.getIdentifiers()
.stream()
.filter(p -> p.getScheme().equalsIgnoreCase("pmc"))
.collect(Collectors.toList())
.get(0)
.getValue());
//check the local identifier
Assertions.assertEquals("product_____::e22a152ab43b9215d14ece613f76ec84", rp.getLocal_identifier());
// check the title
Assertions.assertEquals(1, rp.getTitles().keySet().size());
Assertions.assertTrue(rp.getTitles().keySet().contains("none"));
Assertions.assertEquals(1, rp.getTitles().get("none").size());
//check the pids of the result
Assertions.assertEquals(3,rp.getIdentifiers().size());
Assertions.assertEquals(1, rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("doi")).count());
Assertions.assertEquals("10.1007/s40199-021-00403-x", rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("doi")).collect(Collectors.toList()).get(0).getValue());
Assertions.assertEquals(1, rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmid")).count());
Assertions.assertEquals("34327650", rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmid")).collect(Collectors.toList()).get(0).getValue());
Assertions.assertEquals(1, rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmc")).count());
Assertions.assertEquals("PMC8602609", rp.getIdentifiers().stream().filter(p->p.getScheme().equalsIgnoreCase("pmc")).collect(Collectors.toList()).get(0).getValue());
// check abstract
Assertions.assertEquals(1, rp.getAbstracts().keySet().size());
Assertions.assertTrue(rp.getAbstracts().keySet().contains("none"));
Assertions.assertEquals(1, rp.getAbstracts().get("none").size());
//check the title
Assertions.assertEquals(1, rp.getTitles().keySet().size());
Assertions.assertTrue(rp.getTitles().keySet().contains("none"));
Assertions.assertEquals(1, rp.getTitles().get("none").size());
// check type
Assertions.assertEquals("literature", rp.getProduct_type());
//check abstract
Assertions.assertEquals(1, rp.getAbstracts().keySet().size());
Assertions.assertTrue(rp.getAbstracts().keySet().contains("none"));
Assertions.assertEquals(1, rp.getAbstracts().get("none").size());
// check topics
Assertions.assertEquals(3, rp.getTopics().size());
Assertions
.assertTrue(
rp
.getTopics()
.stream()
.anyMatch(
t -> t
.getTopic()
.equalsIgnoreCase(Prefixes.TOPIC.label + DHPUtils.md5("FOSSustained delivery"))));
//check type
Assertions.assertEquals("literature", rp.getProduct_type());
// check contributions
Assertions.assertEquals(4, rp.getContributions().size());
Assertions
.assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("person")).count());
Assertions
.assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("temp")).count());
rp.getContributions().forEach(c -> Assertions.assertTrue(c.getDeclared_affiliation() == null));
Assertions
.assertEquals(
1,
rp
.getContributions()
.stream()
.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-8284-6269true")))
.collect(Collectors.toList())
.get(0)
.getRank());
Assertions
.assertEquals(
2,
rp
.getContributions()
.stream()
.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0002-0940-893xtrue")))
.collect(Collectors.toList())
.get(0)
.getRank());
Assertions
.assertEquals(
3,
rp
.getContributions()
.stream()
.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-5291-577xtrue")))
.collect(Collectors.toList())
.get(0)
.getRank());
Assertions
.assertEquals(
4,
rp
.getContributions()
.stream()
.filter(
c -> c
.getPerson()
.equals(
Utils
.getIdentifier(
Prefixes.TEMPORARY_PERSON,
"50|doi_dedup___::0000661be7c602727bae9690778b16514")))
.collect(Collectors.toList())
.get(0)
.getRank());
researchProductDataset.show(10, 100, true);
//check topics
Assertions.assertEquals(3, rp.getTopics().size());
Assertions.assertTrue(rp.getTopics().stream().anyMatch(t -> t.getTopic().equalsIgnoreCase(Prefixes.TOPIC.label + DHPUtils.md5("FOSSustained delivery"))));
// check manifestation 1
Assertions.assertEquals(3, rp.getManifestations().size());
Manifestation manifestation = rp
.getManifestations()
.stream()
.filter(
m -> m
.getHosting_datasource()
.equals(
Utils.getIdentifier(Prefixes.DATASOURCE, "10|doajarticles::6107489403b31fc7cf37cb7fda35f7f1")))
.collect(Collectors.toList())
.get(0);
Assertions.assertEquals("Article", manifestation.getProduct_local_type());
Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
Assertions.assertEquals(1, manifestation.getDates().size());
Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue());
Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
Assertions.assertEquals(PeerReview.PEER_REVIEWED.label, manifestation.getPeer_review());
Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
Assertions.assertEquals(AccessRight.CLOSED.label, manifestation.getAccess_right());
Assertions.assertEquals("Springer Nature TDM", manifestation.getLicence());
Assertions.assertEquals("https://doi.org/10.1007/s40199-021-00403-x", manifestation.getUrl());
Assertions.assertEquals("10.1007/s40199-021-00403-x", manifestation.getPid());
Assertions.assertTrue(manifestation.getBiblio() != null);
Biblio biblio = manifestation.getBiblio();
Assertions.assertTrue(biblio.getEdition() == null);
Assertions.assertTrue(biblio.getIssue() == null);
Assertions.assertEquals("Springer Science and Business Media LLC", biblio.getPublisher());
Assertions.assertEquals("29", biblio.getVolume());
Assertions.assertEquals("415", biblio.getStart_page());
Assertions.assertEquals("438", biblio.getEnd_page());
//check contributions
Assertions.assertEquals(4, rp.getContributions().size());
Assertions.assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("person")).count());
Assertions.assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("temp")).count());
rp.getContributions().forEach(c -> Assertions.assertTrue(c.getDeclared_affiliation()==null));
Assertions.assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-8284-6269true")))
.collect(Collectors.toList()).get(0).getRank());
Assertions.assertEquals(2, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0002-0940-893xtrue")))
.collect(Collectors.toList()).get(0).getRank());
Assertions.assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-5291-577xtrue")))
.collect(Collectors.toList()).get(0).getRank());
Assertions.assertEquals(4, rp.getContributions().stream().filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, "50|doi_dedup___::0000661be7c602727bae9690778b16514")))
.collect(Collectors.toList()).get(0).getRank());
researchProductDataset.show(10,100,true);
// check manifestation 2
manifestation = rp
.getManifestations()
.stream()
.filter(
m -> m
.getHosting_datasource()
.equals(
Utils.getIdentifier(Prefixes.DATASOURCE, "10|openaire____::55045bd2a65019fd8e6741a755395c8c")))
.collect(Collectors.toList())
.get(0);
Assertions.assertEquals("Article", manifestation.getProduct_local_type());
Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
Assertions.assertEquals(1, manifestation.getDates().size());
Assertions.assertEquals("2020-01-03", manifestation.getDates().get(0).getValue());
Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review());
Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
Assertions.assertEquals(AccessRight.UNAVAILABLE.label, manifestation.getAccess_right());
Assertions.assertEquals(null, manifestation.getLicence());
Assertions.assertEquals("https://pubmed.ncbi.nlm.nih.gov/34327650", manifestation.getUrl());
Assertions.assertEquals("34327650", manifestation.getPid());
Assertions.assertTrue(manifestation.getBiblio() == null);
//check manifestation 1
Assertions.assertEquals(3, rp.getManifestations().size());
Manifestation manifestation = rp.getManifestations().stream().filter(m -> m.getHosting_datasource().equals(Utils.getIdentifier(Prefixes.DATASOURCE , "10|doajarticles::6107489403b31fc7cf37cb7fda35f7f1")))
.collect(Collectors.toList()).get(0);
Assertions.assertEquals("Article" , manifestation.getProduct_local_type());
Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
Assertions.assertEquals(1, manifestation.getDates().size());
Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue());
Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
Assertions.assertEquals(PeerReview.PEER_REVIEWED.label, manifestation.getPeer_review());
Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
Assertions.assertEquals(AccessRight.CLOSED.label, manifestation.getAccess_right());
Assertions.assertEquals("Springer Nature TDM", manifestation.getLicence());
Assertions.assertEquals("https://doi.org/10.1007/s40199-021-00403-x", manifestation.getUrl());
Assertions.assertEquals("10.1007/s40199-021-00403-x", manifestation.getPid());
Assertions.assertTrue(manifestation.getBiblio() != null);
Biblio biblio = manifestation.getBiblio();
Assertions.assertTrue(biblio.getEdition() == null);
Assertions.assertTrue(biblio.getIssue() == null);
Assertions.assertEquals("Springer Science and Business Media LLC",biblio.getPublisher() );
Assertions.assertEquals("29", biblio.getVolume());
Assertions.assertEquals("415", biblio.getStart_page());
Assertions.assertEquals("438", biblio.getEnd_page());
// check manifestation 3
manifestation = rp
.getManifestations()
.stream()
.filter(
m -> m
.getHosting_datasource()
.equals(
Utils.getIdentifier(Prefixes.DATASOURCE, "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c")))
.collect(Collectors.toList())
.get(0);
Assertions.assertEquals("Other literature type", manifestation.getProduct_local_type());
Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
Assertions.assertEquals(1, manifestation.getDates().size());
Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue());
Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review());
Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
Assertions.assertEquals(AccessRight.OPEN.label, manifestation.getAccess_right());
Assertions.assertEquals(null, manifestation.getLicence());
Assertions.assertEquals("https://europepmc.org/articles/PMC8602609/", manifestation.getUrl());
Assertions.assertEquals("PMC8602609", manifestation.getPid());
Assertions.assertTrue(manifestation.getBiblio() == null);
//check manifestation 2
manifestation = rp.getManifestations().stream().filter(m -> m.getHosting_datasource().equals(Utils.getIdentifier(Prefixes.DATASOURCE , "10|openaire____::55045bd2a65019fd8e6741a755395c8c")))
.collect(Collectors.toList()).get(0);
Assertions.assertEquals("Article" , manifestation.getProduct_local_type());
Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
Assertions.assertEquals(1, manifestation.getDates().size());
Assertions.assertEquals("2020-01-03", manifestation.getDates().get(0).getValue());
Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review());
Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
Assertions.assertEquals(AccessRight.UNAVAILABLE.label, manifestation.getAccess_right());
Assertions.assertEquals(null, manifestation.getLicence());
Assertions.assertEquals("https://pubmed.ncbi.nlm.nih.gov/34327650", manifestation.getUrl());
Assertions.assertEquals("34327650", manifestation.getPid());
Assertions.assertTrue(manifestation.getBiblio() == null);
// check relevant organization
Assertions.assertEquals(1, rp.getRelevant_organizations().size());
Assertions
.assertEquals(
Prefixes.ORGANIZATION.label + "601e510b1fda7cc6cb03329531502171",
rp.getRelevant_organizations().get(0));
//check manifestation 3
manifestation = rp.getManifestations().stream().filter(m -> m.getHosting_datasource().equals(Utils.getIdentifier(Prefixes.DATASOURCE , "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c")))
.collect(Collectors.toList()).get(0);
Assertions.assertEquals("Other literature type" , manifestation.getProduct_local_type());
Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
Assertions.assertEquals(1, manifestation.getDates().size());
Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue());
Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review());
Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
Assertions.assertEquals(AccessRight.OPEN.label, manifestation.getAccess_right());
Assertions.assertEquals(null, manifestation.getLicence());
Assertions.assertEquals("https://europepmc.org/articles/PMC8602609/", manifestation.getUrl());
Assertions.assertEquals("PMC8602609", manifestation.getPid());
Assertions.assertTrue(manifestation.getBiblio() == null);
// check funding
Assertions.assertEquals(1, rp.getFunding().size());
Assertions.assertEquals(Prefixes.GRANT.label + "a7795022763d413f5de59036ebbd0c52", rp.getFunding().get(0));
//check relevant organization
Assertions.assertEquals(1,rp.getRelevant_organizations().size());
Assertions.assertEquals(Prefixes.ORGANIZATION.label + "601e510b1fda7cc6cb03329531502171", rp.getRelevant_organizations().get(0));
// check related products
Assertions.assertEquals(5, rp.getRelated_products().size());
Assertions
.assertEquals(
4,
rp
.getRelated_products()
.stream()
.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.CITATION.label))
.collect(Collectors.toList())
.get(0)
.getProduct_list()
.size());
Assertions
.assertEquals(
1,
rp
.getRelated_products()
.stream()
.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.DOCUMENTS.label))
.collect(Collectors.toList())
.get(0)
.getProduct_list()
.size());
Assertions
.assertEquals(
1,
rp
.getRelated_products()
.stream()
.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.PART.label))
.collect(Collectors.toList())
.get(0)
.getProduct_list()
.size());
Assertions
.assertEquals(
1,
rp
.getRelated_products()
.stream()
.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.SUPPLEMENT.label))
.collect(Collectors.toList())
.get(0)
.getProduct_list()
.size());
Assertions
.assertEquals(
1,
rp
.getRelated_products()
.stream()
.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.VERSION.label))
.collect(Collectors.toList())
.get(0)
.getProduct_list()
.size());
//check funding
Assertions.assertEquals(1,rp.getFunding().size());
Assertions.assertEquals(Prefixes.GRANT.label + "a7795022763d413f5de59036ebbd0c52", rp.getFunding().get(0));
//check related products
Assertions.assertEquals(5, rp.getRelated_products().size());
Assertions.assertEquals(4, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.CITATION.label)).collect(Collectors.toList()).get(0).getProduct_list().size());
Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.DOCUMENTS.label)).collect(Collectors.toList()).get(0).getProduct_list().size());
Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.PART.label)).collect(Collectors.toList()).get(0).getProduct_list().size());
Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.SUPPLEMENT.label)).collect(Collectors.toList()).get(0).getProduct_list().size());
Assertions.assertEquals(1, rp.getRelated_products().stream().filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.VERSION.label)).collect(Collectors.toList()).get(0).getProduct_list().size());
}
}
}
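
The assertions above repeatedly pull a single record out of a typed Dataset with a FilterFunction before inspecting its fields. A minimal sketch of that lookup, not part of the commit; the class and method names are hypothetical, and it assumes the getLocal_identifier() accessor used in the test.

package eu.dnetlib.dhp.oa.graph.dump.skgif;

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;

import eu.dnetlib.dhp.skgif.model.ResearchProduct;

/** Hypothetical helper, for illustration only: fetch one product by its local identifier. */
public class ResearchProductLookupSketch {

	/** Returns the single matching product, failing fast when the dump contains zero or many. */
	public static ResearchProduct byLocalIdentifier(Dataset<ResearchProduct> products, String localIdentifier) {
		Dataset<ResearchProduct> matching = products
			.filter((FilterFunction<ResearchProduct>) p -> p.getLocal_identifier().equalsIgnoreCase(localIdentifier));
		if (matching.count() != 1) {
			throw new IllegalStateException("expected exactly one product for " + localIdentifier);
		}
		return matching.first();
	}
}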

View File

@ -5,7 +5,6 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.skgif.model.Topic;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -24,9 +23,9 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.skgif.model.Persons;
import eu.dnetlib.dhp.skgif.model.Topic;
//@Disabled
public class EmitFromResultJobTest {
@ -101,17 +100,54 @@ public class EmitFromResultJobTest {
.createDataset(persons.rdd(), Encoders.bean(Persons.class));
personsDataset.show(false);
Persons claudiaBorer = personsDataset.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))
.first();
Persons claudiaBorer = personsDataset
.filter(
(FilterFunction<Persons>) p -> p
.getLocal_identifier()
.equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))
.first();
Assertions.assertEquals(2, personsDataset.filter((FilterFunction<Persons>) p -> p.getGiven_name().equalsIgnoreCase("claudia") && p.getFamily_name().equalsIgnoreCase("borer")).count());
Assertions.assertEquals(1, personsDataset.filter((FilterFunction<Persons>) p -> p.getGiven_name().equalsIgnoreCase("claudia") && p.getFamily_name().equalsIgnoreCase("borer") && !p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db")).count());
Assertions
.assertEquals(
2,
personsDataset
.filter(
(FilterFunction<Persons>) p -> p.getGiven_name().equalsIgnoreCase("claudia")
&& p.getFamily_name().equalsIgnoreCase("borer"))
.count());
Assertions
.assertEquals(
1,
personsDataset
.filter(
(FilterFunction<Persons>) p -> p.getGiven_name().equalsIgnoreCase("claudia")
&& p.getFamily_name().equalsIgnoreCase("borer")
&& !p
.getLocal_identifier()
.equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))
.count());
Assertions.assertEquals("claudia", claudiaBorer.getGiven_name().toLowerCase());
Assertions.assertEquals("borer", claudiaBorer.getFamily_name().toLowerCase());
Assertions.assertEquals(2, personsDataset.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person")).count());
Assertions.assertEquals(1, personsDataset.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person") && p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916")).count());
Persons orcidPerson = personsDataset.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person") && p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916")).first();
Assertions
.assertEquals(
2,
personsDataset
.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person"))
.count());
Assertions
.assertEquals(
1,
personsDataset
.filter(
(FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person")
&& p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916"))
.count());
Persons orcidPerson = personsDataset
.filter(
(FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person")
&& p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916"))
.first();
Assertions.assertEquals("M.", orcidPerson.getGiven_name());
Assertions.assertEquals("Kooi", orcidPerson.getFamily_name());
Assertions.assertEquals(1, orcidPerson.getIdentifiers().size());
@ -119,58 +155,57 @@ public class EmitFromResultJobTest {
Assertions.assertEquals("0000-0002-5597-4916", orcidPerson.getIdentifiers().get(0).getValue());
Dataset<EmitPerManifestation> manifestationDataset = spark
.createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class));
.createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class));
manifestationDataset.show(false);
Assertions.assertEquals(4, manifestationDataset.count());
Dataset<Topic> topicDataset = spark
.createDataset(topics.rdd(), Encoders.bean(Topic.class));
.createDataset(topics.rdd(), Encoders.bean(Topic.class));
Assertions.assertEquals(0, topicDataset.count());
}
@Test
public void testEmitFromResultComplete() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
.getPath();
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
.getPath();
EmitFromResults
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/result/",
"-workingDir", workingDir.toString() + "/"
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/result/",
"-workingDir", workingDir.toString() + "/"
});
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Persons> persons = sc
.textFile(workingDir.toString() + "/result/Persons")
.map(item -> OBJECT_MAPPER.readValue(item, Persons.class));
.textFile(workingDir.toString() + "/result/Persons")
.map(item -> OBJECT_MAPPER.readValue(item, Persons.class));
org.apache.spark.sql.Dataset<Persons> personsDataset = spark
.createDataset(persons.rdd(), Encoders.bean(Persons.class));
.createDataset(persons.rdd(), Encoders.bean(Persons.class));
personsDataset.foreach((ForeachFunction<Persons>) p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p)));
JavaRDD<Topic> topics = sc
.textFile(workingDir.toString() + "/result/Topic")
.map(item -> OBJECT_MAPPER.readValue(item, Topic.class));
Dataset<Topic> topicDataset = spark
.createDataset(topics.rdd(), Encoders.bean(Topic.class));
.textFile(workingDir.toString() + "/result/Topic")
.map(item -> OBJECT_MAPPER.readValue(item, Topic.class));
Dataset<Topic> topicDataset = spark
.createDataset(topics.rdd(), Encoders.bean(Topic.class));
Assertions.assertEquals(3, topicDataset.count());
topicDataset.foreach((ForeachFunction<Topic>) t -> System.out.println(OBJECT_MAPPER.writeValueAsString(t)));
JavaRDD<EmitPerManifestation> manifestation = sc
.textFile(workingDir.toString() + "/publication/manifestation")
.map(item -> OBJECT_MAPPER.readValue(item, EmitPerManifestation.class));
.textFile(workingDir.toString() + "/publication/manifestation")
.map(item -> OBJECT_MAPPER.readValue(item, EmitPerManifestation.class));
Dataset<EmitPerManifestation> manifestationDataset = spark
.createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class));
.createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class));
manifestationDataset.show(false);
// Persons claudiaBorer = personsDataset.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))
@ -194,6 +229,5 @@ public class EmitFromResultJobTest {
// Assertions.assertEquals(4, manifestationDataset.count());
//
}
}