[SKG-IF] refactoring and fixing issues

This commit is contained in:
Miriam Baglioni 2024-03-01 09:35:15 +01:00
parent 0c887ca015
commit 752fd896e4
32 changed files with 2082 additions and 1496 deletions

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
@@ -7,8 +8,9 @@ import java.io.Serializable;
 * @Date 22/02/24
 */
public class Contributor implements Serializable {
    private String person; // I would not map it because we have only information regarding the person (if any)
                           // associated to the leading organization
    private String organization; // contributors.person
    private String role;// private

}

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
@@ -8,143 +9,146 @@ import java.util.List;
 * @Date 21/02/24
 */
public class Datasource implements Serializable {
    private String local_identifier;// id
    private List<Identifier> identifiers; // .schema pid.qualifier.classid;identifiers.value pid.value
    private String name; // officialname.value
    private String submission_policy_url;// submissionpolicyurl
    private String preservation_policy_url;// preservationpolicyurl
    private Boolean version_control;// versioncontrol bool
    private List<PersistentIdentitySystems> persistent_identity_systems;// . product_type researchentitytype list type
                                                                        // to be remapped to the eosc types
    // persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
    private String jurisdiction;// jurisdiction.classname
    private String data_source_classification;// eoscdatasourcetype.classname
    private List<String> research_product_type;// researchentitytype list type to be remapped to the eosc types
    private Boolean thematic;// thematic bool
    private List<Licence> research_product_license; // .name not mappable listresearch_product_license.url not mappable
    private List<String> research_product_access_policy;// "databaseaccesstype if open => open access
    // (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
    // if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
    // if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
    private List<Licence> research_product_metadata_license; // .name not mappable list
    // research_product_metadata_license.url not mappable
    private List<String> research_product_metadata_access_policy;// researchproductmetadataccesspolicies list with the
    // same mapping of research_product_access_policy

    public String getLocal_identifier() {
        return local_identifier;
    }

    public void setLocal_identifier(String local_identifier) {
        this.local_identifier = local_identifier;
    }

    public List<Identifier> getIdentifiers() {
        return identifiers;
    }

    public void setIdentifiers(List<Identifier> identifiers) {
        this.identifiers = identifiers;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSubmission_policy_url() {
        return submission_policy_url;
    }

    public void setSubmission_policy_url(String submission_policy_url) {
        this.submission_policy_url = submission_policy_url;
    }

    public String getPreservation_policy_url() {
        return preservation_policy_url;
    }

    public void setPreservation_policy_url(String preservation_policy_url) {
        this.preservation_policy_url = preservation_policy_url;
    }

    public Boolean getVersion_control() {
        return version_control;
    }

    public void setVersion_control(Boolean version_control) {
        this.version_control = version_control;
    }

    public List<PersistentIdentitySystems> getPersistent_identity_systems() {
        return persistent_identity_systems;
    }

    public void setPersistent_identity_systems(List<PersistentIdentitySystems> persistent_identity_systems) {
        this.persistent_identity_systems = persistent_identity_systems;
    }

    public String getJurisdiction() {
        return jurisdiction;
    }

    public void setJurisdiction(String jurisdiction) {
        this.jurisdiction = jurisdiction;
    }

    public String getData_source_classification() {
        return data_source_classification;
    }

    public void setData_source_classification(String data_source_classification) {
        this.data_source_classification = data_source_classification;
    }

    public List<String> getResearch_product_type() {
        return research_product_type;
    }

    public void setResearch_product_type(List<String> research_product_type) {
        this.research_product_type = research_product_type;
    }

    public Boolean getThematic() {
        return thematic;
    }

    public void setThematic(Boolean thematic) {
        this.thematic = thematic;
    }

    public List<Licence> getResearch_product_license() {
        return research_product_license;
    }

    public void setResearch_product_license(List<Licence> research_product_license) {
        this.research_product_license = research_product_license;
    }

    public List<String> getResearch_product_access_policy() {
        return research_product_access_policy;
    }

    public void setResearch_product_access_policy(List<String> research_product_access_policy) {
        this.research_product_access_policy = research_product_access_policy;
    }

    public List<Licence> getResearch_product_metadata_license() {
        return research_product_metadata_license;
    }

    public void setResearch_product_metadata_license(List<Licence> research_product_metadata_license) {
        this.research_product_metadata_license = research_product_metadata_license;
    }

    public List<String> getResearch_product_metadata_access_policy() {
        return research_product_metadata_access_policy;
    }

    public void setResearch_product_metadata_access_policy(List<String> research_product_metadata_access_policy) {
        this.research_product_metadata_access_policy = research_product_metadata_access_policy;
    }
}

View File

@@ -1,153 +1,154 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
import java.util.List;

import org.codehaus.jackson.annotate.JsonProperty;

/**
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class Grant implements Serializable {
    private String local_identifier;// id
    private List<Identifier> identifiers;// .schema pid.qualifier.classid identifiers.value pid.value
    // identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
    // identifiers.value project.code
    private String title;// title.value
    @JsonProperty(value = "abstract")
    private String summary;// summary.value
    private String acronym; // acronym.value
    private String funder;// fundingtree to be used the xpath //funder/name
    private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
    private String currency;// currency.value
    private Float funded_amount;// ' fundedamount.value
    private List<String> keywords;// subject.value
    private String start_date;// startdate.value
    private String end_date;// enddate.value
    private String website;// websiteurl.value
    private List<String> beneficiaries;// organization.id for the organizations in the relation with semantic class
    // isParticipant produces the list of organization internal identifiers
    private List<Contributor> contributors;//

    public String getLocal_identifier() {
        return local_identifier;
    }

    public void setLocal_identifier(String local_identifier) {
        this.local_identifier = local_identifier;
    }

    public List<Identifier> getIdentifiers() {
        return identifiers;
    }

    public void setIdentifiers(List<Identifier> identifiers) {
        this.identifiers = identifiers;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSummary() {
        return summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getAcronym() {
        return acronym;
    }

    public void setAcronym(String acronym) {
        this.acronym = acronym;
    }

    public String getFunder() {
        return funder;
    }

    public void setFunder(String funder) {
        this.funder = funder;
    }

    public String getFunding_stream() {
        return funding_stream;
    }

    public void setFunding_stream(String funding_stream) {
        this.funding_stream = funding_stream;
    }

    public String getCurrency() {
        return currency;
    }

    public void setCurrency(String currency) {
        this.currency = currency;
    }

    public Float getFunded_amount() {
        return funded_amount;
    }

    public void setFunded_amount(Float funded_amount) {
        this.funded_amount = funded_amount;
    }

    public List<String> getKeywords() {
        return keywords;
    }

    public void setKeywords(List<String> keywords) {
        this.keywords = keywords;
    }

    public String getStart_date() {
        return start_date;
    }

    public void setStart_date(String start_date) {
        this.start_date = start_date;
    }

    public String getEnd_date() {
        return end_date;
    }

    public void setEnd_date(String end_date) {
        this.end_date = end_date;
    }

    public String getWebsite() {
        return website;
    }

    public void setWebsite(String website) {
        this.website = website;
    }

    public List<String> getBeneficiaries() {
        return beneficiaries;
    }

    public void setBeneficiaries(List<String> beneficiaries) {
        this.beneficiaries = beneficiaries;
    }

    public List<Contributor> getContributors() {
        return contributors;
    }

    public void setContributors(List<Contributor> contributors) {
        this.contributors = contributors;
    }
}

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
@@ -8,77 +9,77 @@ import java.util.List;
 * @Date 21/02/24
 */
public class Organization implements Serializable {
    private String local_identifier; // id
    private List<Identifier> identifiers; // pid.qualifier.classid; pid.value list
    private String name; // legalname.value
    private String short_name; // legalshortname.value
    private List<String> other_names;// alternative_names.value list
    private String website;// websiteurl.value
    private String country; // country.classid
    private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"

    public String getLocal_identifier() {
        return local_identifier;
    }

    public void setLocal_identifier(String local_identifier) {
        this.local_identifier = local_identifier;
    }

    public List<Identifier> getIdentifiers() {
        return identifiers;
    }

    public void setIdentifiers(List<Identifier> identifiers) {
        this.identifiers = identifiers;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getShort_name() {
        return short_name;
    }

    public void setShort_name(String short_name) {
        this.short_name = short_name;
    }

    public List<String> getOther_names() {
        return other_names;
    }

    public void setOther_names(List<String> other_names) {
        this.other_names = other_names;
    }

    public String getWebsite() {
        return website;
    }

    public void setWebsite(String website) {
        this.website = website;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }
}

View File

@@ -1,20 +1,17 @@
package eu.dnetlib.dhp.skgif.model;

public enum OrganizationTypes {
    ARCHIVE("archive"),
    COMPANY("company"),
    EDUCATION("education"), FACILITY("facility"), GOVERNMENT("government"), HEALTHCARE("healthcare"), NONPROFIT(
        "nonprofit"), FUNDER("funder"), OTHER("other");

    public final String label;

    private OrganizationTypes(String label) {
        this.label = label;
    }
}
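
A minimal sketch of how the Organization.type field comment above ("map relevant types from the ec* fields of organisations. If no match, default to 'other'") could be realised with this enum. It is illustrative only: the actual resolver is not part of this diff, and the flag names and their precedence here are assumptions.

public class OrganizationTypeExample {
    // Hypothetical resolver: the boolean parameters stand in for the ec* flags of an
    // OpenAIRE organization record; names and precedence are assumptions, not the dump's logic.
    static String resolve(boolean ecHigherEducation, boolean ecNonProfit, boolean ecEnterprise) {
        if (ecHigherEducation)
            return OrganizationTypes.EDUCATION.label;
        if (ecNonProfit)
            return OrganizationTypes.NONPROFIT.label;
        if (ecEnterprise)
            return OrganizationTypes.COMPANY.label;
        return OrganizationTypes.OTHER.label;
    }

    public static void main(String[] args) {
        System.out.println(resolve(true, false, false)); // education
        System.out.println(resolve(false, false, false)); // other
    }
}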

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
@@ -8,24 +8,22 @@ import java.io.Serializable;
 * @Date 21/02/24
 */
public enum Prefixes implements Serializable {
    RESEARCH_PRODUCT("product_____::"),
    ORGANIZATION("organization::"),
    GRANT("grant_______::"),
    PERSON(
        "person______::"),
    TEMPORARY_PERSON("temp_person_::"),
    DATASOURCE("datasource__::"), TOPIC("topic_______::"), VENUE("venue_______::");

    public final String label;

    private Prefixes(String label) {
        this.label = label;
    }
}
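
The Dump* classes later in this commit build each SKG-IF local_identifier by combining one of these prefixes with the OpenAIRE identifier via Utils.getIdentifier. That utility is not shown in this diff, so the following is only a sketch of the idea, assuming the prefix label is followed by an md5 of the OpenAIRE id.

import org.apache.commons.codec.digest.DigestUtils;

public class PrefixExample {
    // Assumed behaviour of Utils.getIdentifier: prefix label + md5 of the OpenAIRE id.
    static String getIdentifier(Prefixes prefix, String openaireId) {
        return prefix.label + DigestUtils.md5Hex(openaireId);
    }

    public static void main(String[] args) {
        // prints something like "datasource__::" followed by 32 hex characters
        System.out.println(getIdentifier(Prefixes.DATASOURCE, "10|doajarticles::dummyid"));
    }
}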

View File

@@ -8,15 +8,10 @@ import java.io.Serializable;
 * @Date 05/09/23
 */
public enum RelationType implements Serializable {
    RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
        "hasAuthorInstitution"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"), SUPPLEMENT(
            "IsSupplementedBy"), DOCUMENTS(
                "IsDocumentedBy"), PART("IsPartOf"), VERSION("IsNewVersionOf"), CITATION("Cites");

    public final String label;

View File

@@ -16,7 +16,7 @@ public class ResearchProduct implements Serializable {
    private String local_identifier;
    private List<Identifier> identifiers;
    private Map<String, List<String>> titles;
    private Map<String, List<String>> abstracts;
    @JsonProperty("product_type")
    private String product_type;
    private List<ResultTopic> topics;

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
@@ -8,95 +9,95 @@ import java.util.List;
 * @Date 27/02/24
 */
public class Venue implements Serializable {
    private String local_identifier;
    private List<Identifier> identifiers;
    private String name;
    private String acronym;
    private String type;
    private String publisher;
    private String series;
    private Boolean is_currently_full_oa;
    private String creation_date;
    private List<VenueContribution> contributions;

    public String getLocal_identifier() {
        return local_identifier;
    }

    public void setLocal_identifier(String local_identifier) {
        this.local_identifier = local_identifier;
    }

    public List<Identifier> getIdentifiers() {
        return identifiers;
    }

    public void setIdentifiers(List<Identifier> identifiers) {
        this.identifiers = identifiers;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getAcronym() {
        return acronym;
    }

    public void setAcronym(String acronym) {
        this.acronym = acronym;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPublisher() {
        return publisher;
    }

    public void setPublisher(String publisher) {
        this.publisher = publisher;
    }

    public String getSeries() {
        return series;
    }

    public void setSeries(String series) {
        this.series = series;
    }

    public Boolean getIs_currently_full_oa() {
        return is_currently_full_oa;
    }

    public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
        this.is_currently_full_oa = is_currently_full_oa;
    }

    public String getCreation_date() {
        return creation_date;
    }

    public void setCreation_date(String creation_date) {
        this.creation_date = creation_date;
    }

    public List<VenueContribution> getContributions() {
        return contributions;
    }

    public void setContributions(List<VenueContribution> contributions) {
        this.contributions = contributions;
    }
}

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;
@@ -8,23 +9,23 @@ import java.util.List;
 * @Date 27/02/24
 */
public class VenueContribution implements Serializable {
    private String person;
    private List<String> roles;

    public String getPerson() {
        return person;
    }

    public void setPerson(String person) {
        this.person = person;
    }

    public List<String> getRoles() {
        return roles;
    }

    public void setRoles(List<String> roles) {
        this.roles = roles;
    }
}

View File

@@ -1,22 +1,16 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;

public enum VenueIdentifierType implements Serializable {
    EISSN("eissn"), ISSN("issn"), LISSN("lissn"), ISBN("isbn"), OPENDOAR(
        "opendoar"), R3DATA("re3data.org"), FAIRSHARING("fairsharing");

    public final String label;

    private VenueIdentifierType(String label) {
        this.label = label;
    }
}

View File

@@ -1,21 +1,16 @@
package eu.dnetlib.dhp.skgif.model;

import java.io.Serializable;

public enum VenueType implements Serializable {
    REPOSITORY("repository"), JOURNAL("journal"), CONFERENCE("conference"), BOOK("book"), OTHER(
        "other"), UNKNOWN("unknown");

    public final String label;

    private VenueType(String label) {
        this.label = label;
    }
}

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.exceptions;

public class CardinalityTooHighException extends Exception {

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.exceptions;

public class NoAvailableEntityTypeException extends Exception {

View File

@@ -1,9 +1,12 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@@ -15,133 +18,156 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;

/**
 * @author miriam.baglioni
 * @Date 21/02/24
 */
public class DumpDatasource implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                DumpDatasource.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String workingDir = parser.get("workingDir");
        log.info("workingDir: {}", workingDir);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath + "Datasources");
                mapDatasource(spark, inputPath, outputPath);
            });
    }

    private static void mapDatasource(SparkSession spark, String inputPath, String outputPath) {
        Utils
            .readPath(spark, inputPath + "datasource", Datasource.class)
            .filter(
                (FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
                    && !d.getDataInfo().getDeletedbyinference())
            .map((MapFunction<Datasource, eu.dnetlib.dhp.skgif.model.Datasource>) d -> {
                eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
                datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
                datasource
                    .setIdentifiers(
                        d
                            .getPid()
                            .stream()
                            .map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
                            .collect(Collectors.toList()));
                datasource.setName(d.getOfficialname().getValue());
                datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
                datasource
                    .setJurisdiction(
                        Optional
                            .ofNullable(d.getJurisdiction())
                            .map(v -> v.getClassid())
                            .orElse(new String()));
                datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
                datasource.setVersion_control(d.getVersioncontrol());
                datasource
                    .setData_source_classification(
                        Optional
                            .ofNullable(d.getEoscdatasourcetype())
                            .map(v -> v.getClassname())
                            .orElse(new String()));
                datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
                datasource.setThematic(d.getThematic());
                datasource
                    .setResearch_product_access_policy(
                        Optional
                            .ofNullable(d.getDatabaseaccesstype())
                            .map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
                            .orElse(new ArrayList<>()));
                datasource
                    .setResearch_product_metadata_access_policy(
                        Optional
                            .ofNullable(d.getResearchproductmetadataaccesspolicies())
                            .map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
                            .orElse(new ArrayList<>()));
                return datasource;
            }, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "Datasource");
    }

    private static List<String> getResearchProductAccessPolicy(List<String> value) {
        return value
            .stream()
            .map(v -> getResearchProductAccessPolicy(v))
            .filter(Objects::nonNull)
            .map(v -> v.get(0))
            .distinct()
            .collect(Collectors.toList());
    }

    private static List<String> getResearchProductAccessPolicy(String value) {
        // "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
        // if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
        // if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
        switch (value) {
            case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
                return Arrays.asList("open access");
            case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
                return Arrays.asList("restricted access");
            case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
                return Arrays.asList("metadata only access");
            default:
                return null;
        }
    }

    private static List<String> getEoscProductType(List<String> researchentitytypes) {

        List<String> eoscProductType = new ArrayList<>();
        if (researchentitytypes != null) {

            if (researchentitytypes.contains("Software"))
                eoscProductType.add("Research Software");
            if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
                eoscProductType.add("Research Literature");
            if (researchentitytypes.contains("Research Data"))
                eoscProductType.add("Research Data");
            if (researchentitytypes.contains("Organization") ||
                researchentitytypes.contains("Organizations") ||
                researchentitytypes.contains("Services") ||
                researchentitytypes.contains("Projects"))
                eoscProductType.add("Other research product");
        }
        return eoscProductType;
    }
}
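
mapDatasource above writes the mapped records as gzip-compressed JSON under outputPath + "Datasource". A quick way to sanity-check that output is to read it back as the SKG-IF Datasource bean; this snippet is illustrative only and the path is a placeholder.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class InspectDumpedDatasources {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("inspect").getOrCreate();
        // placeholder path: whatever was passed as outputPath to DumpDatasource, plus "Datasource"
        Dataset<eu.dnetlib.dhp.skgif.model.Datasource> dumped = spark
            .read()
            .json("/tmp/skgif/Datasource")
            .as(Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class));
        dumped.show(10, false);
        spark.stop();
    }
}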

View File

@@ -1,13 +1,16 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@@ -22,141 +25,185 @@ import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.skgif.model.Grant;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.RelationType;
import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class DumpGrant implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                DumpGrant.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String workingDir = parser.get("workingDir");
        log.info("workingDir: {}", workingDir);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath + "Grant");
                mapGrants(spark, inputPath, outputPath);
            });
    }

    private static void mapGrants(SparkSession spark, String inputPath, String outputPath) {
        Dataset<Project> projects = Utils
            .readPath(spark, inputPath + "project", Project.class)
            .filter(
                (FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
                    !p.getDataInfo().getInvisible());
        Dataset<Relation> relations = Utils
            .readPath(spark, inputPath + "relation", Relation.class)
            .filter(
                (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
                    !r.getDataInfo().getInvisible() &&
                    r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label));
        projects
            .joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left")
            .groupByKey((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._1().getId(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, Grant>) (k, v) -> {
                Grant g = new Grant();
                Tuple2<Project, Relation> first = v.next();
                g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k));
                g.setIdentifiers(getProjectIdentifier(first._1()));
                g.setTitle(first._1().getTitle().getValue());
                g
                    .setSummary(
                        Optional
                            .ofNullable(first._1().getSummary())
                            .map(value -> value.getValue())
                            .orElse(new String()));
                g
                    .setAcronym(
                        Optional
                            .ofNullable(first._1().getAcronym())
                            .map(value -> value.getValue())
                            .orElse(new String()));
                g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue()));
                // * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
                g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue()));
                g
                    .setCurrency(
                        Optional
                            .ofNullable(first._1().getCurrency())
                            .map(value -> value.getValue())
                            .orElse(new String()));
                g
                    .setFunded_amount(
                        Optional
                            .ofNullable(first._1().getFundedamount())
                            .orElse(null));
                g
                    .setKeywords(
                        first
                            ._1()
                            .getSubjects()
                            .stream()
                            .map(s -> s.getValue())
                            .collect(Collectors.toList()));
                g
                    .setStart_date(
                        Optional
                            .ofNullable(first._1().getStartdate())
                            .map(value -> value.getValue())
                            .orElse(new String()));
                g
                    .setEnd_date(
                        Optional
                            .ofNullable(first._1().getEnddate())
                            .map(value -> value.getValue())
                            .orElse(new String()));
                g
                    .setWebsite(
                        Optional
                            .ofNullable(first._1().getWebsiteurl())
                            .map(value -> value.getValue())
                            .orElse(new String()));
                if (Optional.ofNullable(first._2()).isPresent()) {
                    List<String> relevantOrganizatios = new ArrayList<>();
                    relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource()));
                    v
                        .forEachRemaining(
                            t2 -> relevantOrganizatios
                                .add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource())));
                    g.setBeneficiaries(relevantOrganizatios);
                }
                return g;
            }, Encoders.bean(Grant.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "Grant");
    }

    private static String getFundingStream(String fundingtree) throws DocumentException {
        final Document doc;
        doc = new SAXReader().read(new StringReader(fundingtree));
        if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
            doc.selectNodes("//funding_level_0").size() > 0)
            return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
        return new String();
    }

    private static String getFunderName(String fundingtree) throws DocumentException {
        final Document doc;
        doc = new SAXReader().read(new StringReader(fundingtree));
        // f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
        return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
        // f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
    }

    private static List<Identifier> getProjectIdentifier(Project project) throws DocumentException {
        List<Identifier> identifiers = new ArrayList<>();
        if (project.getPid().size() > 0)
            project
                .getPid()
                .stream()
                .forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())));
        identifiers
            .add(
                Identifier
                    .newInstance(
                        getFunderName(project.getFundingtree().get(0).getValue()), project.getCode().getValue()));
        return identifiers;
    }
}
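
getFunderName and getFundingStream above both parse the project's fundingtree XML with dom4j xpaths. The fragment below is made up purely to show the shape those xpaths expect; it is illustrative only and not taken from a real OpenAIRE record.

import java.io.StringReader;

import org.dom4j.Document;
import org.dom4j.io.SAXReader;

public class FundingTreeExample {
    public static void main(String[] args) throws Exception {
        // fabricated fragment: a funder element plus a text-only funding_level_0 element
        String fundingtree = "<fundingtree><funder><name>European Commission</name>"
            + "<shortname>EC</shortname></funder>"
            + "<funding_level_0>H2020</funding_level_0></fundingtree>";
        Document doc = new SAXReader().read(new StringReader(fundingtree));
        // //funder/name is what getFunderName extracts
        System.out.println(((org.dom4j.Node) doc.selectNodes("//funder/name").get(0)).getText());
        // //funding_level_0 is what getFundingStream extracts (it returns an empty string when absent)
        System.out.println(((org.dom4j.Node) doc.selectNodes("//funding_level_0").get(0)).getText());
    }
}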

View File

@ -1,10 +1,12 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif; package eu.dnetlib.dhp.oa.graph.dump.skgif;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.skgif.model.Identifier; import java.io.Serializable;
import eu.dnetlib.dhp.skgif.model.OrganizationTypes; import java.util.Optional;
import eu.dnetlib.dhp.skgif.model.Prefixes; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
@ -16,95 +18,117 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.Serializable; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.util.Optional; import eu.dnetlib.dhp.schema.oaf.Organization;
import java.util.stream.Collectors; import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import eu.dnetlib.dhp.skgif.model.Prefixes;
/**
 * @author miriam.baglioni
 * @Date 21/02/24
 */
public class DumpOrganization implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				DumpOrganization.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/dump_organization_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath + "Organization");
				mapOrganization(spark, inputPath, outputPath);
			});
	}
	private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
		Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
		organizations
			.filter(
				(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
					&& !o.getDataInfo().getInvisible())
			.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
				eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
				organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
				organization
					.setCountry(
						Optional
							.ofNullable(o.getCountry().getClassid())
							.orElse(new String()));
				organization
					.setName(
						Optional
							.ofNullable(o.getLegalname().getValue())
							.orElse(new String()));
				organization
					.setShort_name(
						Optional
							.ofNullable(o.getLegalshortname())
							.map(v -> v.getValue())
							.orElse(new String()));
				organization
					.setIdentifiers(
						o
							.getPid()
							.stream()
							.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
							.collect(Collectors.toList()));
				organization
					.setOther_names(
						o
							.getAlternativeNames()
							.stream()
							.map(a -> a.getValue())
							.collect(Collectors.toList()));
				organization.setType(getOrganizationType(o));
				return organization;
			}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "Organization");
	}
	private static String getOrganizationType(Organization o) {
		if (Optional.ofNullable(o.getEcenterprise()).isPresent()
			&& o.getEcenterprise().getValue().equalsIgnoreCase("true"))
			return OrganizationTypes.COMPANY.label;
		if (Optional.ofNullable(o.getEchighereducation()).isPresent()
			&& o.getEchighereducation().getValue().equalsIgnoreCase("true"))
			return OrganizationTypes.EDUCATION.label;
		if (Optional.ofNullable(o.getEcresearchorganization()).isPresent()
			&& o.getEcresearchorganization().getValue().equalsIgnoreCase("true"))
			return OrganizationTypes.EDUCATION.label;
		if (Optional.ofNullable(o.getEcnonprofit()).isPresent()
			&& o.getEcnonprofit().getValue().equalsIgnoreCase("true"))
			return OrganizationTypes.NONPROFIT.label;
		return OrganizationTypes.OTHER.label;
	}
}
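For illustration only, this is the kind of SKG-IF Organization record mapOrganization produces. Every value below is invented: the identifier scheme/value pair is hypothetical, and the real local_identifier is Prefixes.ORGANIZATION.label followed by the md5 of the graph id.

import java.util.Arrays;

import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Organization;
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;

public class OrganizationDumpExample {
	public static void main(String[] args) {
		// Invented values; they only illustrate the fields populated by mapOrganization above.
		Organization org = new Organization();
		org.setLocal_identifier("organization::0123456789abcdef0123456789abcdef"); // prefix label + md5(graph id)
		org.setName("Example Research Institute");
		org.setShort_name("ERI");
		org.setCountry("IT");
		org.setType(OrganizationTypes.EDUCATION.label);
		org.setIdentifiers(Arrays.asList(Identifier.newInstance("ror", "https://ror.org/00example0")));
		org.setOther_names(Arrays.asList("Istituto di Ricerca di Esempio"));
		System.out.println(org.getName() + " [" + org.getType() + "]");
	}
}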

View File

@ -6,11 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -22,9 +17,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import eu.dnetlib.dhp.utils.DHPUtils;
@ -106,15 +105,20 @@ public class DumpResult implements Serializable {
		Dataset<Datasource> datasource = Utils
			.readPath(spark, inputPath + "/datasource", Datasource.class)
			.filter(
				(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEoscdatasourcetype()).isPresent() &&
					d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));

		Dataset<EmitPerManifestation> man = Utils
			.readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);

		Dataset<PartialResearchProduct> partialResearchProduct = man
			.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
			.groupByKey(
				(MapFunction<Tuple2<EmitPerManifestation, Datasource>, String>) t2 -> t2._1().getResultId(),
				Encoders.STRING())
			.mapGroups(
				(MapGroupsFunction<String, Tuple2<EmitPerManifestation, Datasource>, PartialResearchProduct>) (
					k, v) -> {
					PartialResearchProduct prp = new PartialResearchProduct();
					prp.setResultId(k);
					List<Manifestation> manifestationList = new ArrayList<>();
@ -124,10 +128,13 @@ public class DumpResult implements Serializable {
					return prp;
				}, Encoders.bean(PartialResearchProduct.class));
		partialResearchProduct
			.joinWith(
				aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")),
				"left")
			.map(
				(MapFunction<Tuple2<PartialResearchProduct, RelationPerProduct>, PartialResearchProduct>) t2 -> {
					PartialResearchProduct prp = t2._1();
					if (Optional.ofNullable(t2._2()).isPresent()) {
						prp.setRelated_products(t2._2().getRelatedProduct());
						prp.setRelevant_organizations(t2._2().getOrganizations());
						prp.setFunding(t2._2().getFunding());
@ -144,148 +151,83 @@ public class DumpResult implements Serializable {
	private static Manifestation getManifestation(Tuple2<EmitPerManifestation, Datasource> t2) {
		// if the right side of the join is present we also have the biblio and the venue,
		// otherwise only the other values can be set
		EmitPerManifestation epm = t2._1();
		Manifestation manifestation = new Manifestation();
		manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
		manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
		if (Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
			manifestation
				.setDates(
					Arrays
						.asList(
							Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
		if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
			switch (epm.getInstance().getRefereed().getClassid()) {
				case "0000":
					manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
					break;
				case "0001":
					manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
					break;
				case "0002":
					manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
					break;
			}
		manifestation.setMetadata_curation("unavailable");
		if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
			switch (epm.getInstance().getAccessright().getClassid()) {
				case "OPEN":
				case "OPEN DATA":
				case "OPEN SOURCE":
					manifestation.setAccess_right(AccessRight.OPEN.label);
					break;
				case "CLOSED":
					manifestation.setAccess_right(AccessRight.CLOSED.label);
					break;
				case "RESTRICTED":
					manifestation.setAccess_right(AccessRight.RESTRICTED.label);
					break;
				case "EMBARGO":
				case "12MONTHS":
				case "6MONTHS":
					manifestation.setAccess_right(AccessRight.EMBARGO.label);
					break;
				default:
					manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
			}
		manifestation
			.setLicence(
				Optional
					.ofNullable(epm.getInstance().getLicense())
					.map(value -> value.getValue())
					.orElse(null));
		manifestation
			.setUrl(
				Optional
					.ofNullable(epm.getInstance().getUrl())
					.map(value -> value.get(0))
					.orElse(null));
		if (Optional.ofNullable(epm.getInstance().getPid()).isPresent() && epm.getInstance().getPid().size() > 0) {
			manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
		}
		if (Optional.ofNullable(t2._2()).isPresent()) {
			manifestation.setBiblio(getBiblio(epm));
			if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent())
				manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnPrinted()));
			else if (Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent())
				manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline()));
		}
		manifestation
			.setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()));
		return manifestation;
	}
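As a quick sanity check of the mapping above, the following is a minimal sketch of the values a Manifestation is expected to carry for an instance with refereed classid "0001" and access right "OPEN" when the datasource join finds no match (so biblio and venue stay unset). The hostedby key is invented.

import eu.dnetlib.dhp.oa.graph.dump.skgif.Utils;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import eu.dnetlib.dhp.skgif.model.Manifestation;
import eu.dnetlib.dhp.skgif.model.PeerReview;
import eu.dnetlib.dhp.skgif.model.Prefixes;

public class ManifestationMappingExample {
	public static void main(String[] args) {
		// Invented hostedby key; real keys are OpenAIRE datasource identifiers.
		String hostedByKey = "10|openaire____::exampledatasource";
		Manifestation expected = new Manifestation();
		expected.setPeer_review(PeerReview.PEER_REVIEWED.label); // refereed classid "0001"
		expected.setAccess_right(AccessRight.OPEN.label); // classid "OPEN" / "OPEN DATA" / "OPEN SOURCE"
		expected.setMetadata_curation("unavailable"); // always set by this mapper
		expected.setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE, hostedByKey));
		System.out.println(expected.getPeer_review() + " / " + expected.getAccess_right());
	}
}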
// private static List<Manifestation> getManifestationList(Dataset<EmitPerManifestation> emitformanifestation,
// Dataset<Datasource> datasource) {
// return emitformanifestation
// .joinWith(
// datasource, emitformanifestation
// .col("hostedBy")
// .equalTo(datasource.col("id")),
// "left")
// .map((MapFunction<Tuple2<EmitPerManifestation, Datasource>, Manifestation>) t2 -> {
// // se il lato sinistro c'e' allora ho la biblio e la venue
// // se non c'e' allora ho solo gli altri valori
// EmitPerManifestation epm = t2._1();
// Manifestation manifestation = new Manifestation();
// manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getClassname());
// manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
// manifestation
// .setDates(
// Arrays
// .asList(
// Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
// if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
// switch (epm.getInstance().getRefereed().getClassid()) {
// case "0000":
// manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
// break;
// case "0001":
// manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
// break;
// case "0002":
// manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
// break;
// }
//
// manifestation.setMetadata_curation("unavailable");
// if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
// switch (epm.getInstance().getAccessright().getClassid()) {
// case "OPEN":
// case "OPEN DATA":
// case "OPEN SOURCE":
// manifestation.setAccess_right(AccessRight.OPEN.label);
// break;
// case "CLOSED":
// manifestation.setAccess_right(AccessRight.CLOSED.label);
// break;
// case "RESTRICTED":
// manifestation.setAccess_right(AccessRight.RESTRICTED.label);
// break;
// case "EMBARGO":
// case "12MONTHS":
// case "6MONTHS":
// manifestation.setAccess_right(AccessRight.EMBARGO.label);
// break;
// default:
// manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
//
// }
// manifestation.setLicence(epm.getInstance().getLicense().getValue());
// manifestation.setUrl(epm.getInstance().getUrl().get(0));
// if (Optional.ofNullable(epm.getInstance().getPid()).isPresent()) {
// manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
// }
// if (Optional.ofNullable(t2._2()).isPresent())
// manifestation.setBiblio(getBiblio(epm));
// manifestation.setVenue("venue_______::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
// manifestation
// .setHosting_datasource("datasource__::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
// return manifestation;
// }, Encoders.bean(Manifestation.class))
// .collectAsList();
// }
	private static Biblio getBiblio(EmitPerManifestation epm) {
		Biblio biblio = new Biblio();
		biblio.setEdition(epm.getJournal().getEdition());
@ -298,7 +240,7 @@ public class DumpResult implements Serializable {
	}

	private static <R extends Result> void dumpResult(SparkSession spark, String inputPath, String workingDir,
		String outputPath) {
		ModelSupport.entityTypes
			.keySet()
			.parallelStream()
@ -314,14 +256,14 @@ public class DumpResult implements Serializable {
				.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
				.map((MapFunction<Tuple2<R, PartialResearchProduct>, ResearchProduct>) t2 -> {
					ResearchProduct rp = ResultMapper.map(t2._1());
					if (Optional.ofNullable(t2._2()).isPresent()) {
						if (Optional.ofNullable(t2._2().getRelated_products()).isPresent())
							rp.setRelated_products(t2._2().getRelated_products());
						if (Optional.ofNullable(t2._2().getFunding()).isPresent())
							rp.setFunding(t2._2().getFunding());
						if (Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent())
							rp.setRelevant_organizations(t2._2().getRelevant_organizations());
						if (Optional.ofNullable(t2._2().getManifestations()).isPresent())
							rp.setManifestations(t2._2().getManifestations());
					}
					return rp;
@ -333,30 +275,37 @@ public class DumpResult implements Serializable {
			});

		Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
		for (EntityType e : ModelSupport.entityTypes.keySet()) {
			if (ModelSupport.isResult(e))
				researchProducts = researchProducts
					.union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class));
		}
		researchProducts
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "ResearchProduct");
	}
	private static void selectRelations(SparkSession spark, String inputPath, String workingDir) {
		Dataset<Relation> relation = Utils
			.readPath(
				spark,
				inputPath + "relation", Relation.class)
			.filter(
				(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
					!r.getDataInfo().getInvisible())
			.filter(
				(FilterFunction<Relation>) r -> r
					.getRelClass()
					.equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) ||
					r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) ||
					r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) ||
					r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label) ||
					r.getRelClass().equalsIgnoreCase(RelationType.PART.label) ||
					r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) ||
					r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label));

		relation
			.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
@ -373,12 +322,14 @@ public class DumpResult implements Serializable {
						rpp.getOrganizations().add(Utils.getIdentifier(Prefixes.ORGANIZATION, target));
						break;
					case "isproducedby":
						rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT, target));
						break;
					default:
						if (!remainignRelations.keySet().contains(relClass))
							remainignRelations.put(relClass, new ArrayList<>());
						remainignRelations
							.get(relClass)
							.add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target));
				}
			}
			for (String key : remainignRelations.keySet())

View File

@ -1,156 +1,179 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple2;
/**
 * @author miriam.baglioni
 * @Date 21/02/24
 */
public class DumpVenue implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(DumpVenue.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				DumpVenue.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String workingDir = parser.get("workingDir");
		log.info("workingDir: {}", workingDir);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath + "Venue");
				mapVenue(spark, inputPath, outputPath, workingDir);
			});
	}
	private static void mapVenue(SparkSession spark, String inputPath, String outputPath, String workingDir) {
		Dataset<EmitPerManifestation> manifestationDataset = Utils
			.readPath(spark, workingDir + "datasourcePublisher", EmitPerManifestation.class);
		Dataset<Datasource> datasourceDataset = Utils
			.readPath(spark, inputPath + "datasource", Datasource.class)
			.filter(
				(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
					&& !d.getDataInfo().getDeletedbyinference()
					&& d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
		datasourceDataset
			.joinWith(
				manifestationDataset, datasourceDataset.col("id").equalTo(manifestationDataset.col("hostedby.key")),
				"left")
			.map((MapFunction<Tuple2<Datasource, EmitPerManifestation>, Venue>) t2 -> {
				Venue venue = new Venue();
				Datasource d = t2._1();
				if (Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent())
					venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted()));
				else if (Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent())
					venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline()));
				venue.setIdentifiers(getVenueIdentifier(d.getJournal()));
				venue.setName(d.getOfficialname().getValue());
				venue.setType(VenueType.JOURNAL.label);
				if (Optional.ofNullable(t2._2()).isPresent())
					venue.setPublisher(t2._2().getPublisher());
				venue.setAcronym(null);
				venue.setSeries(null);
				venue.setIs_currently_full_oa(null);
				venue.setCreation_date(null);
				venue.setContributions(null);
				return venue;
			}, Encoders.bean(Venue.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "Venues");

		Utils
			.readPath(spark, workingDir + "Venues", Venue.class)
			.groupByKey((MapFunction<Venue, String>) v -> v.getLocal_identifier(), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Venue, Venue>) (k, v) -> v.next(), Encoders.bean(Venue.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "Venues");
	}
	private static List<Identifier> getVenueIdentifier(Journal journal) {
		List<Identifier> identifiers = new ArrayList<>();
		if (Optional.ofNullable((journal.getIssnOnline())).isPresent())
			identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline()));
		if (Optional.ofNullable(journal.getIssnPrinted()).isPresent())
			identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted()));
		if (Optional.ofNullable(journal.getIssnLinking()).isPresent())
			identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking()));
		return identifiers;
	}

	private static List<String> getResearchProductAccessPolicy(List<String> value) {
		return value
			.stream()
			.map(v -> getResearchProductAccessPolicy(v))
			.filter(Objects::nonNull)
			.map(v -> v.get(0))
			.distinct()
			.collect(Collectors.toList());
	}

	private static List<String> getResearchProductAccessPolicy(String value) {
		// databaseaccesstype: if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
		// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
		// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
		switch (value) {
			case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
				return Arrays.asList("open access");
			case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
				return Arrays.asList("restricted access");
			case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
				return Arrays.asList("metadata only access");
			default:
				return null;
		}
	}

	private static List<String> getEoscProductType(List<String> researchentitytypes) {

		List<String> eoscProductType = new ArrayList<>();
		if (researchentitytypes != null) {

			if (researchentitytypes.contains("Software"))
				eoscProductType.add("Research Software");
			if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
				eoscProductType.add("Research Literature");
			if (researchentitytypes.contains("Research Data"))
				eoscProductType.add("Research Data");
			if (researchentitytypes.contains("Organization") ||
				researchentitytypes.contains("Organizations") ||
				researchentitytypes.contains("Services") ||
				researchentitytypes.contains("Projects"))
				eoscProductType.add("Other research product");
		}
		return eoscProductType;
	}
}

View File

@ -7,8 +7,6 @@ import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@ -23,10 +21,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
@ -42,7 +41,7 @@ public class EmitFromResults implements Serializable {
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				EmitFromResults.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/emit_biblio_parameters.json"));
@ -91,17 +90,21 @@ public class EmitFromResults implements Serializable {
			Class<R> resultClazz = ModelSupport.entityTypes.get(e);
			Utils
				.readPath(spark, inputPath + e.name(), resultClazz)
				.filter((FilterFunction<R>) r -> Optional.ofNullable(r.getSubject()).isPresent())
				.flatMap(
					(FlatMapFunction<R, Topic>) r -> r
						.getSubject()
						.stream()
						.filter(
							s -> s.getQualifier().getClassid().equalsIgnoreCase("fos")
								|| s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
						.map(s -> {
							Topic t = new Topic();
							t
								.setLocal_identifier(
									Utils
										.getIdentifier(
											Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
							t
								.setIdentifiers(
									Arrays
@ -154,7 +157,8 @@ public class EmitFromResults implements Serializable {
						p.setGiven_name(a.getName());
						String identifier = new String();
						if (Optional.ofNullable(a.getPid()).isPresent()) {
							Tuple2<String, Boolean> orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils
								.getOrcid(a.getPid());
							if (orcid != null) {
								identifier = Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2());
								if (orcid._2())
@ -164,12 +168,15 @@ public class EmitFromResults implements Serializable {
							else
								p
									.setIdentifiers(
										Arrays
											.asList(Identifier.newInstance("inferred_orcid", orcid._1())));
						} else {
							if (Optional.ofNullable(a.getRank()).isPresent()) {
								identifier = Utils
									.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + a.getRank());
							} else {
								identifier = Utils
									.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + count);
							}
						}
@ -243,6 +250,32 @@ public class EmitFromResults implements Serializable {
			}
		});
Dataset<EmitPerManifestation> emitPerManifestationDataset = Utils
.readPath(
spark, workingDir + "software/manifestation", EmitPerManifestation.class)
.union(
Utils
.readPath(
spark, workingDir + "dataset/manifestation", EmitPerManifestation.class))
.union(
Utils
.readPath(
spark, workingDir + "publication/manifestation", EmitPerManifestation.class))
.union(
Utils
.readPath(
spark, workingDir + "otherresearchproduct/manifestation", EmitPerManifestation.class));
emitPerManifestationDataset
.groupByKey((MapFunction<EmitPerManifestation, String>) p -> p.getHostedBy(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, EmitPerManifestation, EmitPerManifestation>) (k, v) -> v.next(),
Encoders.bean(EmitPerManifestation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "/datasourcePublisher");
	}
}
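A sketch (not part of this commit) of how a downstream step such as DumpVenue can consume the datasourcePublisher output written above as a hostedBy-to-publisher lookup; it assumes getHostedBy() and getPublisher() return plain strings, as their use in the groupByKey suggests.

import java.util.Map;
import java.util.stream.Collectors;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.oa.graph.dump.skgif.Utils;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;

public class DatasourcePublisherLookupSketch {

	// Reads the one-record-per-hostedBy dataset and turns it into a datasource -> publisher map.
	public static Map<String, String> load(SparkSession spark, String workingDir) {
		Dataset<EmitPerManifestation> datasourcePublisher = Utils
			.readPath(spark, workingDir + "datasourcePublisher", EmitPerManifestation.class);
		return datasourcePublisher
			.collectAsList()
			.stream()
			.filter(e -> e.getPublisher() != null)
			.collect(
				Collectors.toMap(EmitPerManifestation::getHostedBy, EmitPerManifestation::getPublisher, (a, b) -> a));
	}
}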

View File

@ -5,10 +5,9 @@ import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoAllowedTypeException;
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoTitleFoundException;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
@ -54,20 +53,21 @@ public class ResultMapper implements Serializable {
			for (Author a : input.getAuthor()) {
				count += 1;
				Contribution contribution = new Contribution();
				Tuple2<String, Boolean> orcid = Utils.getOrcid(a.getPid());
				if (orcid != null) {
					contribution.setPerson(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2()));
				} else {
					if (Optional.ofNullable(a.getRank()).isPresent()) {
						contribution
							.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + a.getRank()));
					} else {
						contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + count));
					}
				}
				if (Optional.ofNullable(a.getRank()).isPresent()) {
					contribution.setRank(a.getRank());
				}
				contributionList.add(contribution);
			}
@ -83,12 +83,15 @@ public class ResultMapper implements Serializable {
					input
						.getSubject()
						.stream()
						.filter(
							s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") ||
								s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
						.map(s -> {
							ResultTopic topic = new ResultTopic();
							topic
								.setTopic(
									Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
							if (Optional.ofNullable(s.getDataInfo()).isPresent()) {
								Provenance provenance = new Provenance();
								provenance.setTrust(Double.valueOf(s.getDataInfo().getTrust()));
								provenance.setType(s.getDataInfo().getInferenceprovenance());
@ -101,7 +104,6 @@ public class ResultMapper implements Serializable {
		}
	}

	private static <E extends Result> void mapType(ResearchProduct out, E input) throws NoAllowedTypeException {
		switch (input.getResulttype().getClassid()) {
			case "publication":
@ -148,7 +150,7 @@ public class ResultMapper implements Serializable {
			.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
			.collect(Collectors.toList());
		if (!iTitle.isEmpty()) {
			out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
			return;
		}
@ -158,7 +160,7 @@ public class ResultMapper implements Serializable {
			.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
			.collect(Collectors.toList());
		if (!iTitle.isEmpty()) {
			out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
		}
} }
@ -169,6 +171,6 @@ public class ResultMapper implements Serializable {
		Optional
			.ofNullable(input.getDescription())
			.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
		out.setAbstracts(Collections.singletonMap("none", descriptionList));
	}
}

View File

@ -5,16 +5,18 @@ import java.io.Serializable;
import java.util.List;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/** /**
@ -32,11 +34,11 @@ public class Utils implements Serializable {
	}

	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		return spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
	}

	public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
@ -57,7 +59,7 @@ public class Utils implements Serializable {
		return null;
	}

	public static String getIdentifier(Prefixes entity, String id) {
		return entity.label + DHPUtils.md5(id);
	}
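getIdentifier is the single place where SKG-IF local identifiers are minted: the entity prefix label followed by the md5 of the source id. Below is a rough, self-contained equivalent; the "venue_______::" prefix is taken from the commented-out code earlier in this commit and is used here only as an example, not as the authoritative label.

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class LocalIdentifierSketch {
	public static void main(String[] args) throws Exception {
		String prefix = "venue_______::"; // assumed example of a Prefixes label
		String id = "1234-5678"; // e.g. a journal ISSN, as used for venues
		MessageDigest md = MessageDigest.getInstance("MD5");
		byte[] digest = md.digest(id.getBytes(StandardCharsets.UTF_8));
		String md5 = String.format("%032x", new BigInteger(1, digest));
		// The same value is produced wherever the dump refers to this venue,
		// so manifestation.venue and Venue.local_identifier line up.
		System.out.println(prefix + md5);
	}
}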

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,216 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="emit_from_result"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="emit_from_result">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Extraction</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResults</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="dump_result"/>
<error to="Kill"/>
</action>
<action name="dump_result">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpResult</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_datasource"/>
<error to="Kill"/>
</action>
<action name="dump_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpDatasource</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_venue"/>
<error to="Kill"/>
</action>
<action name="dump_venue">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpVenue</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_organization"/>
<error to="Kill"/>
</action>
<action name="dump_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpOrganization</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="dump_grant"/>
<error to="Kill"/>
</action>
<action name="dump_grant">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table grant</name>
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpGrant</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
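The workflow above chains the SKG-IF dump phases (result, datasource, venue, organization, grant) as Spark actions that share the same sourcePath, workingDir and outputPath parameters. For debugging a single phase outside Oozie, the same job classes can be driven directly, as the unit tests further down do. A minimal sketch follows; the wrapper class and the local paths are illustrative only, not part of this commit:

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public class LocalDumpDatasourceRun {
	public static void main(String[] args) throws Exception {
		// Create the local session up front, exactly as the tests do, so the job reuses it
		// when invoked with -isSparkSessionManaged=false.
		SparkConf conf = new SparkConf()
			.setAppName("LocalDumpDatasourceRun")
			.setMaster("local[*]")
			.set("spark.driver.host", "localhost");
		SparkSession.builder().config(conf).getOrCreate();

		eu.dnetlib.dhp.oa.graph.dump.skgif.DumpDatasource.main(new String[] {
			"-isSparkSessionManaged", Boolean.FALSE.toString(),
			"-sourcePath", "/tmp/skgif/graph/", // placeholder: local copy of the input graph
			"-outputPath", "/tmp/skgif/output/" // placeholder: where the Datasource dump is written
		});
	}
}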

View File

@@ -7,7 +7,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
-import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.jupiter.api.Assertions;

@@ -17,10 +16,10 @@ import org.junit.jupiter.api.Test;
import com.google.gson.Gson;
+import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromResultJobTest;
import eu.dnetlib.dhp.oa.zenodoapi.MissingConceptDoiException;
import eu.dnetlib.dhp.oa.zenodoapi.ZenodoAPIClient;

@Disabled
public class ZenodoUploadTest {

@@ -162,8 +161,6 @@ public class ZenodoUploadTest {
	}

	@Test
	void depositBigFile() throws MissingConceptDoiException, IOException {
		ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,

View File

@@ -1,8 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -18,76 +21,76 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.skgif.model.Datasource;
import eu.dnetlib.dhp.skgif.model.Organization;

/**
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class DumpDatasourceTest implements Serializable {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory.getLogger(DumpDatasourceTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(DumpDatasourceTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(DumpDatasourceTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(DumpDatasourceTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void testDumpDatasource() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
			.getPath();

		DumpDatasource
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-outputPath", workingDir.toString() + "/"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Datasource> datasource = sc
			.textFile(workingDir.toString() + "/Datasource")
			.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));

		Dataset<Datasource> datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class));

		Assertions.assertEquals(5, datasourceDataset.count());
		datasourceDataset.show(false);
		// Assertions.assertEquals(7, relationDataset.count());
		// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
		// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());

@@ -115,32 +118,33 @@ Assertions.assertEquals(5,datasourceDataset.count());
		//
		//
	}

	@Test
	public void testDumpDatasourceComplete() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
			.getPath();

		DumpDatasource
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-outputPath", workingDir.toString() + "/"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Datasource> datasource = sc
			.textFile(workingDir.toString() + "/Datasource")
			.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));

		Dataset<Datasource> datasourceDataset = spark.createDataset(datasource.rdd(), Encoders.bean(Datasource.class));

		datasourceDataset
			.foreach((ForeachFunction<Datasource>) d -> System.out.println(OBJECT_MAPPER.writeValueAsString(d)));
		// Assertions.assertEquals(7, relationDataset.count());
		// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
		// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());

@@ -168,5 +172,5 @@ Assertions.assertEquals(5,datasourceDataset.count());
		//
		//
	}
}
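The dump tests in this commit all verify output the same way: the dumped table is newline-delimited JSON, read back with Jackson and wrapped in a typed Dataset before asserting on counts and fields. A small generic helper would capture that pattern; the class below is only an illustrative sketch, not something introduced by the commit:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class DumpReadBack {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	// Reads one dumped entity table (JSON lines) back as a typed Dataset for assertions.
	public static <T> Dataset<T> read(SparkSession spark, String path, Class<T> clazz) {
		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
		return spark.createDataset(
			sc.textFile(path)
				.map(item -> OBJECT_MAPPER.readValue(item, clazz))
				.rdd(),
			Encoders.bean(clazz));
	}
}

With such a helper, the read-back in testDumpDatasource above would reduce to DumpReadBack.read(spark, workingDir.toString() + "/Datasource", Datasource.class) followed by the count assertion.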

View File

@@ -1,8 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -17,76 +20,76 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.skgif.model.Datasource;
import eu.dnetlib.dhp.skgif.model.Grant;

/**
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class DumpGrantTest implements Serializable {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory.getLogger(DumpGrantTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(DumpGrantTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(DumpGrantTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(DumpGrantTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void testDumpGrant() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
			.getPath();

		DumpGrant
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-outputPath", workingDir.toString() + "/"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Grant> grant = sc
			.textFile(workingDir.toString() + "/Grant")
			.map(item -> OBJECT_MAPPER.readValue(item, Grant.class));

		Dataset<Grant> grantDataset = spark.createDataset(grant.rdd(), Encoders.bean(Grant.class));

		Assertions.assertEquals(12, grantDataset.count());
		grantDataset.show(false);
		// Assertions.assertEquals(7, relationDataset.count());
		// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
		// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());

@@ -114,5 +117,5 @@ Assertions.assertEquals(12,grantDataset.count());
		//
		//
	}
}

View File

@@ -1,10 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -20,77 +21,79 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.skgif.model.Organization;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.ResearchProduct;

/**
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class DumpOrganizationTest implements Serializable {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory.getLogger(DumpOrganizationTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(DumpOrganizationTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(DumpOrganizationTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(DumpOrganizationTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void testDumpOrganization() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
			.getPath();

		DumpOrganization
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-outputPath", workingDir.toString() + "/"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Organization> organization = sc
			.textFile(workingDir.toString() + "/Organization")
			.map(item -> OBJECT_MAPPER.readValue(item, Organization.class));

		Dataset<Organization> organizationDataset = spark
			.createDataset(organization.rdd(), Encoders.bean(Organization.class));

		Assertions.assertEquals(34 - 19, organizationDataset.count());
		organizationDataset.show(false);
		// Assertions.assertEquals(7, relationDataset.count());
		// RelationPerProduct temp = relationDataset.filter((FilterFunction<RelationPerProduct>) r -> r.getResultId().equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).first();
		// Assertions.assertEquals(3, temp.getFunding().size()+temp.getRelatedProduct().size()+temp.getOrganizations().size());

@@ -118,5 +121,5 @@ Assertions.assertEquals(34-19,organizationDataset.count());
		//
		//
	}
}

View File

@@ -1,10 +1,13 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;

import javax.validation.constraints.AssertTrue;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -21,261 +24,456 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;

/**
 * @author miriam.baglioni
 * @Date 20/02/24
 */
public class DumpResultTest {
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory.getLogger(DumpResultTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(DumpResultTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(DumpResultTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(DumpResultTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void testEmitFromResult() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph/")
			.getPath();
		final String workingDir = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir/")
			.getPath();

		DumpResult
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-workingDir", workingDir
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<RelationPerProduct> relation = sc
			.textFile(workingDir + "/aggrelation")
			.map(item -> OBJECT_MAPPER.readValue(item, RelationPerProduct.class));

		Dataset<RelationPerProduct> relationDataset = spark
			.createDataset(relation.rdd(), Encoders.bean(RelationPerProduct.class));
		relationDataset.show(false);

		Assertions.assertEquals(7, relationDataset.count());

		RelationPerProduct temp = relationDataset
			.filter((FilterFunction<RelationPerProduct>) r -> r
				.getResultId()
				.equalsIgnoreCase("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
			.first();
		Assertions.assertEquals(3, temp.getFunding().size() + temp.getRelatedProduct().size() + temp.getOrganizations().size());
		Assertions.assertEquals(1, temp.getFunding().size());
		Assertions.assertEquals(2, temp.getRelatedProduct().size());
		Assertions.assertEquals(1, temp.getRelatedProduct().stream()
			.filter(rp -> rp.getRelation_type().equalsIgnoreCase("issupplementedby"))
			.count());
		Assertions.assertEquals(1, temp.getRelatedProduct().stream()
			.filter(rp -> rp.getRelation_type().equalsIgnoreCase("isdocumentedby"))
			.count());

		JavaRDD<ResearchProduct> researchProduct = sc
			.textFile(workingDir.toString() + "/publication/researchproduct")
			.map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));

		org.apache.spark.sql.Dataset<ResearchProduct> researchProductDataset = spark
			.createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class));

		Assertions.assertEquals(1, researchProductDataset
			.filter((FilterFunction<ResearchProduct>) p -> p
				.getLocal_identifier()
				.equalsIgnoreCase(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")))
			.count());

		ResearchProduct product = researchProductDataset
			.filter((FilterFunction<ResearchProduct>) p -> p
				.getLocal_identifier()
				.equalsIgnoreCase(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, "50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")))
			.first();
		Assertions.assertEquals(2, product.getRelevant_organizations().size());
		Assertions.assertEquals(1, product.getFunding().size());
		Assertions.assertEquals(0, product.getRelated_products().size());
		Assertions.assertEquals(1, product.getContributions().size());
		Assertions.assertEquals(2, product.getManifestations().size());

		researchProductDataset.show(false);
	}

	@Test
	public void testEmitFromDedupedResult() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
			.getPath();

		final String workingDir = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDir_complete_entities/")
			.getPath();

		DumpResult
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-workingDir", workingDir,
					"-outputPath", workingDir
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<ResearchProduct> researchProduct = sc
			.textFile(workingDir.toString() + "ResearchProduct")
			.map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));

		org.apache.spark.sql.Dataset<ResearchProduct> researchProductDataset = spark
			.createDataset(researchProduct.rdd(), Encoders.bean(ResearchProduct.class));

		Assertions.assertEquals(1, researchProductDataset.count());

		ResearchProduct rp = researchProductDataset.first();

		// check the local identifier
		Assertions.assertEquals("product_____::e22a152ab43b9215d14ece613f76ec84", rp.getLocal_identifier());

		// check the pids of the result
		Assertions.assertEquals(3, rp.getIdentifiers().size());
		Assertions.assertEquals(1, rp.getIdentifiers().stream()
			.filter(p -> p.getScheme().equalsIgnoreCase("doi")).count());
		Assertions.assertEquals("10.1007/s40199-021-00403-x", rp.getIdentifiers().stream()
			.filter(p -> p.getScheme().equalsIgnoreCase("doi"))
			.collect(Collectors.toList()).get(0).getValue());
		Assertions.assertEquals(1, rp.getIdentifiers().stream()
			.filter(p -> p.getScheme().equalsIgnoreCase("pmid")).count());
		Assertions.assertEquals("34327650", rp.getIdentifiers().stream()
			.filter(p -> p.getScheme().equalsIgnoreCase("pmid"))
			.collect(Collectors.toList()).get(0).getValue());
		Assertions.assertEquals(1, rp.getIdentifiers().stream()
			.filter(p -> p.getScheme().equalsIgnoreCase("pmc")).count());
		Assertions.assertEquals("PMC8602609", rp.getIdentifiers().stream()
			.filter(p -> p.getScheme().equalsIgnoreCase("pmc"))
			.collect(Collectors.toList()).get(0).getValue());

		// check the title
		Assertions.assertEquals(1, rp.getTitles().keySet().size());
		Assertions.assertTrue(rp.getTitles().keySet().contains("none"));
		Assertions.assertEquals(1, rp.getTitles().get("none").size());

		// check abstract
		Assertions.assertEquals(1, rp.getAbstracts().keySet().size());
		Assertions.assertTrue(rp.getAbstracts().keySet().contains("none"));
		Assertions.assertEquals(1, rp.getAbstracts().get("none").size());

		// check type
		Assertions.assertEquals("literature", rp.getProduct_type());

		// check topics
		Assertions.assertEquals(3, rp.getTopics().size());
		Assertions.assertTrue(rp.getTopics().stream()
			.anyMatch(t -> t.getTopic().equalsIgnoreCase(Prefixes.TOPIC.label + DHPUtils.md5("FOSSustained delivery"))));

		// check contributions
		Assertions.assertEquals(4, rp.getContributions().size());
		Assertions.assertEquals(3, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("person")).count());
		Assertions.assertEquals(1, rp.getContributions().stream().filter(c -> c.getPerson().startsWith("temp")).count());
		rp.getContributions().forEach(c -> Assertions.assertTrue(c.getDeclared_affiliation() == null));
		Assertions.assertEquals(1, rp.getContributions().stream()
			.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-8284-6269true")))
			.collect(Collectors.toList()).get(0).getRank());
		Assertions.assertEquals(2, rp.getContributions().stream()
			.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0002-0940-893xtrue")))
			.collect(Collectors.toList()).get(0).getRank());
		Assertions.assertEquals(3, rp.getContributions().stream()
			.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.PERSON, "0000-0001-5291-577xtrue")))
			.collect(Collectors.toList()).get(0).getRank());
		Assertions.assertEquals(4, rp.getContributions().stream()
			.filter(c -> c.getPerson().equals(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, "50|doi_dedup___::0000661be7c602727bae9690778b16514")))
			.collect(Collectors.toList()).get(0).getRank());

		researchProductDataset.show(10, 100, true);

		// check manifestation 1
		Assertions.assertEquals(3, rp.getManifestations().size());
		Manifestation manifestation = rp.getManifestations().stream()
			.filter(m -> m.getHosting_datasource()
				.equals(Utils.getIdentifier(Prefixes.DATASOURCE, "10|doajarticles::6107489403b31fc7cf37cb7fda35f7f1")))
			.collect(Collectors.toList()).get(0);
		Assertions.assertEquals("Article", manifestation.getProduct_local_type());
		Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
		Assertions.assertEquals(1, manifestation.getDates().size());
		Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue());
		Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
		Assertions.assertEquals(PeerReview.PEER_REVIEWED.label, manifestation.getPeer_review());
		Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
		Assertions.assertEquals(AccessRight.CLOSED.label, manifestation.getAccess_right());
		Assertions.assertEquals("Springer Nature TDM", manifestation.getLicence());
		Assertions.assertEquals("https://doi.org/10.1007/s40199-021-00403-x", manifestation.getUrl());
		Assertions.assertEquals("10.1007/s40199-021-00403-x", manifestation.getPid());
		Assertions.assertTrue(manifestation.getBiblio() != null);
		Biblio biblio = manifestation.getBiblio();
		Assertions.assertTrue(biblio.getEdition() == null);
		Assertions.assertTrue(biblio.getIssue() == null);
		Assertions.assertEquals("Springer Science and Business Media LLC", biblio.getPublisher());
		Assertions.assertEquals("29", biblio.getVolume());
		Assertions.assertEquals("415", biblio.getStart_page());
		Assertions.assertEquals("438", biblio.getEnd_page());

		// check manifestation 2
		manifestation = rp.getManifestations().stream()
			.filter(m -> m.getHosting_datasource()
				.equals(Utils.getIdentifier(Prefixes.DATASOURCE, "10|openaire____::55045bd2a65019fd8e6741a755395c8c")))
			.collect(Collectors.toList()).get(0);
		Assertions.assertEquals("Article", manifestation.getProduct_local_type());
		Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
		Assertions.assertEquals(1, manifestation.getDates().size());
		Assertions.assertEquals("2020-01-03", manifestation.getDates().get(0).getValue());
		Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
		Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review());
		Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
		Assertions.assertEquals(AccessRight.UNAVAILABLE.label, manifestation.getAccess_right());
		Assertions.assertEquals(null, manifestation.getLicence());
		Assertions.assertEquals("https://pubmed.ncbi.nlm.nih.gov/34327650", manifestation.getUrl());
		Assertions.assertEquals("34327650", manifestation.getPid());
		Assertions.assertTrue(manifestation.getBiblio() == null);

		// check manifestation 3
		manifestation = rp.getManifestations().stream()
			.filter(m -> m.getHosting_datasource()
				.equals(Utils.getIdentifier(Prefixes.DATASOURCE, "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c")))
			.collect(Collectors.toList()).get(0);
		Assertions.assertEquals("Other literature type", manifestation.getProduct_local_type());
		Assertions.assertEquals("dnet:publication_resource", manifestation.getProduct_local_type_schema());
		Assertions.assertEquals(1, manifestation.getDates().size());
		Assertions.assertEquals("2021-07-29", manifestation.getDates().get(0).getValue());
		Assertions.assertEquals("publishing", manifestation.getDates().get(0).getType());
		Assertions.assertEquals(PeerReview.NON_PEER_REVIEWED.label, manifestation.getPeer_review());
		Assertions.assertEquals("unavailable", manifestation.getMetadata_curation());
		Assertions.assertEquals(AccessRight.OPEN.label, manifestation.getAccess_right());
		Assertions.assertEquals(null, manifestation.getLicence());
		Assertions.assertEquals("https://europepmc.org/articles/PMC8602609/", manifestation.getUrl());
		Assertions.assertEquals("PMC8602609", manifestation.getPid());
		Assertions.assertTrue(manifestation.getBiblio() == null);

		// check relevant organization
		Assertions.assertEquals(1, rp.getRelevant_organizations().size());
		Assertions.assertEquals(Prefixes.ORGANIZATION.label + "601e510b1fda7cc6cb03329531502171", rp.getRelevant_organizations().get(0));

		// check funding
		Assertions.assertEquals(1, rp.getFunding().size());
		Assertions.assertEquals(Prefixes.GRANT.label + "a7795022763d413f5de59036ebbd0c52", rp.getFunding().get(0));

		// check related products
		Assertions.assertEquals(5, rp.getRelated_products().size());
		Assertions.assertEquals(4, rp.getRelated_products().stream()
			.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.CITATION.label))
			.collect(Collectors.toList()).get(0).getProduct_list().size());
		Assertions.assertEquals(1, rp.getRelated_products().stream()
			.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.DOCUMENTS.label))
			.collect(Collectors.toList()).get(0).getProduct_list().size());
		Assertions.assertEquals(1, rp.getRelated_products().stream()
			.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.PART.label))
			.collect(Collectors.toList()).get(0).getProduct_list().size());
		Assertions.assertEquals(1, rp.getRelated_products().stream()
			.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.SUPPLEMENT.label))
			.collect(Collectors.toList()).get(0).getProduct_list().size());
		Assertions.assertEquals(1, rp.getRelated_products().stream()
			.filter(r -> r.getRelation_type().equalsIgnoreCase(RelationType.VERSION.label))
			.collect(Collectors.toList()).get(0).getProduct_list().size());
	}
}
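The expected values above are built as Prefixes.<TYPE>.label followed by an MD5 hash of the source identifier (for example Prefixes.TOPIC.label + DHPUtils.md5("FOSSustained delivery")), and Utils.getIdentifier appears to apply the same rule. Assuming that reading is right, which is inferred from these assertions rather than from the Utils class itself, an equivalent standalone sketch using commons-codec in place of DHPUtils.md5 would be:

import org.apache.commons.codec.digest.DigestUtils;

public class SkgifIdentifierSketch {
	// Hypothetical re-implementation inferred from the assertions above:
	// local_identifier = <prefix label> + md5(<source identifier>).
	public static String getIdentifier(String prefixLabel, String sourceId) {
		return prefixLabel + DigestUtils.md5Hex(sourceId);
	}

	public static void main(String[] args) {
		// "product_____::" mirrors the asserted rp.getLocal_identifier() prefix above; it is a guess
		// at Prefixes.RESEARCH_PRODUCT.label, not a value taken from the enum.
		System.out.println(getIdentifier("product_____::", "50|doi_dedup___::0000661be7c602727bae9690778b16514"));
	}
}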

View File

@@ -5,7 +5,6 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-import eu.dnetlib.dhp.skgif.model.Topic;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -24,9 +23,9 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.skgif.model.Persons;
+import eu.dnetlib.dhp.skgif.model.Topic;

//@Disabled
public class EmitFromResultJobTest {

@@ -101,17 +100,54 @@ public class EmitFromResultJobTest {
			.createDataset(persons.rdd(), Encoders.bean(Persons.class));
		personsDataset.show(false);

		Persons claudiaBorer = personsDataset
			.filter((FilterFunction<Persons>) p -> p
				.getLocal_identifier()
				.equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))
			.first();

		Assertions.assertEquals(2, personsDataset
			.filter((FilterFunction<Persons>) p -> p.getGiven_name().equalsIgnoreCase("claudia")
				&& p.getFamily_name().equalsIgnoreCase("borer"))
			.count());
		Assertions.assertEquals(1, personsDataset
			.filter((FilterFunction<Persons>) p -> p.getGiven_name().equalsIgnoreCase("claudia")
				&& p.getFamily_name().equalsIgnoreCase("borer")
				&& !p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))
			.count());
		Assertions.assertEquals("claudia", claudiaBorer.getGiven_name().toLowerCase());
		Assertions.assertEquals("borer", claudiaBorer.getFamily_name().toLowerCase());

		Assertions.assertEquals(2, personsDataset
			.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person"))
			.count());
		Assertions.assertEquals(1, personsDataset
			.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person")
				&& p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916"))
			.count());
		Persons orcidPerson = personsDataset
			.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().startsWith("person")
				&& p.getIdentifiers().get(0).getValue().equals("0000-0002-5597-4916"))
			.first();
		Assertions.assertEquals("M.", orcidPerson.getGiven_name());
		Assertions.assertEquals("Kooi", orcidPerson.getFamily_name());
		Assertions.assertEquals(1, orcidPerson.getIdentifiers().size());

@@ -119,58 +155,57 @@ public class EmitFromResultJobTest {
		Assertions.assertEquals("0000-0002-5597-4916", orcidPerson.getIdentifiers().get(0).getValue());

		Dataset<EmitPerManifestation> manifestationDataset = spark
			.createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class));
		manifestationDataset.show(false);
		Assertions.assertEquals(4, manifestationDataset.count());

		Dataset<Topic> topicDataset = spark
			.createDataset(topics.rdd(), Encoders.bean(Topic.class));
		Assertions.assertEquals(0, topicDataset.count());
	}

	@Test
	public void testEmitFromResultComplete() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graph_complete_entities/")
			.getPath();

		EmitFromResults
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath", sourcePath,
					"-outputPath", workingDir.toString() + "/result/",
					"-workingDir", workingDir.toString() + "/"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Persons> persons = sc
			.textFile(workingDir.toString() + "/result/Persons")
			.map(item -> OBJECT_MAPPER.readValue(item, Persons.class));

		org.apache.spark.sql.Dataset<Persons> personsDataset = spark
			.createDataset(persons.rdd(), Encoders.bean(Persons.class));
		personsDataset.foreach((ForeachFunction<Persons>) p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p)));

		JavaRDD<Topic> topics = sc
			.textFile(workingDir.toString() + "/result/Topic")
			.map(item -> OBJECT_MAPPER.readValue(item, Topic.class));

		Dataset<Topic> topicDataset = spark
			.createDataset(topics.rdd(), Encoders.bean(Topic.class));
		Assertions.assertEquals(3, topicDataset.count());
		topicDataset.foreach((ForeachFunction<Topic>) t -> System.out.println(OBJECT_MAPPER.writeValueAsString(t)));

		JavaRDD<EmitPerManifestation> manifestation = sc
			.textFile(workingDir.toString() + "/publication/manifestation")
			.map(item -> OBJECT_MAPPER.readValue(item, EmitPerManifestation.class));

		Dataset<EmitPerManifestation> manifestationDataset = spark
			.createDataset(manifestation.rdd(), Encoders.bean(EmitPerManifestation.class));
		manifestationDataset.show(false);
		// Persons claudiaBorer = personsDataset.filter((FilterFunction<Persons>) p -> p.getLocal_identifier().equalsIgnoreCase("tmp_person__::2c1eea261f7d9a97ab7ca8c4200781db"))

@@ -194,6 +229,5 @@ public class EmitFromResultJobTest {
		// Assertions.assertEquals(4, manifestationDataset.count());
		//
	}
}