Compare commits
8 Commits
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | 7b715b2bb8 | |
Miriam Baglioni | 752fd896e4 | |
Miriam Baglioni | 0c887ca015 | |
Miriam Baglioni | ebde629d49 | |
Miriam Baglioni | e2b9989199 | |
Miriam Baglioni | c3be9a7b14 | |
Miriam Baglioni | 9a8a9ac7df | |
Miriam Baglioni | b1b48a90dc |
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 04/09/23
|
||||
*/
|
||||
public enum AccessRight {
|
||||
OPEN("open"), CLOSED("closed"), EMBARGO("embargo"), RESTRICTED("restricted"), UNAVAILABLE("unavailable");
|
||||
|
||||
public final String label;
|
||||
|
||||
private AccessRight(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 05/09/23
|
||||
*/
|
||||
public class Affiliation implements Serializable {
|
||||
private String organization;
|
||||
@JsonProperty("start_date")
|
||||
private String start_date;
|
||||
@JsonProperty("end_date")
|
||||
private String end_date;
|
||||
|
||||
public String getOrganization() {
|
||||
return organization;
|
||||
}
|
||||
|
||||
public void setOrganization(String organization) {
|
||||
this.organization = organization;
|
||||
}
|
||||
|
||||
public String getStart_date() {
|
||||
return start_date;
|
||||
}
|
||||
|
||||
public void setStart_date(String start_date) {
|
||||
this.start_date = start_date;
|
||||
}
|
||||
|
||||
public String getEnd_date() {
|
||||
return end_date;
|
||||
}
|
||||
|
||||
public void setEnd_date(String end_date) {
|
||||
this.end_date = end_date;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Biblio implements Serializable {
|
||||
private String issue;
|
||||
@JsonProperty("start_page")
|
||||
private String start_page;
|
||||
@JsonProperty("end_page")
|
||||
private String end_page;
|
||||
private String volume;
|
||||
private String edition;
|
||||
private String number;
|
||||
private String publisher;
|
||||
private String series;
|
||||
|
||||
public String getIssue() {
|
||||
return issue;
|
||||
}
|
||||
|
||||
public void setIssue(String issue) {
|
||||
this.issue = issue;
|
||||
}
|
||||
|
||||
public String getStart_page() {
|
||||
return start_page;
|
||||
}
|
||||
|
||||
public void setStart_page(String start_page) {
|
||||
this.start_page = start_page;
|
||||
}
|
||||
|
||||
public String getEnd_page() {
|
||||
return end_page;
|
||||
}
|
||||
|
||||
public void setEnd_page(String end_page) {
|
||||
this.end_page = end_page;
|
||||
}
|
||||
|
||||
public String getVolume() {
|
||||
return volume;
|
||||
}
|
||||
|
||||
public void setVolume(String volume) {
|
||||
this.volume = volume;
|
||||
}
|
||||
|
||||
public String getEdition() {
|
||||
return edition;
|
||||
}
|
||||
|
||||
public void setEdition(String edition) {
|
||||
this.edition = edition;
|
||||
}
|
||||
|
||||
public String getNumber() {
|
||||
return number;
|
||||
}
|
||||
|
||||
public void setNumber(String number) {
|
||||
this.number = number;
|
||||
}
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(String publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getSeries() {
|
||||
return series;
|
||||
}
|
||||
|
||||
public void setSeries(String series) {
|
||||
this.series = series;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Contribution implements Serializable {
|
||||
private String person;
|
||||
@JsonProperty("declared_affiliations")
|
||||
private List<String> declared_affiliation;
|
||||
private List<String> roles;
|
||||
private Integer rank;
|
||||
|
||||
public String getPerson() {
|
||||
return person;
|
||||
}
|
||||
|
||||
public void setPerson(String person) {
|
||||
this.person = person;
|
||||
}
|
||||
|
||||
public List<String> getDeclared_affiliation() {
|
||||
return declared_affiliation;
|
||||
}
|
||||
|
||||
public void setDeclared_affiliation(List<String> declared_affiliation) {
|
||||
this.declared_affiliation = declared_affiliation;
|
||||
}
|
||||
|
||||
public List<String> getRoles() {
|
||||
return roles;
|
||||
}
|
||||
|
||||
public void setRoles(List<String> roles) {
|
||||
this.roles = roles;
|
||||
}
|
||||
|
||||
public Integer getRank() {
|
||||
return rank;
|
||||
}
|
||||
|
||||
public void setRank(Integer rank) {
|
||||
this.rank = rank;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 22/02/24
|
||||
*/
|
||||
public class Contributor implements Serializable {
|
||||
private String person; // I would not map it because we have only information regarding the person (if any)
|
||||
// associated to the leading organization
|
||||
private String organization; // contributors.person
|
||||
|
||||
private String role;// private
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class Datasource implements Serializable {
|
||||
private String local_identifier;// id
|
||||
private List<Identifier> identifiers; // .schema pid.qualifier.classid;identifiers.value pid.value
|
||||
private String name; // officialname.value
|
||||
private String submission_policy_url;// submissionpolicyurl
|
||||
private String preservation_policy_url;// preservationpolicyurl
|
||||
private Boolean version_control;// versioncontrol bool
|
||||
private List<PersistentIdentitySystems> persistent_identity_systems;// . product_type researchentitytype list type
|
||||
// to be remapped to the eosc types
|
||||
// persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
|
||||
private String jurisdiction;// jurisdiction.classname
|
||||
private String data_source_classification;// eoscdatasourcetype.classname
|
||||
private List<String> research_product_type;// researchentitytype list type to be remapped to the eosc types
|
||||
private Boolean thematic;// thematic bool
|
||||
private List<Licence> research_product_license; // .name not mappable listresearch_product_license.url not mappable
|
||||
private List<String> research_product_access_policy;// "databaseaccesstype if open => open access
|
||||
// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
|
||||
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
|
||||
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
|
||||
private List<Licence> research_product_metadata_license; // .name not mappable list
|
||||
// research_product_metadata_license.url not mappable
|
||||
private List<String> research_product_metadata_access_policy;// researchproductmetadataccesspolicies list with the
|
||||
// same mapping of research_product_access_policy
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getSubmission_policy_url() {
|
||||
return submission_policy_url;
|
||||
}
|
||||
|
||||
public void setSubmission_policy_url(String submission_policy_url) {
|
||||
this.submission_policy_url = submission_policy_url;
|
||||
}
|
||||
|
||||
public String getPreservation_policy_url() {
|
||||
return preservation_policy_url;
|
||||
}
|
||||
|
||||
public void setPreservation_policy_url(String preservation_policy_url) {
|
||||
this.preservation_policy_url = preservation_policy_url;
|
||||
}
|
||||
|
||||
public Boolean getVersion_control() {
|
||||
return version_control;
|
||||
}
|
||||
|
||||
public void setVersion_control(Boolean version_control) {
|
||||
this.version_control = version_control;
|
||||
}
|
||||
|
||||
public List<PersistentIdentitySystems> getPersistent_identity_systems() {
|
||||
return persistent_identity_systems;
|
||||
}
|
||||
|
||||
public void setPersistent_identity_systems(List<PersistentIdentitySystems> persistent_identity_systems) {
|
||||
this.persistent_identity_systems = persistent_identity_systems;
|
||||
}
|
||||
|
||||
public String getJurisdiction() {
|
||||
return jurisdiction;
|
||||
}
|
||||
|
||||
public void setJurisdiction(String jurisdiction) {
|
||||
this.jurisdiction = jurisdiction;
|
||||
}
|
||||
|
||||
public String getData_source_classification() {
|
||||
return data_source_classification;
|
||||
}
|
||||
|
||||
public void setData_source_classification(String data_source_classification) {
|
||||
this.data_source_classification = data_source_classification;
|
||||
}
|
||||
|
||||
public List<String> getResearch_product_type() {
|
||||
return research_product_type;
|
||||
}
|
||||
|
||||
public void setResearch_product_type(List<String> research_product_type) {
|
||||
this.research_product_type = research_product_type;
|
||||
}
|
||||
|
||||
public Boolean getThematic() {
|
||||
return thematic;
|
||||
}
|
||||
|
||||
public void setThematic(Boolean thematic) {
|
||||
this.thematic = thematic;
|
||||
}
|
||||
|
||||
public List<Licence> getResearch_product_license() {
|
||||
return research_product_license;
|
||||
}
|
||||
|
||||
public void setResearch_product_license(List<Licence> research_product_license) {
|
||||
this.research_product_license = research_product_license;
|
||||
}
|
||||
|
||||
public List<String> getResearch_product_access_policy() {
|
||||
return research_product_access_policy;
|
||||
}
|
||||
|
||||
public void setResearch_product_access_policy(List<String> research_product_access_policy) {
|
||||
this.research_product_access_policy = research_product_access_policy;
|
||||
}
|
||||
|
||||
public List<Licence> getResearch_product_metadata_license() {
|
||||
return research_product_metadata_license;
|
||||
}
|
||||
|
||||
public void setResearch_product_metadata_license(List<Licence> research_product_metadata_license) {
|
||||
this.research_product_metadata_license = research_product_metadata_license;
|
||||
}
|
||||
|
||||
public List<String> getResearch_product_metadata_access_policy() {
|
||||
return research_product_metadata_access_policy;
|
||||
}
|
||||
|
||||
public void setResearch_product_metadata_access_policy(List<String> research_product_metadata_access_policy) {
|
||||
this.research_product_metadata_access_policy = research_product_metadata_access_policy;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Dates implements Serializable {
|
||||
private String value;
|
||||
private String type;
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public static Dates newInstance(String value, String type) {
|
||||
Dates d = new Dates();
|
||||
d.value = value;
|
||||
d.type = type;
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import org.codehaus.jackson.annotate.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 22/02/24
|
||||
*/
|
||||
public class Grant implements Serializable {
|
||||
private String local_identifier;// id
|
||||
private List<Identifier> identifiers;// .schema pid.qualifier.classid identifiers.value pid.value
|
||||
// identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
|
||||
// identifiers.value project.code
|
||||
|
||||
private String title;// title.value
|
||||
@JsonProperty(value = "abstract")
|
||||
private String summary;// summary.value
|
||||
private String acronym; // acronym.value
|
||||
private String funder;// fundingtree to be used the xpath //funder/name
|
||||
private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
|
||||
private String currency;// currency.value
|
||||
private Float funded_amount;// ' fundedamount.value
|
||||
private List<String> keywords;// subject.value
|
||||
private String start_date;// startdate.value
|
||||
private String end_date;// enddate.value
|
||||
private String website;// websiteurl.value
|
||||
private List<String> beneficiaries;// organization.id for the organizations in the relation with semantic class
|
||||
// isParticipant produces the list of organization internal identifiers
|
||||
private List<Contributor> contributors;//
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getSummary() {
|
||||
return summary;
|
||||
}
|
||||
|
||||
public void setSummary(String summary) {
|
||||
this.summary = summary;
|
||||
}
|
||||
|
||||
public String getAcronym() {
|
||||
return acronym;
|
||||
}
|
||||
|
||||
public void setAcronym(String acronym) {
|
||||
this.acronym = acronym;
|
||||
}
|
||||
|
||||
public String getFunder() {
|
||||
return funder;
|
||||
}
|
||||
|
||||
public void setFunder(String funder) {
|
||||
this.funder = funder;
|
||||
}
|
||||
|
||||
public String getFunding_stream() {
|
||||
return funding_stream;
|
||||
}
|
||||
|
||||
public void setFunding_stream(String funding_stream) {
|
||||
this.funding_stream = funding_stream;
|
||||
}
|
||||
|
||||
public String getCurrency() {
|
||||
return currency;
|
||||
}
|
||||
|
||||
public void setCurrency(String currency) {
|
||||
this.currency = currency;
|
||||
}
|
||||
|
||||
public Float getFunded_amount() {
|
||||
return funded_amount;
|
||||
}
|
||||
|
||||
public void setFunded_amount(Float funded_amount) {
|
||||
this.funded_amount = funded_amount;
|
||||
}
|
||||
|
||||
public List<String> getKeywords() {
|
||||
return keywords;
|
||||
}
|
||||
|
||||
public void setKeywords(List<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
public String getStart_date() {
|
||||
return start_date;
|
||||
}
|
||||
|
||||
public void setStart_date(String start_date) {
|
||||
this.start_date = start_date;
|
||||
}
|
||||
|
||||
public String getEnd_date() {
|
||||
return end_date;
|
||||
}
|
||||
|
||||
public void setEnd_date(String end_date) {
|
||||
this.end_date = end_date;
|
||||
}
|
||||
|
||||
public String getWebsite() {
|
||||
return website;
|
||||
}
|
||||
|
||||
public void setWebsite(String website) {
|
||||
this.website = website;
|
||||
}
|
||||
|
||||
public List<String> getBeneficiaries() {
|
||||
return beneficiaries;
|
||||
}
|
||||
|
||||
public void setBeneficiaries(List<String> beneficiaries) {
|
||||
this.beneficiaries = beneficiaries;
|
||||
}
|
||||
|
||||
public List<Contributor> getContributors() {
|
||||
return contributors;
|
||||
}
|
||||
|
||||
public void setContributors(List<Contributor> contributors) {
|
||||
this.contributors = contributors;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Identifier implements Serializable {
|
||||
private String scheme;
|
||||
private String value;
|
||||
|
||||
public String getScheme() {
|
||||
return scheme;
|
||||
}
|
||||
|
||||
public void setScheme(String scheme) {
|
||||
this.scheme = scheme;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public static Identifier newInstance(String scheme, String value) {
|
||||
Identifier i = new Identifier();
|
||||
i.value = value;
|
||||
i.scheme = scheme;
|
||||
return i;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class Licence implements Serializable {
|
||||
}
|
|
@ -0,0 +1,138 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Manifestation implements Serializable {
|
||||
@JsonProperty("product_local_type")
|
||||
private String product_local_type;
|
||||
@JsonProperty("product_local_type_schema")
|
||||
private String product_local_type_schema;
|
||||
private List<Dates> dates;
|
||||
@JsonProperty("peer_review")
|
||||
private String peer_review;
|
||||
@JsonProperty("metadata_curation")
|
||||
private String metadata_curation;
|
||||
private String url;
|
||||
private String pid;
|
||||
@JsonProperty("access_right")
|
||||
private String access_right;
|
||||
private String licence;
|
||||
@JsonProperty("licance_schema")
|
||||
private String licence_schema;
|
||||
private Biblio biblio;
|
||||
private String venue;
|
||||
@JsonProperty("hosting_datasource")
|
||||
private String hosting_datasource;
|
||||
|
||||
public String getProduct_local_type() {
|
||||
return product_local_type;
|
||||
}
|
||||
|
||||
public void setProduct_local_type(String product_local_type) {
|
||||
this.product_local_type = product_local_type;
|
||||
}
|
||||
|
||||
public String getProduct_local_type_schema() {
|
||||
return product_local_type_schema;
|
||||
}
|
||||
|
||||
public void setProduct_local_type_schema(String product_local_type_schema) {
|
||||
this.product_local_type_schema = product_local_type_schema;
|
||||
}
|
||||
|
||||
public List<Dates> getDates() {
|
||||
return dates;
|
||||
}
|
||||
|
||||
public void setDates(List<Dates> dates) {
|
||||
this.dates = dates;
|
||||
}
|
||||
|
||||
public String getPeer_review() {
|
||||
return peer_review;
|
||||
}
|
||||
|
||||
public void setPeer_review(String peer_review) {
|
||||
this.peer_review = peer_review;
|
||||
}
|
||||
|
||||
public String getMetadata_curation() {
|
||||
return metadata_curation;
|
||||
}
|
||||
|
||||
public void setMetadata_curation(String metadata_curation) {
|
||||
this.metadata_curation = metadata_curation;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
public String getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(String pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public String getAccess_right() {
|
||||
return access_right;
|
||||
}
|
||||
|
||||
public void setAccess_right(String access_right) {
|
||||
this.access_right = access_right;
|
||||
}
|
||||
|
||||
public String getLicence() {
|
||||
return licence;
|
||||
}
|
||||
|
||||
public void setLicence(String licence) {
|
||||
this.licence = licence;
|
||||
}
|
||||
|
||||
public String getLicence_schema() {
|
||||
return licence_schema;
|
||||
}
|
||||
|
||||
public void setLicence_schema(String licence_schema) {
|
||||
this.licence_schema = licence_schema;
|
||||
}
|
||||
|
||||
public Biblio getBiblio() {
|
||||
return biblio;
|
||||
}
|
||||
|
||||
public void setBiblio(Biblio biblio) {
|
||||
this.biblio = biblio;
|
||||
}
|
||||
|
||||
public String getVenue() {
|
||||
return venue;
|
||||
}
|
||||
|
||||
public void setVenue(String venue) {
|
||||
this.venue = venue;
|
||||
}
|
||||
|
||||
public String getHosting_datasource() {
|
||||
return hosting_datasource;
|
||||
}
|
||||
|
||||
public void setHosting_datasource(String hosting_datasource) {
|
||||
this.hosting_datasource = hosting_datasource;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 04/09/23
|
||||
*/
|
||||
public enum MetadataCuration {
|
||||
YES("yes"), NO("no"), UNAVAILABLE("unavailable");
|
||||
|
||||
public final String label;
|
||||
|
||||
private MetadataCuration(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class Organization implements Serializable {
|
||||
private String local_identifier; // id
|
||||
private List<Identifier> identifiers; // pid.qualifier.classid; pid.value list
|
||||
private String name; // legalname.value
|
||||
|
||||
private String short_name; // legalshortname.value
|
||||
private List<String> other_names;// alternative_names.value list
|
||||
private String website;// websiteurl.value
|
||||
private String country; // country.classid
|
||||
private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getShort_name() {
|
||||
return short_name;
|
||||
}
|
||||
|
||||
public void setShort_name(String short_name) {
|
||||
this.short_name = short_name;
|
||||
}
|
||||
|
||||
public List<String> getOther_names() {
|
||||
return other_names;
|
||||
}
|
||||
|
||||
public void setOther_names(List<String> other_names) {
|
||||
this.other_names = other_names;
|
||||
}
|
||||
|
||||
public String getWebsite() {
|
||||
return website;
|
||||
}
|
||||
|
||||
public void setWebsite(String website) {
|
||||
this.website = website;
|
||||
}
|
||||
|
||||
public String getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public void setCountry(String country) {
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
public enum OrganizationTypes {
|
||||
ARCHIVE("archive"),
|
||||
|
||||
COMPANY("company"),
|
||||
|
||||
EDUCATION("education"), FACILITY("facility"), GOVERNMENT("government"), HEALTHCARE("healthcare"), NONPROFIT(
|
||||
"nonprofit"), FUNDER("funder"), OTHER("other");
|
||||
|
||||
public final String label;
|
||||
|
||||
private OrganizationTypes(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 04/09/23
|
||||
*/
|
||||
public enum PeerReview {
|
||||
PEER_REVIEWED("peer-reviewed"), NON_PEER_REVIEWED("not peer-reviewed"), DOUBLE_BLIND("double-blind"), SINGLE_BLIND(
|
||||
"single-blind"), UNAVAILABLE("unavailable"), OPEN("open peer review");
|
||||
|
||||
public final String label;
|
||||
|
||||
private PeerReview(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class PersistentIdentitySystems implements Serializable {
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import org.codehaus.jackson.annotate.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 05/09/23
|
||||
*/
|
||||
public class Persons implements Serializable {
|
||||
@JsonProperty("local_identifier")
|
||||
private String local_identifier;
|
||||
private List<Identifier> identifiers;
|
||||
@JsonProperty("given_name")
|
||||
private String given_name;
|
||||
@JsonProperty("family_name")
|
||||
private String family_name;
|
||||
private String agent;
|
||||
@JsonProperty("declared_affiliations")
|
||||
private List<Affiliation> declared_affiliations;
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getGiven_name() {
|
||||
return given_name;
|
||||
}
|
||||
|
||||
public void setGiven_name(String given_name) {
|
||||
this.given_name = given_name;
|
||||
}
|
||||
|
||||
public String getFamily_name() {
|
||||
return family_name;
|
||||
}
|
||||
|
||||
public void setFamily_name(String family_name) {
|
||||
this.family_name = family_name;
|
||||
}
|
||||
|
||||
public String getAgent() {
|
||||
return agent;
|
||||
}
|
||||
|
||||
public void setAgent(String agent) {
|
||||
this.agent = agent;
|
||||
}
|
||||
|
||||
public List<Affiliation> getDeclared_affiliations() {
|
||||
return declared_affiliations;
|
||||
}
|
||||
|
||||
public void setDeclared_affiliations(List<Affiliation> declared_affiliations) {
|
||||
this.declared_affiliations = declared_affiliations;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public enum Prefixes implements Serializable {
|
||||
RESEARCH_PRODUCT("product_____::"),
|
||||
|
||||
ORGANIZATION("organization::"),
|
||||
|
||||
GRANT("grant_______::"),
|
||||
|
||||
PERSON(
|
||||
"person______::"),
|
||||
|
||||
TEMPORARY_PERSON("temp_person_::"),
|
||||
|
||||
DATASOURCE("datasource__::"), TOPIC("topic_______::"), VENUE("venue_______::");
|
||||
|
||||
public final String label;
|
||||
|
||||
private Prefixes(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Provenance implements Serializable {
|
||||
private String type;
|
||||
private double trust;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public double getTrust() {
|
||||
return trust;
|
||||
}
|
||||
|
||||
public void setTrust(double trust) {
|
||||
this.trust = trust;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 05/09/23
|
||||
*/
|
||||
public enum RelationType implements Serializable {
|
||||
RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
|
||||
"hasAuthorInstitution"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"), SUPPLEMENT(
|
||||
"IsSupplementedBy"), DOCUMENTS(
|
||||
"IsDocumentedBy"), PART("IsPartOf"), VERSION("IsNewVersionOf"), CITATION("Cites");
|
||||
|
||||
public final String label;
|
||||
|
||||
private RelationType(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Relations implements Serializable {
|
||||
@JsonProperty("relation_type")
|
||||
private String relation_type;
|
||||
@JsonProperty("product_list")
|
||||
private List<String> product_list;
|
||||
|
||||
public static Relations newInstance(String relClass, List<String> target) {
|
||||
Relations r = new Relations();
|
||||
r.relation_type = relClass;
|
||||
r.product_list = target;
|
||||
return r;
|
||||
}
|
||||
|
||||
public String getRelation_type() {
|
||||
return relation_type;
|
||||
}
|
||||
|
||||
public void setRelation_type(String relation_type) {
|
||||
this.relation_type = relation_type;
|
||||
}
|
||||
|
||||
public List<String> getProduct_list() {
|
||||
return product_list;
|
||||
}
|
||||
|
||||
public void setProduct_list(List<String> product_list) {
|
||||
this.product_list = product_list;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class ResearchProduct implements Serializable {
|
||||
@JsonProperty("local_identifier")
|
||||
private String local_identifier;
|
||||
private List<Identifier> identifiers;
|
||||
private Map<String, List<String>> titles;
|
||||
private Map<String, List<String>> abstracts;
|
||||
@JsonProperty("product_type")
|
||||
private String product_type;
|
||||
private List<ResultTopic> topics;
|
||||
private List<Contribution> contributions;
|
||||
private List<Manifestation> manifestations;
|
||||
@JsonProperty("relevant_organizations")
|
||||
private List<String> relevant_organizations;
|
||||
private List<String> funding;
|
||||
@JsonProperty("related_products")
|
||||
private List<Relations> related_products;
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getTitles() {
|
||||
return titles;
|
||||
}
|
||||
|
||||
public void setTitles(Map<String, List<String>> titles) {
|
||||
this.titles = titles;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getAbstracts() {
|
||||
return abstracts;
|
||||
}
|
||||
|
||||
public void setAbstracts(Map<String, List<String>> abstracts) {
|
||||
this.abstracts = abstracts;
|
||||
}
|
||||
|
||||
public String getProduct_type() {
|
||||
return product_type;
|
||||
}
|
||||
|
||||
public void setProduct_type(String product_type) {
|
||||
this.product_type = product_type;
|
||||
}
|
||||
|
||||
public List<ResultTopic> getTopics() {
|
||||
return topics;
|
||||
}
|
||||
|
||||
public void setTopics(List<ResultTopic> topics) {
|
||||
this.topics = topics;
|
||||
}
|
||||
|
||||
public List<Contribution> getContributions() {
|
||||
return contributions;
|
||||
}
|
||||
|
||||
public void setContributions(List<Contribution> contributions) {
|
||||
this.contributions = contributions;
|
||||
}
|
||||
|
||||
public List<Manifestation> getManifestations() {
|
||||
return manifestations;
|
||||
}
|
||||
|
||||
public void setManifestations(List<Manifestation> manifestations) {
|
||||
this.manifestations = manifestations;
|
||||
}
|
||||
|
||||
public List<String> getRelevant_organizations() {
|
||||
return relevant_organizations;
|
||||
}
|
||||
|
||||
public void setRelevant_organizations(List<String> relevant_organizations) {
|
||||
this.relevant_organizations = relevant_organizations;
|
||||
}
|
||||
|
||||
public List<String> getFunding() {
|
||||
return funding;
|
||||
}
|
||||
|
||||
public void setFunding(List<String> funding) {
|
||||
this.funding = funding;
|
||||
}
|
||||
|
||||
public List<Relations> getRelated_products() {
|
||||
return related_products;
|
||||
}
|
||||
|
||||
public void setRelated_products(List<Relations> related_products) {
|
||||
this.related_products = related_products;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public enum ResearchTypes {
|
||||
LITERATURE("literature"), RESEARCH_DATA("research data"), RESEARCH_SOFTWARE("research software"), OTHER("other");
|
||||
|
||||
public final String label;
|
||||
|
||||
private ResearchTypes(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class ResultTopic implements Serializable {
|
||||
private String topic;
|
||||
private Provenance provenance;
|
||||
|
||||
public String getTopic() {
|
||||
return topic;
|
||||
}
|
||||
|
||||
public void setTopic(String topic) {
|
||||
this.topic = topic;
|
||||
}
|
||||
|
||||
public Provenance getProvenance() {
|
||||
return provenance;
|
||||
}
|
||||
|
||||
public void setProvenance(Provenance provenance) {
|
||||
this.provenance = provenance;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Topic implements Serializable {
|
||||
private String local_identifier;
|
||||
private List<Identifier> identifiers;
|
||||
private String name;
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 27/02/24
|
||||
*/
|
||||
public class Venue implements Serializable {
|
||||
private String local_identifier;
|
||||
private List<Identifier> identifiers;
|
||||
private String name;
|
||||
private String acronym;
|
||||
private String type;
|
||||
private String publisher;
|
||||
private String series;
|
||||
private Boolean is_currently_full_oa;
|
||||
|
||||
private String creation_date;
|
||||
private List<VenueContribution> contributions;
|
||||
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getAcronym() {
|
||||
return acronym;
|
||||
}
|
||||
|
||||
public void setAcronym(String acronym) {
|
||||
this.acronym = acronym;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(String publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getSeries() {
|
||||
return series;
|
||||
}
|
||||
|
||||
public void setSeries(String series) {
|
||||
this.series = series;
|
||||
}
|
||||
|
||||
public Boolean getIs_currently_full_oa() {
|
||||
return is_currently_full_oa;
|
||||
}
|
||||
|
||||
public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
|
||||
this.is_currently_full_oa = is_currently_full_oa;
|
||||
}
|
||||
|
||||
public String getCreation_date() {
|
||||
return creation_date;
|
||||
}
|
||||
|
||||
public void setCreation_date(String creation_date) {
|
||||
this.creation_date = creation_date;
|
||||
}
|
||||
|
||||
public List<VenueContribution> getContributions() {
|
||||
return contributions;
|
||||
}
|
||||
|
||||
public void setContributions(List<VenueContribution> contributions) {
|
||||
this.contributions = contributions;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 27/02/24
|
||||
*/
|
||||
public class VenueContribution implements Serializable {
|
||||
private String person;
|
||||
private List<String> roles;
|
||||
|
||||
public String getPerson() {
|
||||
|
||||
return person;
|
||||
}
|
||||
|
||||
public void setPerson(String person) {
|
||||
this.person = person;
|
||||
}
|
||||
|
||||
public List<String> getRoles() {
|
||||
return roles;
|
||||
}
|
||||
|
||||
public void setRoles(List<String> roles) {
|
||||
this.roles = roles;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public enum VenueIdentifierType implements Serializable {
|
||||
|
||||
EISSN("eissn"), ISSN("issn"), LISSN("lissn"), ISBN("isbn"), OPENDOAR(
|
||||
"opendoar"), R3DATA("re3data.org"), FAIRSHARING("fairsharing");
|
||||
|
||||
public final String label;
|
||||
|
||||
private VenueIdentifierType(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public enum VenueType implements Serializable {
|
||||
|
||||
REPOSITORY("repository"), JOURNAL("journal"), CONFERENCE("conference"), BOOK("book"), OTHER(
|
||||
"other"), UNKNOWN("unknown");
|
||||
|
||||
public final String label;
|
||||
|
||||
private VenueType(String label) {
|
||||
this.label = label;
|
||||
}
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.common;
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
|
@ -15,7 +15,7 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.common.MakeTarArchive;
|
||||
import eu.dnetlib.dhp.common.MakeTarArchive;
|
||||
|
||||
public class MakeTar implements Serializable {
|
||||
|
||||
|
|
|
@ -1,793 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump;
|
||||
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Constants.*;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
|
||||
import eu.dnetlib.dhp.oa.model.*;
|
||||
import eu.dnetlib.dhp.oa.model.AccessRight;
|
||||
import eu.dnetlib.dhp.oa.model.Author;
|
||||
import eu.dnetlib.dhp.oa.model.GeoLocation;
|
||||
import eu.dnetlib.dhp.oa.model.Instance;
|
||||
import eu.dnetlib.dhp.oa.model.OpenAccessColor;
|
||||
import eu.dnetlib.dhp.oa.model.OpenAccessRoute;
|
||||
import eu.dnetlib.dhp.oa.model.Result;
|
||||
import eu.dnetlib.dhp.oa.model.Subject;
|
||||
import eu.dnetlib.dhp.oa.model.community.CfHbKeyValue;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityInstance;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.oa.model.community.Context;
|
||||
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class ResultMapper implements Serializable {
|
||||
private static final String NULL = "null";
|
||||
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in, Map<String, String> communityMap, String dumpType)
|
||||
throws NoAvailableEntityTypeException, CardinalityTooHighException {
|
||||
|
||||
Result out;
|
||||
if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
|
||||
out = new GraphResult();
|
||||
} else {
|
||||
out = new CommunityResult();
|
||||
}
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
try {
|
||||
|
||||
addTypeSpecificInformation(out, input, ort.get());
|
||||
mapAuthor(out, input);
|
||||
mapAccessRight(out, input);
|
||||
mapContributor(out, input);
|
||||
mapCountry(out, input);
|
||||
mapCoverage(out, input);
|
||||
out.setDateofcollection(input.getDateofcollection());
|
||||
out.setGreen(input.getIsGreen());
|
||||
out.setInDiamondJournal(input.getIsInDiamondJournal());
|
||||
out.setPubliclyFunded(input.getPubliclyFunded());
|
||||
mapOpenAccessColor(out, input);
|
||||
mapDescription(out, input);
|
||||
mapEmbargo(out, input);
|
||||
mapFormat(out, input);
|
||||
out.setId(getEntityId(input.getId(), ENTITY_ID_SEPARATOR));
|
||||
mapOriginalId(out, input);
|
||||
mapInstance(dumpType, out, input);
|
||||
mapLanguage(out, input);
|
||||
mapLastUpdateTimestamp(out, input);
|
||||
mapTitle(out, input);
|
||||
mapPid(out, input);
|
||||
mapDateOfAcceptance(out, input);
|
||||
mapPublisher(out, input);
|
||||
mapSource(out, input);
|
||||
mapSubject(out, input);
|
||||
out.setType(input.getResulttype().getClassid());
|
||||
mapMeasure(out, input);
|
||||
if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
|
||||
mapCollectedfrom((CommunityResult) out, input);
|
||||
mapContext(communityMap, (CommunityResult) out, input);
|
||||
}
|
||||
} catch (ClassCastException cce) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
|
||||
}
|
||||
|
||||
private static void mapOpenAccessColor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
if (Optional.ofNullable(input.getOpenAccessColor()).isPresent())
|
||||
switch (input.getOpenAccessColor()) {
|
||||
case bronze:
|
||||
out.setOpenAccessColor(OpenAccessColor.bronze);
|
||||
break;
|
||||
case gold:
|
||||
out.setOpenAccessColor(OpenAccessColor.gold);
|
||||
break;
|
||||
case hybrid:
|
||||
out.setOpenAccessColor(OpenAccessColor.hybrid);
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void mapContext(Map<String, String> communityMap, CommunityResult out,
|
||||
eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Set<String> communities = communityMap.keySet();
|
||||
List<Context> contextList = Optional
|
||||
.ofNullable(
|
||||
input
|
||||
.getContext())
|
||||
.map(
|
||||
value -> value
|
||||
.stream()
|
||||
.map(c -> {
|
||||
String communityId = c.getId();
|
||||
if (communityId.contains("::")) {
|
||||
communityId = communityId.substring(0, communityId.indexOf("::"));
|
||||
}
|
||||
if (communities.contains(communityId)) {
|
||||
Context context = new Context();
|
||||
context.setCode(communityId);
|
||||
context.setLabel(communityMap.get(communityId));
|
||||
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
|
||||
if (dataInfo.isPresent()) {
|
||||
List<Provenance> provenance = new ArrayList<>();
|
||||
provenance
|
||||
.addAll(
|
||||
dataInfo
|
||||
.get()
|
||||
.stream()
|
||||
.map(
|
||||
di -> Optional
|
||||
.ofNullable(di.getProvenanceaction())
|
||||
.map(
|
||||
provenanceaction -> Provenance
|
||||
.newInstance(
|
||||
provenanceaction.getClassname(),
|
||||
di.getTrust()))
|
||||
.orElse(null))
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
try {
|
||||
context.setProvenance(getUniqueProvenance(provenance));
|
||||
} catch (NoAvailableEntityTypeException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return context;
|
||||
}
|
||||
return null;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(new ArrayList<>());
|
||||
|
||||
if (!contextList.isEmpty()) {
|
||||
Set<Integer> hashValue = new HashSet<>();
|
||||
List<Context> remainigContext = new ArrayList<>();
|
||||
contextList.forEach(c -> {
|
||||
if (!hashValue.contains(c.hashCode())) {
|
||||
remainigContext.add(c);
|
||||
hashValue.add(c.hashCode());
|
||||
}
|
||||
});
|
||||
out.setContext(remainigContext);
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapCollectedfrom(CommunityResult out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
out
|
||||
.setCollectedfrom(
|
||||
input
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(cf -> CfHbKeyValue.newInstance(getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), cf.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
private static void mapMeasure(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
if (Optional.ofNullable(input.getMeasures()).isPresent() && input.getMeasures().size() > 0) {
|
||||
|
||||
out.setIndicators(Utils.getIndicator(input.getMeasures()));
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapSubject(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
List<Subject> subjectList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSubject())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.stream()
|
||||
// .filter(
|
||||
// s -> !((s.getQualifier().getClassid().equalsIgnoreCase("fos") &&
|
||||
// Optional.ofNullable(s.getDataInfo()).isPresent()
|
||||
// && Optional.ofNullable(s.getDataInfo().getProvenanceaction()).isPresent() &&
|
||||
// s.getDataInfo().getProvenanceaction().getClassid().equalsIgnoreCase("subject:fos"))
|
||||
// ||
|
||||
// (s.getQualifier().getClassid().equalsIgnoreCase("sdg") &&
|
||||
// Optional.ofNullable(s.getDataInfo()).isPresent()
|
||||
// && Optional.ofNullable(s.getDataInfo().getProvenanceaction()).isPresent() &&
|
||||
// s
|
||||
// .getDataInfo()
|
||||
// .getProvenanceaction()
|
||||
// .getClassid()
|
||||
// .equalsIgnoreCase("subject:sdg"))))
|
||||
.filter(s -> !s.getValue().equalsIgnoreCase(NULL))
|
||||
.forEach(s -> subjectList.add(getSubject(s))));
|
||||
|
||||
out.setSubjects(subjectList);
|
||||
}
|
||||
|
||||
private static void mapSource(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional
|
||||
.ofNullable(input.getSource())
|
||||
.ifPresent(
|
||||
value -> out.setSource(value.stream().map(Field::getValue).collect(Collectors.toList())));
|
||||
}
|
||||
|
||||
private static void mapPublisher(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<Field<String>> oStr;
|
||||
oStr = Optional.ofNullable(input.getPublisher());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublisher(oStr.get().getValue());
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapDateOfAcceptance(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<Field<String>> oStr;
|
||||
oStr = Optional.ofNullable(input.getDateofacceptance());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublicationdate(oStr.get().getValue());
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapPid(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setPid(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
p -> ResultPid
|
||||
.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
}
|
||||
|
||||
private static void mapTitle(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setMaintitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setSubtitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapLastUpdateTimestamp(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
|
||||
if (oLong.isPresent()) {
|
||||
out.setLastupdatetimestamp(oLong.get());
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapLanguage(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<Qualifier> oL = Optional.ofNullable(input.getLanguage());
|
||||
if (oL.isPresent()) {
|
||||
Qualifier language = oL.get();
|
||||
out.setLanguage(Language.newInstance(language.getClassid(), language.getClassname()));
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapInstance(String dumpType, Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
|
||||
.ofNullable(input.getInstance());
|
||||
|
||||
if (oInst.isPresent()) {
|
||||
if (DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
|
||||
((GraphResult) out)
|
||||
.setInstance(
|
||||
oInst.get().stream().map(ResultMapper::getGraphInstance).collect(Collectors.toList()));
|
||||
} else {
|
||||
((CommunityResult) out)
|
||||
.setInstance(
|
||||
oInst
|
||||
.get()
|
||||
.stream()
|
||||
.map(ResultMapper::getCommunityInstance)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapOriginalId(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
out.setOriginalId(new ArrayList<>());
|
||||
Optional
|
||||
.ofNullable(input.getOriginalId())
|
||||
.ifPresent(
|
||||
v -> out
|
||||
.setOriginalId(
|
||||
input
|
||||
.getOriginalId()
|
||||
.stream()
|
||||
.filter(s -> !s.startsWith("50|"))
|
||||
.collect(Collectors.toList())));
|
||||
}
|
||||
|
||||
private static void mapFormat(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
final List<String> formatList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getFormat())
|
||||
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
|
||||
out.setFormat(formatList);
|
||||
}
|
||||
|
||||
private static void mapEmbargo(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
|
||||
if (oStr.isPresent()) {
|
||||
out.setEmbargoenddate(oStr.get().getValue());
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapDescription(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setDescription(descriptionList);
|
||||
}
|
||||
|
||||
private static void mapCoverage(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
final List<String> coverageList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getCoverage())
|
||||
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
|
||||
out.setCoverage(coverageList);
|
||||
}
|
||||
|
||||
private static void mapCountry(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional
|
||||
.ofNullable(input.getCountry())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setCountry(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
c -> {
|
||||
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
|
||||
return null;
|
||||
}
|
||||
ResultCountry country = new ResultCountry();
|
||||
country.setCode(c.getClassid());
|
||||
country.setLabel(c.getClassname());
|
||||
Optional
|
||||
.ofNullable(c.getDataInfo())
|
||||
.ifPresent(
|
||||
provenance -> country
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
provenance
|
||||
.getProvenanceaction()
|
||||
.getClassname(),
|
||||
c.getDataInfo().getTrust())));
|
||||
return country;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList())));
|
||||
}
|
||||
|
||||
private static void mapContributor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
final List<String> contributorList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getContributor())
|
||||
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
|
||||
out.setContributor(contributorList);
|
||||
}
|
||||
|
||||
private static void mapAccessRight(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
// I do not map Access Right UNKNOWN or OTHER
|
||||
|
||||
Optional<Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
|
||||
if (oar.isPresent() && Constants.ACCESS_RIGHTS_COAR_MAP.containsKey(oar.get().getClassid())) {
|
||||
String code = Constants.ACCESS_RIGHTS_COAR_MAP.get(oar.get().getClassid());
|
||||
out
|
||||
.setBestaccessright(
|
||||
|
||||
BestAccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.COAR_CODE_LABEL_MAP.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapAuthor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
|
||||
Optional
|
||||
.ofNullable(input.getAuthor())
|
||||
.ifPresent(
|
||||
ats -> out.setAuthor(ats.stream().map(ResultMapper::getAuthor).collect(Collectors.toList())));
|
||||
}
|
||||
|
||||
private static void addTypeSpecificInformation(Result out, eu.dnetlib.dhp.schema.oaf.Result input,
|
||||
eu.dnetlib.dhp.schema.oaf.Qualifier ort) throws NoAvailableEntityTypeException {
|
||||
switch (ort.getClassid()) {
|
||||
case "publication":
|
||||
Optional<Journal> journal = Optional
|
||||
.ofNullable(((Publication) input).getJournal());
|
||||
if (journal.isPresent()) {
|
||||
Journal j = journal.get();
|
||||
Container c = new Container();
|
||||
c.setConferencedate(j.getConferencedate());
|
||||
c.setConferenceplace(j.getConferenceplace());
|
||||
c.setEdition(j.getEdition());
|
||||
c.setEp(j.getEp());
|
||||
c.setIss(j.getIss());
|
||||
c.setIssnLinking(j.getIssnLinking());
|
||||
c.setIssnOnline(j.getIssnOnline());
|
||||
c.setIssnPrinted(j.getIssnPrinted());
|
||||
c.setName(j.getName());
|
||||
c.setSp(j.getSp());
|
||||
c.setVol(j.getVol());
|
||||
out.setContainer(c);
|
||||
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
|
||||
}
|
||||
break;
|
||||
case "dataset":
|
||||
Dataset id = (Dataset) input;
|
||||
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
|
||||
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
|
||||
|
||||
out
|
||||
.setGeolocation(
|
||||
Optional
|
||||
.ofNullable(id.getGeolocation())
|
||||
.map(
|
||||
igl -> igl
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(gli -> {
|
||||
GeoLocation gl = new GeoLocation();
|
||||
gl.setBox(gli.getBox());
|
||||
gl.setPlace(gli.getPlace());
|
||||
gl.setPoint(gli.getPoint());
|
||||
return gl;
|
||||
})
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "software":
|
||||
|
||||
Software is = (Software) input;
|
||||
Optional
|
||||
.ofNullable(is.getCodeRepositoryUrl())
|
||||
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(is.getDocumentationUrl())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setDocumentationUrl(
|
||||
value
|
||||
.stream()
|
||||
.map(Field::getValue)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(is.getProgrammingLanguage())
|
||||
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
|
||||
|
||||
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "other":
|
||||
|
||||
OtherResearchProduct ir = (OtherResearchProduct) input;
|
||||
out
|
||||
.setContactgroup(
|
||||
Optional
|
||||
.ofNullable(ir.getContactgroup())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out
|
||||
.setContactperson(
|
||||
Optional
|
||||
.ofNullable(ir.getContactperson())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setTool(
|
||||
Optional
|
||||
.ofNullable(ir.getTool())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
|
||||
|
||||
break;
|
||||
default:
|
||||
throw new NoAvailableEntityTypeException();
|
||||
}
|
||||
}
|
||||
|
||||
private static Instance getGraphInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
Instance instance = new Instance();
|
||||
|
||||
setCommonValue(i, instance);
|
||||
|
||||
return instance;
|
||||
|
||||
}
|
||||
|
||||
private static CommunityInstance getCommunityInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
CommunityInstance instance = new CommunityInstance();
|
||||
|
||||
setCommonValue(i, instance);
|
||||
|
||||
if (Optional.ofNullable(i.getCollectedfrom()).isPresent() &&
|
||||
Optional.ofNullable(i.getCollectedfrom().getKey()).isPresent() &&
|
||||
StringUtils.isNotBlank(i.getCollectedfrom().getKey()))
|
||||
instance
|
||||
.setCollectedfrom(
|
||||
CfHbKeyValue
|
||||
.newInstance(
|
||||
getEntityId(i.getCollectedfrom().getKey(), ENTITY_ID_SEPARATOR),
|
||||
i.getCollectedfrom().getValue()));
|
||||
|
||||
if (Optional.ofNullable(i.getHostedby()).isPresent() &&
|
||||
Optional.ofNullable(i.getHostedby().getKey()).isPresent() &&
|
||||
StringUtils.isNotBlank(i.getHostedby().getKey()))
|
||||
instance
|
||||
.setHostedby(
|
||||
CfHbKeyValue
|
||||
.newInstance(
|
||||
getEntityId(i.getHostedby().getKey(), ENTITY_ID_SEPARATOR), i.getHostedby().getValue()));
|
||||
|
||||
return instance;
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.AccessRight> opAr = Optional.ofNullable(i.getAccessright());
|
||||
|
||||
if (opAr.isPresent() && Constants.ACCESS_RIGHTS_COAR_MAP.containsKey(opAr.get().getClassid())) {
|
||||
String code = Constants.ACCESS_RIGHTS_COAR_MAP.get(opAr.get().getClassid());
|
||||
|
||||
instance
|
||||
.setAccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.COAR_CODE_LABEL_MAP.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
|
||||
if (opAr.get().getOpenAccessRoute() != null) {
|
||||
switch (opAr.get().getOpenAccessRoute()) {
|
||||
case hybrid:
|
||||
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.hybrid);
|
||||
break;
|
||||
case gold:
|
||||
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.gold);
|
||||
break;
|
||||
case green:
|
||||
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.green);
|
||||
break;
|
||||
case bronze:
|
||||
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.bronze);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.ifPresent(
|
||||
pid -> instance
|
||||
.setPid(
|
||||
pid
|
||||
.stream()
|
||||
.map(p -> ResultPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.ifPresent(
|
||||
ai -> instance
|
||||
.setAlternateIdentifier(
|
||||
ai
|
||||
.stream()
|
||||
.map(p -> AlternateIdentifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getLicense())
|
||||
.ifPresent(value -> instance.setLicense(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getDateofacceptance())
|
||||
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getRefereed())
|
||||
.ifPresent(value -> instance.setRefereed(value.getClassname()));
|
||||
Optional
|
||||
.ofNullable(i.getInstancetype())
|
||||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
Optional<Field<String>> oPca = Optional.ofNullable(i.getProcessingchargeamount());
|
||||
Optional<Field<String>> oPcc = Optional.ofNullable(i.getProcessingchargecurrency());
|
||||
if (oPca.isPresent() && oPcc.isPresent()) {
|
||||
Field<String> pca = oPca.get();
|
||||
Field<String> pcc = oPcc.get();
|
||||
if (!pca.getValue().trim().equals("") && !pcc.getValue().trim().equals("")) {
|
||||
APC apc = new APC();
|
||||
apc.setCurrency(oPcc.get().getValue());
|
||||
apc.setAmount(oPca.get().getValue());
|
||||
instance.setArticleprocessingcharge(apc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
private static List<Provenance> getUniqueProvenance(List<Provenance> provenance)
|
||||
throws NoAvailableEntityTypeException {
|
||||
Provenance iProv = new Provenance();
|
||||
|
||||
Provenance hProv = new Provenance();
|
||||
Provenance lProv = new Provenance();
|
||||
|
||||
for (Provenance p : provenance) {
|
||||
switch (p.getProvenance()) {
|
||||
case Constants.HARVESTED:
|
||||
hProv = getHighestTrust(hProv, p);
|
||||
break;
|
||||
case Constants.INFERRED:
|
||||
iProv = getHighestTrust(iProv, p);
|
||||
// To be removed as soon as the new beta run has been done
|
||||
// this fixes the issue of the trust not being set during bulktagging
|
||||
if (StringUtils.isEmpty(iProv.getTrust())) {
|
||||
iProv.setTrust(Constants.DEFAULT_TRUST);
|
||||
}
|
||||
break;
|
||||
case Constants.USER_CLAIM:
|
||||
lProv = getHighestTrust(lProv, p);
|
||||
break;
|
||||
default:
|
||||
throw new NoAvailableEntityTypeException();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return Arrays
|
||||
.asList(iProv, hProv, lProv)
|
||||
.stream()
|
||||
.filter(p -> !StringUtils.isEmpty(p.getProvenance()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private static Provenance getHighestTrust(Provenance hProv, Provenance p) {
|
||||
if (StringUtils.isNoneEmpty(hProv.getTrust(), p.getTrust()))
|
||||
return hProv.getTrust().compareTo(p.getTrust()) > 0 ? hProv : p;
|
||||
|
||||
return (StringUtils.isEmpty(p.getTrust()) && !StringUtils.isEmpty(hProv.getTrust())) ? hProv : p;
|
||||
|
||||
}
|
||||
|
||||
private static Subject getSubject(StructuredProperty s) {
|
||||
Subject subject = new Subject();
|
||||
subject.setSubject(SubjectSchemeValue.newInstance(s.getQualifier().getClassid(), s.getValue()));
|
||||
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
Provenance p = new Provenance();
|
||||
p.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
if (!s.getQualifier().getClassid().equalsIgnoreCase("fos") &&
|
||||
!s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
|
||||
p.setTrust(di.get().getTrust());
|
||||
subject.setProvenance(p);
|
||||
}
|
||||
|
||||
return subject;
|
||||
}
|
||||
|
||||
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
|
||||
Author a = new Author();
|
||||
a.setFullname(oa.getFullname());
|
||||
a.setName(oa.getName());
|
||||
a.setSurname(oa.getSurname());
|
||||
a.setRank(oa.getRank());
|
||||
|
||||
Optional<List<StructuredProperty>> oPids = Optional
|
||||
.ofNullable(oa.getPid());
|
||||
if (oPids.isPresent()) {
|
||||
AuthorPid pid = getOrcid(oPids.get());
|
||||
if (pid != null) {
|
||||
a.setPid(pid);
|
||||
}
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
private static AuthorPid getAuthorPid(StructuredProperty pid) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return AuthorPid
|
||||
.newInstance(
|
||||
AuthorPidSchemeValue
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()),
|
||||
Provenance
|
||||
.newInstance(
|
||||
di.get().getProvenanceaction().getClassname(),
|
||||
di.get().getTrust()));
|
||||
} else {
|
||||
return AuthorPid
|
||||
.newInstance(
|
||||
AuthorPidSchemeValue
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue())
|
||||
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private static AuthorPid getOrcid(List<StructuredProperty> p) {
|
||||
List<StructuredProperty> pidList = p.stream().map(pid -> {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID) ||
|
||||
(pid.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING))) {
|
||||
return pid;
|
||||
}
|
||||
return null;
|
||||
}).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
|
||||
if (pidList.size() == 1) {
|
||||
return getAuthorPid(pidList.get(0));
|
||||
}
|
||||
|
||||
List<StructuredProperty> orcid = pidList
|
||||
.stream()
|
||||
.filter(
|
||||
ap -> ap
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
.equals(ModelConstants.ORCID))
|
||||
.collect(Collectors.toList());
|
||||
if (orcid.size() == 1) {
|
||||
return getAuthorPid(orcid.get(0));
|
||||
}
|
||||
orcid = pidList
|
||||
.stream()
|
||||
.filter(
|
||||
ap -> ap
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
.equals(ModelConstants.ORCID_PENDING))
|
||||
.collect(Collectors.toList());
|
||||
if (orcid.size() == 1) {
|
||||
return getAuthorPid(orcid.get(0));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,91 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
/**
|
||||
* This class connects with the community APIs for production. It saves the information about the
|
||||
* context that will guide the dump of the results. The information saved is a HashMap. The key is the id of a community
|
||||
* - research infrastructure/initiative, the value is the label of the research community - research
|
||||
* infrastructure/initiative.
|
||||
*/
|
||||
|
||||
public class SaveCommunityMap implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SaveCommunityMap.class);
|
||||
private final transient UtilCommunityAPI queryInformationSystem;
|
||||
|
||||
private final transient BufferedWriter writer;
|
||||
|
||||
public SaveCommunityMap(String hdfsPath, String hdfsNameNode) throws IOException {
|
||||
final Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fileSystem.delete(hdfsWritePath, true);
|
||||
}
|
||||
|
||||
queryInformationSystem = new UtilCommunityAPI();
|
||||
|
||||
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
|
||||
writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SaveCommunityMap.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_cm_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String nameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", nameNode);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final Boolean singleCommunity = Optional
|
||||
.ofNullable(parser.get("singleDeposition"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(false);
|
||||
|
||||
final String community_id = Optional.ofNullable(parser.get("communityId")).orElse(null);
|
||||
|
||||
final SaveCommunityMap scm = new SaveCommunityMap(outputPath, nameNode);
|
||||
|
||||
scm.saveCommunityMap(singleCommunity, community_id);
|
||||
|
||||
}
|
||||
|
||||
private void saveCommunityMap(boolean singleCommunity, String communityId)
|
||||
throws IOException {
|
||||
final String communityMapString = Utils.OBJECT_MAPPER
|
||||
.writeValueAsString(queryInformationSystem.getCommunityMap(singleCommunity, communityId));
|
||||
log.info("communityMap {} ", communityMapString);
|
||||
writer
|
||||
.write(
|
||||
communityMapString);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
}
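A minimal usage sketch of the artifact written by SaveCommunityMap, assuming the Utils.readCommunityMap helper shown later in this diff; the namenode URI and HDFS path are hypothetical placeholders, not values from this changeset.
// Sketch only, not part of the original sources. Assumes org.apache.hadoop.conf.Configuration,
// org.apache.hadoop.fs.FileSystem and eu.dnetlib.dhp.oa.graph.dump.Utils are available.
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://namenode:8020"); // hypothetical namenode
FileSystem fs = FileSystem.get(conf);
CommunityMap communityMap = Utils.readCommunityMap(fs, "/tmp/communityMap"); // hypothetical path
communityMap.forEach((id, label) -> System.out.println(id + " -> " + label));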
|
|
@ -1,203 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.MAPPER;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.communityapi.model.*;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.complete.ContextInfo;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class UtilCommunityAPI {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(UtilCommunityAPI.class);
|
||||
|
||||
public CommunityMap getCommunityMap(boolean singleCommunity, String communityId)
|
||||
throws IOException {
|
||||
if (singleCommunity)
|
||||
return getMap(Arrays.asList(getCommunity(communityId)));
|
||||
return getMap(getValidCommunities());
|
||||
|
||||
}
|
||||
|
||||
private CommunityMap getMap(List<CommunityModel> communities) {
|
||||
final CommunityMap map = new CommunityMap();
|
||||
communities.forEach(c -> map.put(c.getId(), c.getName()));
|
||||
return map;
|
||||
}
|
||||
|
||||
public List<String> getCommunityCsv(List<String> comms) {
|
||||
return comms.stream().map(c -> {
|
||||
try {
|
||||
CommunityModel community = getCommunity(c);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append(DHPUtils.md5(community.getId()));
|
||||
builder.append(Constants.SEP);
|
||||
builder.append(community.getName());
|
||||
builder.append(Constants.SEP);
|
||||
builder.append(community.getId());
|
||||
builder.append(Constants.SEP);
|
||||
builder
|
||||
.append(
|
||||
community.getDescription());
|
||||
return builder.toString();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}).collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private List<CommunityModel> getValidCommunities() throws IOException {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper
|
||||
.readValue(eu.dnetlib.dhp.communityapi.QueryCommunityAPI.communities(), CommunitySummary.class)
|
||||
.stream()
|
||||
.filter(
|
||||
community -> (community.getStatus().equals("all") || community.getStatus().equalsIgnoreCase("public"))
|
||||
&&
|
||||
(community.getType().equals("ri") || community.getType().equals("community")))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private CommunityModel getCommunity(String id) throws IOException {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper
|
||||
.readValue(eu.dnetlib.dhp.communityapi.QueryCommunityAPI.community(id), CommunityModel.class);
|
||||
|
||||
}
|
||||
|
||||
public List<ContextInfo> getContextInformation() throws IOException {
|
||||
|
||||
return getValidCommunities()
|
||||
.stream()
|
||||
.map(c -> getContext(c))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
public ContextInfo getContext(CommunityModel c) {
|
||||
|
||||
ContextInfo cinfo = new ContextInfo();
|
||||
cinfo.setId(c.getId());
|
||||
cinfo.setDescription(c.getDescription());
|
||||
CommunityModel cm = null;
|
||||
try {
|
||||
cm = getCommunity(c.getId());
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
cinfo.setSubject(new ArrayList<>());
|
||||
cinfo.getSubject().addAll(cm.getSubjects());
|
||||
cinfo.setZenodocommunity(c.getZenodoCommunity());
|
||||
cinfo.setType(c.getType());
|
||||
return cinfo;
|
||||
}
|
||||
|
||||
public List<ContextInfo> getContextRelation() throws IOException {
|
||||
return getValidCommunities().stream().map(c -> {
|
||||
ContextInfo cinfo = new ContextInfo();
|
||||
cinfo.setId(c.getId());
|
||||
cinfo.setDatasourceList(getDatasourceList(c.getId()));
|
||||
cinfo.setProjectList(getProjectList(c.getId()));
|
||||
|
||||
return cinfo;
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private List<String> getDatasourceList(String id) {
|
||||
List<String> datasourceList = new ArrayList<>();
|
||||
try {
|
||||
|
||||
new ObjectMapper()
|
||||
.readValue(
|
||||
eu.dnetlib.dhp.communityapi.QueryCommunityAPI.communityDatasource(id),
|
||||
DatasourceList.class)
|
||||
.stream()
|
||||
.forEach(ds -> {
|
||||
if (Optional.ofNullable(ds.getOpenaireId()).isPresent()) {
|
||||
|
||||
datasourceList.add(ds.getOpenaireId());
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return datasourceList;
|
||||
}
|
||||
|
||||
private List<String> getProjectList(String id) {
|
||||
int page = -1;
|
||||
int size = 100;
|
||||
ContentModel cm = null;
|
||||
|
||||
ArrayList<String> projectList = new ArrayList<>();
|
||||
do {
|
||||
page++;
|
||||
try {
|
||||
cm = new ObjectMapper()
|
||||
.readValue(
|
||||
eu.dnetlib.dhp.communityapi.QueryCommunityAPI
|
||||
.communityProjects(
|
||||
id, String.valueOf(page), String.valueOf(size)),
|
||||
ContentModel.class);
|
||||
if (cm.getContent().size() > 0) {
|
||||
cm.getContent().forEach(p -> {
|
||||
if (Optional.ofNullable(p.getOpenaireId()).isPresent())
|
||||
projectList.add(p.getOpenaireId());
|
||||
|
||||
});
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} while (!cm.getLast());
|
||||
|
||||
return projectList;
|
||||
}
|
||||
|
||||
/**
|
||||
* It returns, for each organization, the list of associated communities.
|
||||
*/
|
||||
public CommunityEntityMap getCommunityOrganization() throws IOException {
|
||||
CommunityEntityMap organizationMap = new CommunityEntityMap();
|
||||
getValidCommunities()
|
||||
.forEach(community -> {
|
||||
String id = community.getId();
|
||||
try {
|
||||
List<String> associatedOrgs = MAPPER
|
||||
.readValue(
|
||||
eu.dnetlib.dhp.communityapi.QueryCommunityAPI.communityPropagationOrganization(id),
|
||||
OrganizationList.class);
|
||||
associatedOrgs.forEach(o -> {
|
||||
if (!organizationMap.containsKey(o))
|
||||
organizationMap.put(o, new ArrayList<>());
|
||||
organizationMap.get(o).add(community.getId());
|
||||
});
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
return organizationMap;
|
||||
}
|
||||
|
||||
}
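For illustration, the map returned by getCommunityOrganization associates each organization identifier with the communities that declared it; the identifiers below are hypothetical examples, and CommunityEntityMap is assumed to behave like a Map<String, List<String>> as it is used above.
// Sketch only, not part of the original sources.
CommunityEntityMap organizationMap = new CommunityEntityMap();
organizationMap.put("openorgs____::0001", Arrays.asList("dh-ch", "enermaps")); // hypothetical ids
organizationMap.put("openorgs____::0002", Arrays.asList("dh-ch"));
organizationMap.forEach((org, communities) -> System.out.println(org + " -> " + communities));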
|
|
@ -1,200 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump;
|
||||
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Constants.*;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.complete.Constants;
|
||||
import eu.dnetlib.dhp.oa.model.Indicator;
|
||||
import eu.dnetlib.dhp.oa.model.Score;
|
||||
import eu.dnetlib.dhp.oa.model.UsageCounts;
|
||||
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Relation;
|
||||
import eu.dnetlib.dhp.oa.model.graph.ResearchCommunity;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Measure;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class Utils {
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
public static final String ENTITY_ID_SEPARATOR = "|";
|
||||
|
||||
private Utils() {
|
||||
}
|
||||
|
||||
public static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static String getContextId(String id) {
|
||||
|
||||
return String
|
||||
.format(
|
||||
"%s::%s", Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5(id));
|
||||
}
|
||||
|
||||
public static CommunityMap getCommunityMap(SparkSession spark, String communityMapPath) {
|
||||
|
||||
return new Gson().fromJson(spark.read().textFile(communityMapPath).collectAsList().get(0), CommunityMap.class);
|
||||
|
||||
}
|
||||
|
||||
public static CommunityMap readCommunityMap(FileSystem fileSystem, String communityMapPath) throws IOException {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(fileSystem.open(new Path(communityMapPath))));
|
||||
StringBuilder sb = new StringBuilder();
|
||||
try {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
sb.append(line);
|
||||
}
|
||||
} finally {
|
||||
br.close();
|
||||
|
||||
}
|
||||
|
||||
return new Gson().fromJson(sb.toString(), CommunityMap.class);
|
||||
}
|
||||
|
||||
public static String getEntityId(String id, String separator) {
|
||||
return id.substring(id.indexOf(separator) + 1);
|
||||
}
|
||||
|
||||
public static Dataset<String> getEntitiesId(SparkSession spark, String inputPath) {
|
||||
Dataset<String> dumpedIds = Utils
|
||||
.readPath(spark, inputPath + "/publication", GraphResult.class)
|
||||
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING())
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/dataset", GraphResult.class)
|
||||
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/software", GraphResult.class)
|
||||
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/otherresearchproduct", GraphResult.class)
|
||||
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/organization", eu.dnetlib.dhp.oa.model.graph.Organization.class)
|
||||
.map(
|
||||
(MapFunction<eu.dnetlib.dhp.oa.model.graph.Organization, String>) o -> o.getId(),
|
||||
Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.oa.model.graph.Project.class)
|
||||
.map(
|
||||
(MapFunction<eu.dnetlib.dhp.oa.model.graph.Project, String>) o -> o.getId(), Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/datasource", eu.dnetlib.dhp.oa.model.graph.Datasource.class)
|
||||
.map(
|
||||
(MapFunction<eu.dnetlib.dhp.oa.model.graph.Datasource, String>) o -> o.getId(),
|
||||
Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/communities_infrastructures", ResearchCommunity.class)
|
||||
.map((MapFunction<ResearchCommunity, String>) c -> c.getId(), Encoders.STRING()));
|
||||
return dumpedIds;
|
||||
}
|
||||
|
||||
public static Dataset<Relation> getValidRelations(Dataset<Relation> relations,
|
||||
Dataset<String> entitiesIds) {
|
||||
Dataset<Tuple2<String, Relation>> relationSource = relations
|
||||
.map(
|
||||
(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getSource(), r),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
|
||||
|
||||
Dataset<Tuple2<String, Relation>> relJoinSource = relationSource
|
||||
.joinWith(entitiesIds, relationSource.col("_1").equalTo(entitiesIds.col("value")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, Relation>, String>, Tuple2<String, Relation>>) t2 -> new Tuple2<>(
|
||||
t2._1()._2().getTarget(), t2._1()._2()),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
|
||||
|
||||
return relJoinSource
|
||||
.joinWith(entitiesIds, relJoinSource.col("_1").equalTo(entitiesIds.col("value")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, Relation>, String>, Relation>) t2 -> t2._1()._2(),
|
||||
Encoders.bean(Relation.class));
|
||||
}
|
||||
|
||||
public static Indicator getIndicator(List<Measure> measures) {
|
||||
Indicator i = new Indicator();
|
||||
for (eu.dnetlib.dhp.schema.oaf.Measure m : measures) {
|
||||
switch (m.getId()) {
|
||||
case USAGE_COUNT_DOWNLOADS:
|
||||
getUsageCounts(i).setDownloads(m.getUnit().get(0).getValue());
|
||||
break;
|
||||
case USAGE_COUNT_VIEWS:
|
||||
getUsageCounts(i).setViews(m.getUnit().get(0).getValue());
|
||||
break;
|
||||
default:
|
||||
getImpactMeasure(i).add(getScore(m.getId(), m.getUnit()));
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static UsageCounts getUsageCounts(Indicator i) {
|
||||
if (i.getUsageCounts() == null) {
|
||||
i.setUsageCounts(new UsageCounts());
|
||||
}
|
||||
return i.getUsageCounts();
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static List<Score> getImpactMeasure(Indicator i) {
|
||||
if (i.getBipIndicators() == null) {
|
||||
i.setBipIndicators(new ArrayList<>());
|
||||
}
|
||||
return i.getBipIndicators();
|
||||
}
|
||||
|
||||
private static Score getScore(String indicator, List<KeyValue> unit) {
|
||||
Score s = new Score();
|
||||
s.setIndicator(indicator);
|
||||
for (KeyValue u : unit) {
|
||||
if (u.getKey().equals("score")) {
|
||||
s.setScore(u.getValue());
|
||||
} else {
|
||||
s.setClazz(u.getValue());
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
|
@ -1,8 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class CommunityMap extends HashMap<String, String> implements Serializable {
|
||||
}
|
|
@ -1,75 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.oa.model.community.Context;
|
||||
|
||||
/**
|
||||
* This class splits the dumped results according to the research community - research initiative/infrastructure they
|
||||
* are related to. The information about the community is found in the element "context.id" in the result. Since the
|
||||
* context that can be found in the result can be associated not only to communities, a community Map is provided. It
|
||||
* will guide the splitting process. Note the repartition(1) just before writing the results related to a community:
|
||||
* this is a choice due to uploading constraints (just one file for each community). As soon as a better solution is
|
||||
* in place, the repartition should be removed.
|
||||
*/
|
||||
public class CommunitySplit implements Serializable {
|
||||
|
||||
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath) {
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
execSplit(spark, inputPath, outputPath, communityMap);
|
||||
});
|
||||
}
|
||||
|
||||
private static void execSplit(SparkSession spark, String inputPath, String outputPath,
|
||||
CommunityMap communities) {
|
||||
|
||||
Dataset<CommunityResult> result = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class)
|
||||
.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/orp", CommunityResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
|
||||
|
||||
communities
|
||||
.keySet()
|
||||
.stream()
|
||||
.parallel()
|
||||
.forEach(c -> {
|
||||
result
|
||||
.filter(
|
||||
(FilterFunction<CommunityResult>) r -> Optional.ofNullable(r.getContext()).isPresent() &&
|
||||
r.getContext().stream().anyMatch(con -> con.getCode().equals(c)))
|
||||
.map(
|
||||
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
|
||||
Encoders.STRING())
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(outputPath + "/" + c.replace(" ", "_"));
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
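As a condensed illustration of the splitting rule applied in execSplit above, the predicate below keeps only the results whose context codes include a given community; the community code "dh-ch" is a hypothetical example.
// Sketch only, not part of the original sources.
String community = "dh-ch"; // hypothetical community code
FilterFunction<CommunityResult> belongsToCommunity = r -> Optional.ofNullable(r.getContext()).isPresent()
    && r.getContext().stream().anyMatch(con -> con.getCode().equals(community));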
|
|
@ -1,28 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.oa.model.community.Project;
|
||||
|
||||
public class ResultProject implements Serializable {
|
||||
private String resultId;
|
||||
private List<Project> projectsList;
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
|
||||
public List<Project> getProjectsList() {
|
||||
return projectsList;
|
||||
}
|
||||
|
||||
public void setProjectsList(List<Project> projectsList) {
|
||||
this.projectsList = projectsList;
|
||||
}
|
||||
}
|
|
@ -1,155 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Spark action to trigger the dump of results associated to a research community - research initiative/infrastructure. The
|
||||
* actual dump is performed via the class DumpProducts, which is also used for the entire graph dump.
|
||||
*/
|
||||
public class SparkDumpCommunityProducts implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkDumpCommunityProducts.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
String communityMapPath = Optional
|
||||
.ofNullable(parser.get("communityMapPath"))
|
||||
.orElse(null);
|
||||
|
||||
String dumpType = Optional
|
||||
.ofNullable(parser.get("dumpType"))
|
||||
.orElse(null);
|
||||
|
||||
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
resultDump(
|
||||
spark, inputPath, outputPath, communityMapPath, inputClazz, dumpType);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
public static <I extends OafEntity> void resultDump(
|
||||
SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
String communityMapPath,
|
||||
Class<I> inputClazz,
|
||||
String dumpType) {
|
||||
|
||||
CommunityMap communityMap = null;
|
||||
if (!StringUtils.isEmpty(communityMapPath))
|
||||
communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
|
||||
CommunityMap finalCommunityMap = communityMap;
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map(
|
||||
(MapFunction<I, CommunityResult>) value -> execMap(value, finalCommunityMap, dumpType),
|
||||
Encoders.bean(CommunityResult.class))
|
||||
.filter((FilterFunction<CommunityResult>) value -> value != null)
|
||||
.map(
|
||||
(MapFunction<CommunityResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(outputPath);
|
||||
|
||||
}
|
||||
|
||||
private static <I extends OafEntity, O extends eu.dnetlib.dhp.oa.model.Result> O execMap(I value,
|
||||
CommunityMap communityMap, String dumpType) throws NoAvailableEntityTypeException, CardinalityTooHighException {
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
|
||||
if (Boolean.FALSE.equals(odInfo.isPresent())) {
|
||||
return null;
|
||||
}
|
||||
if (Boolean.TRUE.equals(odInfo.get().getDeletedbyinference())
|
||||
|| Boolean.TRUE.equals(odInfo.get().getInvisible())) {
|
||||
return null;
|
||||
}
|
||||
if (StringUtils.isEmpty(dumpType)) {
|
||||
Set<String> communities = communityMap.keySet();
|
||||
|
||||
Optional<List<Context>> inputContext = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Result) value).getContext());
|
||||
if (!inputContext.isPresent()) {
|
||||
return null;
|
||||
}
|
||||
List<String> toDumpFor = inputContext.get().stream().map(c -> {
|
||||
if (communities.contains(c.getId())) {
|
||||
return c.getId();
|
||||
}
|
||||
if (c.getId().contains("::") && communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
|
||||
return c.getId().substring(0, c.getId().indexOf("::"));
|
||||
}
|
||||
return null;
|
||||
}).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
if (toDumpFor.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return (O) ResultMapper.map(value, communityMap, Constants.DUMPTYPE.COMMUNITY.getType());
|
||||
|
||||
}
|
||||
}
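The community membership test applied in execMap above can be summarised as in the sketch below; the map entry and the context id are hypothetical examples.
// Sketch only, not part of the original sources.
CommunityMap communityMap = new CommunityMap();
communityMap.put("dh-ch", "Digital Humanities and Cultural Heritage"); // hypothetical entry
String contextId = "dh-ch::subcommunity"; // hypothetical context id found in a result
String communityId = contextId.contains("::")
    ? contextId.substring(0, contextId.indexOf("::"))
    : contextId;
boolean toBeDumped = communityMap.containsKey(communityId); // true for this example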
|
|
@ -1,206 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.community.Funder;
|
||||
import eu.dnetlib.dhp.oa.model.community.Project;
|
||||
import eu.dnetlib.dhp.oa.model.community.Validated;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Preparation of the Project information to be added to the dumped results. For each result associated to at least one
|
||||
* Project, an instance of the ResultProject class is serialized. ResultProject contains the resultId and the
|
||||
* list of Projects (as in eu.dnetlib.dhp.schema.dump.oaf.community.Project) it is associated to
|
||||
*/
|
||||
public class SparkPrepareResultProject implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkPrepareResultProject.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/project_prep_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
Boolean substring = Optional
|
||||
.ofNullable(parser.get("substring"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
prepareResultProjectList(spark, inputPath, outputPath, substring);
|
||||
});
|
||||
}
|
||||
|
||||
private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath,
|
||||
Boolean substring) {
|
||||
Dataset<Relation> relation = Utils
|
||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equalsIgnoreCase(ModelConstants.IS_PRODUCED_BY));
|
||||
|
||||
Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils
|
||||
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);
|
||||
|
||||
projects
|
||||
.joinWith(relation, projects.col("id").equalTo(relation.col("target")), "inner")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value
|
||||
._2()
|
||||
.getSource(),
|
||||
Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s,
|
||||
it) -> {
|
||||
Set<String> projectSet = new HashSet<>();
|
||||
Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
|
||||
ResultProject rp = new ResultProject();
|
||||
if (substring)
|
||||
rp.setResultId(getEntityId(s, ENTITY_ID_SEPARATOR));
|
||||
else
|
||||
rp.setResultId(s);
|
||||
eu.dnetlib.dhp.schema.oaf.Project p = first._1();
|
||||
projectSet.add(p.getId());
|
||||
Project ps = getProject(p, first._2);
|
||||
|
||||
List<Project> projList = new ArrayList<>();
|
||||
projList.add(ps);
|
||||
rp.setProjectsList(projList);
|
||||
it.forEachRemaining(c -> {
|
||||
eu.dnetlib.dhp.schema.oaf.Project op = c._1();
|
||||
if (!projectSet.contains(op.getId())) {
|
||||
projList
|
||||
.add(getProject(op, c._2));
|
||||
|
||||
projectSet.add(op.getId());
|
||||
|
||||
}
|
||||
|
||||
});
|
||||
return rp;
|
||||
}, Encoders.bean(ResultProject.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static Project getProject(eu.dnetlib.dhp.schema.oaf.Project op, Relation relation) {
|
||||
Project p = Project
|
||||
.newInstance(
|
||||
getEntityId(op.getId(), ENTITY_ID_SEPARATOR),
|
||||
op.getCode().getValue(),
|
||||
Optional
|
||||
.ofNullable(op.getAcronym())
|
||||
.map(Field::getValue)
|
||||
.orElse(null),
|
||||
Optional
|
||||
.ofNullable(op.getTitle())
|
||||
.map(Field::getValue)
|
||||
.orElse(null),
|
||||
Optional
|
||||
.ofNullable(op.getFundingtree())
|
||||
.map(value -> {
|
||||
List<Funder> tmp = value
|
||||
.stream()
|
||||
.map(ft -> getFunder(ft.getValue()))
|
||||
.collect(Collectors.toList());
|
||||
if (!tmp.isEmpty()) {
|
||||
return tmp.get(0);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
})
|
||||
.orElse(null));
|
||||
|
||||
Optional<DataInfo> di = Optional.ofNullable(op.getDataInfo());
|
||||
Provenance provenance = new Provenance();
|
||||
if (di.isPresent()) {
|
||||
provenance.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
provenance.setTrust(di.get().getTrust());
|
||||
p.setProvenance(provenance);
|
||||
}
|
||||
if (Boolean.TRUE.equals(relation.getValidated())) {
|
||||
p.setValidated(Validated.newInstance(relation.getValidated(), relation.getValidationDate()));
|
||||
}
|
||||
return p;
|
||||
|
||||
}
|
||||
|
||||
private static Funder getFunder(String fundingtree) {
|
||||
final Funder f = new Funder();
|
||||
final Document doc;
|
||||
try {
|
||||
final SAXReader reader = new SAXReader();
|
||||
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
|
||||
doc = reader.read(new StringReader(fundingtree));
|
||||
f.setShortName(((Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
|
||||
f.setName(((Node) (doc.selectNodes("//funder/name").get(0))).getText());
|
||||
f.setJurisdiction(((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
|
||||
for (Object o : doc.selectNodes("//funding_level_0")) {
|
||||
List<Node> node = ((Node) o).selectNodes("./name");
|
||||
f.setFundingStream((node.get(0)).getText());
|
||||
}
|
||||
|
||||
return f;
|
||||
} catch (DocumentException | SAXException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
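To illustrate the record produced by SparkPrepareResultProject for a single result, the sketch below builds one ResultProject by hand; the identifiers, grant code and title are hypothetical, and Project.newInstance is assumed to take the same arguments used in getProject above.
// Sketch only, not part of the original sources.
ResultProject rp = new ResultProject();
rp.setResultId("doi_________::ab12cd34"); // hypothetical result id (prefix already stripped)
Project p = Project
    .newInstance(
        "corda__h2020::ef56ab78", // hypothetical project id
        "101017452", // hypothetical grant code
        "ACME", // hypothetical acronym
        "A hypothetical project title",
        null); // funder omitted in this sketch
rp.setProjectsList(Arrays.asList(p));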
|
|
@ -1,50 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
/**
|
||||
* Spark job to trigger the split of results associated to a research community - research initiative/infrastructure. The
|
||||
* actual split is performed by the class CommunitySplit
|
||||
*/
|
||||
public class SparkSplitForCommunity implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSplitForCommunity.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkSplitForCommunity.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String communityMapPath = parser.get("communityMapPath");
|
||||
|
||||
CommunitySplit split = new CommunitySplit();
|
||||
split.run(isSparkSessionManaged, inputPath, outputPath, communityMapPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,95 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.ForeachFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkUpdateProjectInfo implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkUpdateProjectInfo.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/project_input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String preparedInfoPath = parser.get("preparedInfoPath");
|
||||
log.info("preparedInfoPath: {}", preparedInfoPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
extend(spark, inputPath, outputPath, preparedInfoPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void extend(
|
||||
SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
String preparedInfoPath) {
|
||||
Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
|
||||
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
|
||||
|
||||
result
|
||||
.joinWith(
|
||||
resultProject, result.col("id").equalTo(resultProject.col("resultId")),
|
||||
"left")
|
||||
.map((MapFunction<Tuple2<CommunityResult, ResultProject>, CommunityResult>) value -> {
|
||||
CommunityResult r = value._1();
|
||||
Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
|
||||
return r;
|
||||
}, Encoders.bean(CommunityResult.class))
|
||||
.map(
|
||||
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
|
||||
Encoders.STRING())
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.text(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Constants implements Serializable {
|
||||
|
||||
public static final String IS_HOSTED_BY = "isHostedBy";
|
||||
public static final String HOSTS = "hosts";
|
||||
|
||||
public static final String IS_FUNDED_BY = "isFundedBy";
|
||||
public static final String FUNDS = "funds";
|
||||
|
||||
public static final String FUNDINGS = "fundings";
|
||||
|
||||
public static final String RESULT_ENTITY = "result";
|
||||
public static final String DATASOURCE_ENTITY = "datasource";
|
||||
public static final String CONTEXT_ENTITY = "context";
|
||||
public static final String ORGANIZATION_ENTITY = "organization";
|
||||
public static final String PROJECT_ENTITY = "project";
|
||||
|
||||
public static final String CONTEXT_ID = "00";
|
||||
public static final String CONTEXT_NS_PREFIX = "context_____";
|
||||
public static final String UNKNOWN = "UNKNOWN";
|
||||
|
||||
}
|
|
@ -1,84 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Deserialization of the information in the context needed to create Context Entities, and relations between context
|
||||
* entities and datasources and projects
|
||||
*/
|
||||
public class ContextInfo implements Serializable {
|
||||
private String id;
|
||||
private String description;
|
||||
private String type;
|
||||
private String zenodocommunity;
|
||||
private String name;
|
||||
private List<String> projectList;
|
||||
private List<String> datasourceList;
|
||||
private List<String> subject;
|
||||
|
||||
public List<String> getSubject() {
|
||||
return subject;
|
||||
}
|
||||
|
||||
public void setSubject(List<String> subject) {
|
||||
this.subject = subject;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getZenodocommunity() {
|
||||
return zenodocommunity;
|
||||
}
|
||||
|
||||
public void setZenodocommunity(String zenodocommunity) {
|
||||
this.zenodocommunity = zenodocommunity;
|
||||
}
|
||||
|
||||
public List<String> getProjectList() {
|
||||
return projectList;
|
||||
}
|
||||
|
||||
public void setProjectList(List<String> projectList) {
|
||||
this.projectList = projectList;
|
||||
}
|
||||
|
||||
public List<String> getDatasourceList() {
|
||||
return datasourceList;
|
||||
}
|
||||
|
||||
public void setDatasourceList(List<String> datasourceList) {
|
||||
this.datasourceList = datasourceList;
|
||||
}
|
||||
}
|
|
@ -1,106 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.graph.ResearchInitiative;
|
||||
|
||||
/**
|
||||
* Writes on HDFS Context entities. It queries the Information System at the lookup url provided as parameter and
|
||||
* collects the general information for contexes of type community or ri. The general information is the id of the
|
||||
* context, its label, the subjects associated to the context, its zenodo community, description and type. This
|
||||
* information is used to create a new Context Entity
|
||||
*/
|
||||
public class CreateContextEntities implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CreateContextEntities.class);
|
||||
private final transient Configuration conf;
|
||||
private final transient BufferedWriter writer;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CreateContextEntities.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
log.info("hdfsPath: {}", hdfsPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", hdfsNameNode);
|
||||
|
||||
final CreateContextEntities cce = new CreateContextEntities(hdfsPath, hdfsNameNode);
|
||||
|
||||
log.info("Processing contexts...");
|
||||
cce.execute(Process::getEntity);
|
||||
|
||||
cce.close();
|
||||
|
||||
}
|
||||
|
||||
private void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public CreateContextEntities(String hdfsPath, String hdfsNameNode) throws IOException {
|
||||
this.conf = new Configuration();
|
||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fsDataOutputStream = fileSystem.append(hdfsWritePath);
|
||||
} else {
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
}
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodecByClassName("org.apache.hadoop.io.compress.GzipCodec");
|
||||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(codec.createOutputStream(fsDataOutputStream),
|
||||
StandardCharsets.UTF_8));
|
||||
|
||||
}
|
||||
|
||||
public <R extends ResearchInitiative> void execute(final Function<ContextInfo, R> producer)
|
||||
throws IOException {
|
||||
|
||||
UtilCommunityAPI queryInformationSystem = new UtilCommunityAPI();
|
||||
|
||||
final Consumer<ContextInfo> consumer = ci -> writeEntity(producer.apply(ci));
|
||||
|
||||
queryInformationSystem.getContextInformation().forEach(ci -> consumer.accept(ci));
|
||||
}
|
||||
|
||||
protected <R extends ResearchInitiative> void writeEntity(final R r) {
|
||||
try {
|
||||
writer.write(Utils.OBJECT_MAPPER.writeValueAsString(r));
|
||||
writer.newLine();
|
||||
} catch (final IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,113 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.MyRuntimeException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.MasterDuplicate;
|
||||
import eu.dnetlib.dhp.oa.model.graph.*;
|
||||
|
||||
/**
|
||||
* Writes the set of new Relation between the context and datasources. At the moment the relation between the context
|
||||
* and the project is not created because of a low coverage in the profiles of openaire ids related to projects
|
||||
*/
|
||||
public class CreateContextRelation implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(CreateContextRelation.class);
|
||||
private final transient Configuration conf;
|
||||
private final transient BufferedWriter writer;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
CreateContextRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json")));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
log.info("hdfsPath: {}", hdfsPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
log.info("hdfsNameNode: {}", hdfsNameNode);
|
||||
|
||||
final CreateContextRelation cce = new CreateContextRelation(hdfsPath, hdfsNameNode);
|
||||
|
||||
log.info("Creating relation for datasources and projects...");
|
||||
cce
|
||||
.execute(
|
||||
Process::getRelation);
|
||||
|
||||
cce.close();
|
||||
|
||||
}
|
||||
|
||||
private void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public CreateContextRelation(String hdfsPath, String hdfsNameNode)
|
||||
throws IOException {
|
||||
this.conf = new Configuration();
|
||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fsDataOutputStream = fileSystem.append(hdfsWritePath);
|
||||
} else {
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
}
|
||||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
|
||||
}
|
||||
|
||||
public void execute(final Function<ContextInfo, List<Relation>> producer) throws IOException {
|
||||
|
||||
final Consumer<ContextInfo> consumer = ci -> producer.apply(ci).forEach(this::writeEntity);
|
||||
|
||||
UtilCommunityAPI queryCommunityAPI = new UtilCommunityAPI();
|
||||
queryCommunityAPI.getContextRelation().forEach(ci -> consumer.accept(ci));
|
||||
}
|
||||
|
||||
protected void writeEntity(final Relation r) {
|
||||
try {
|
||||
writer.write(Utils.OBJECT_MAPPER.writeValueAsString(r));
|
||||
writer.newLine();
|
||||
} catch (final Exception e) {
|
||||
throw new MyRuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,203 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.graph.RelType;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Relation;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Creates new Relations (as in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation) from the information in the Entity. The
|
||||
* new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
|
||||
* to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
|
||||
* -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
|
||||
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result. Note for
|
||||
* context: it gets the first provenance in the dataInfo. If more than one is present the others are not dumped
|
||||
*/
|
||||
public class Extractor implements Serializable {
|
||||
|
||||
public void run(Boolean isSparkSessionManaged,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
Class<? extends Result> inputClazz,
|
||||
String communityMapPath) {
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
extractRelationResult(
|
||||
spark, inputPath, outputPath, inputClazz, Utils.getCommunityMap(spark, communityMapPath));
|
||||
});
|
||||
}
|
||||
|
||||
private <R extends Result> void extractRelationResult(SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
Class<R> inputClazz,
|
||||
CommunityMap communityMap) {
|
||||
|
||||
Set<Integer> hashCodes = new HashSet<>();
|
||||
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.flatMap((FlatMapFunction<R, Relation>) value -> {
|
||||
List<Relation> relationList = new ArrayList<>();
|
||||
extractRelationsFromInstance(hashCodes, value, relationList);
|
||||
Set<String> communities = communityMap.keySet();
|
||||
Optional
|
||||
.ofNullable(value.getContext())
|
||||
.ifPresent(contexts -> contexts.forEach(context -> {
|
||||
String id = context.getId();
|
||||
if (id.contains(":")) {
|
||||
id = id.substring(0, id.indexOf(":"));
|
||||
}
|
||||
if (communities.contains(id)) {
|
||||
String contextId = Utils.getContextId(id);
|
||||
Provenance provenance = Optional
|
||||
.ofNullable(context.getDataInfo())
|
||||
.map(
|
||||
dinfo -> Optional
|
||||
.ofNullable(dinfo.get(0).getProvenanceaction())
|
||||
.map(
|
||||
paction -> Provenance
|
||||
.newInstance(
|
||||
paction.getClassid(),
|
||||
dinfo.get(0).getTrust()))
|
||||
.orElse(null))
|
||||
.orElse(null);
|
||||
Relation r = getRelation(
|
||||
getEntityId(value.getId(), ENTITY_ID_SEPARATOR), contextId,
|
||||
Constants.RESULT_ENTITY,
|
||||
Constants.CONTEXT_ENTITY,
|
||||
ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP, provenance);
|
||||
if (!hashCodes.contains(r.hashCode())) {
|
||||
relationList
|
||||
.add(r);
|
||||
hashCodes.add(r.hashCode());
|
||||
}
|
||||
r = getRelation(
|
||||
contextId, getEntityId(value.getId(), ENTITY_ID_SEPARATOR),
|
||||
Constants.CONTEXT_ENTITY,
|
||||
Constants.RESULT_ENTITY,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
ModelConstants.RELATIONSHIP, provenance);
|
||||
if (!hashCodes.contains(r.hashCode())) {
|
||||
relationList
|
||||
.add(
|
||||
r);
|
||||
hashCodes.add(r.hashCode());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}));
|
||||
|
||||
return relationList.iterator();
|
||||
}, Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
private <R extends Result> void extractRelationsFromInstance(Set<Integer> hashCodes, R value,
|
||||
List<Relation> relationList) {
|
||||
Optional
|
||||
.ofNullable(value.getInstance())
|
||||
.ifPresent(inst -> inst.forEach(instance -> {
|
||||
Optional
|
||||
.ofNullable(instance.getCollectedfrom())
|
||||
.ifPresent(
|
||||
cf -> getRelatioPair(
|
||||
value, relationList, cf,
|
||||
ModelConstants.IS_PROVIDED_BY, ModelConstants.PROVIDES, hashCodes));
|
||||
Optional
|
||||
.ofNullable(instance.getHostedby())
|
||||
.ifPresent(
|
||||
hb -> getRelatioPair(
|
||||
value, relationList, hb,
|
||||
Constants.IS_HOSTED_BY, Constants.HOSTS, hashCodes));
|
||||
}));
|
||||
}
|
||||
|
||||
private static <R extends Result> void getRelatioPair(R value, List<Relation> relationList, KeyValue cf,
|
||||
String resultDatasource, String datasourceResult,
|
||||
Set<Integer> hashCodes) {
|
||||
Provenance provenance = Optional
|
||||
.ofNullable(cf.getDataInfo())
|
||||
.map(
|
||||
dinfo -> Optional
|
||||
.ofNullable(dinfo.getProvenanceaction())
|
||||
.map(
|
||||
paction -> Provenance
|
||||
.newInstance(
|
||||
paction.getClassname(),
|
||||
dinfo.getTrust()))
|
||||
.orElse(
|
||||
Provenance
|
||||
.newInstance(
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.HARVESTED,
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)))
|
||||
.orElse(
|
||||
Provenance
|
||||
.newInstance(
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.HARVESTED,
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST));
|
||||
Relation r = getRelation(
|
||||
getEntityId(value.getId(), ENTITY_ID_SEPARATOR),
|
||||
getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), Constants.RESULT_ENTITY, Constants.DATASOURCE_ENTITY,
|
||||
resultDatasource, ModelConstants.PROVISION,
|
||||
provenance);
|
||||
if (!hashCodes.contains(r.hashCode())) {
|
||||
relationList
|
||||
.add(r);
|
||||
hashCodes.add(r.hashCode());
|
||||
}
|
||||
|
||||
r = getRelation(
|
||||
getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), getEntityId(value.getId(), ENTITY_ID_SEPARATOR),
|
||||
Constants.DATASOURCE_ENTITY, Constants.RESULT_ENTITY,
|
||||
datasourceResult, ModelConstants.PROVISION,
|
||||
provenance);
|
||||
|
||||
if (!hashCodes.contains(r.hashCode())) {
|
||||
relationList
|
||||
.add(r);
|
||||
hashCodes.add(r.hashCode());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static Relation getRelation(String source, String target, String sourceType, String targetType,
|
||||
String relName, String relType, Provenance provenance) {
|
||||
Relation r = new Relation();
|
||||
r.setSource(source);
|
||||
r.setSourceType(sourceType);
|
||||
r.setTarget(target);
|
||||
r.setTargetType(targetType);
|
||||
r.setReltype(RelType.newInstance(relName, relType));
|
||||
r.setProvenance(provenance);
|
||||
return r;
|
||||
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class MergedRels implements Serializable {
|
||||
private String organizationId;
|
||||
private String representativeId;
|
||||
|
||||
public String getOrganizationId() {
|
||||
return organizationId;
|
||||
}
|
||||
|
||||
public void setOrganizationId(String organizationId) {
|
||||
this.organizationId = organizationId;
|
||||
}
|
||||
|
||||
public String getRepresentativeId() {
|
||||
return representativeId;
|
||||
}
|
||||
|
||||
public void setRepresentativeId(String representativeId) {
|
||||
this.representativeId = representativeId;
|
||||
}
|
||||
}
|
|
@ -1,21 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
public class OrganizationMap extends HashMap<String, List<String>> {
|
||||
|
||||
public OrganizationMap() {
|
||||
super();
|
||||
}
|
||||
|
||||
public List<String> get(String key) {
|
||||
|
||||
if (super.get(key) == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
return super.get(key);
|
||||
}
|
||||
}
|
|
@ -1,99 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.MyRuntimeException;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.graph.*;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
||||
/**
|
||||
* It process the ContextInfo information to produce a new Context Entity or a set of Relations between the generic
|
||||
* context entity and datasource/projects related to the context.
|
||||
*/
|
||||
public class Process implements Serializable {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <R extends ResearchInitiative> R getEntity(ContextInfo ci) {
|
||||
try {
|
||||
ResearchInitiative ri;
|
||||
if (ci.getType().equals("community")) {
|
||||
ri = new ResearchCommunity();
|
||||
((ResearchCommunity) ri).setSubject(ci.getSubject());
|
||||
ri.setType(Constants.RESEARCH_COMMUNITY);
|
||||
} else {
|
||||
ri = new ResearchInitiative();
|
||||
ri.setType(Constants.RESEARCH_INFRASTRUCTURE);
|
||||
}
|
||||
ri.setId(Utils.getContextId(ci.getId()));
|
||||
ri.setAcronym(ci.getId());
|
||||
|
||||
ri.setDescription(ci.getDescription());
|
||||
ri.setName(ci.getName());
|
||||
if (StringUtils.isNotEmpty(ci.getZenodocommunity())) {
|
||||
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
|
||||
}
|
||||
return (R) ri;
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new MyRuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Relation> getRelation(ContextInfo ci) {
|
||||
try {
|
||||
|
||||
List<Relation> relationList = new ArrayList<>();
|
||||
ci
|
||||
.getDatasourceList()
|
||||
.forEach(ds -> relationList.addAll(addRelations(ci, ds, ModelSupport.idPrefixEntity.get("10"))));
|
||||
|
||||
ci
|
||||
.getProjectList()
|
||||
.forEach(p -> relationList.addAll(addRelations(ci, p, ModelSupport.idPrefixEntity.get("40"))));
|
||||
|
||||
return relationList;
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new MyRuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Relation> addRelations(ContextInfo ci, String ds, String nodeType) {
|
||||
List<Relation> relationList = new ArrayList<>();
|
||||
String contextId = Utils.getContextId(ci.getId());
|
||||
relationList
|
||||
.add(
|
||||
Relation
|
||||
.newInstance(
|
||||
contextId, eu.dnetlib.dhp.oa.model.graph.Constants.CONTEXT_ENTITY,
|
||||
ds, nodeType,
|
||||
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
|
||||
Provenance
|
||||
.newInstance(
|
||||
Constants.USER_CLAIM,
|
||||
Constants.DEFAULT_TRUST)));
|
||||
|
||||
relationList
|
||||
.add(
|
||||
Relation
|
||||
.newInstance(
|
||||
ds, nodeType,
|
||||
contextId, eu.dnetlib.dhp.oa.model.graph.Constants.CONTEXT_ENTITY,
|
||||
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
|
||||
Provenance
|
||||
.newInstance(
|
||||
Constants.USER_CLAIM,
|
||||
Constants.DEFAULT_TRUST)));
|
||||
return relationList;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,252 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Relation;
|
||||
import it.unimi.dsi.fastutil.objects.Object2BooleanMap;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Reads all the entities of the same type (Relation / Results) and saves them in the same folder
|
||||
*/
|
||||
public class SparkCollectAndSave implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkCollectAndSave.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkCollectAndSave.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_collect_and_save.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final Boolean aggregateResult = Optional
|
||||
.ofNullable(parser.get("resultAggregation"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "/result");
|
||||
run(spark, inputPath, outputPath, aggregateResult);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
|
||||
if (aggregate) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/publication", GraphResult.class)
|
||||
.union(Utils.readPath(spark, inputPath + "/result/dataset", GraphResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/result/software", GraphResult.class))
|
||||
.map(
|
||||
(MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(outputPath + "/result");
|
||||
} else {
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/publication", GraphResult.class),
|
||||
outputPath + "/publication");
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/dataset", GraphResult.class),
|
||||
outputPath + "/dataset");
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class),
|
||||
outputPath + "/otherresearchproduct");
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/software", GraphResult.class),
|
||||
outputPath + "/software");
|
||||
|
||||
}
|
||||
|
||||
// Dataset<String> dumpedIds = Utils.getEntitiesId(spark, outputPath);
|
||||
|
||||
// Dataset<Relation> relations = Utils
|
||||
// .readPath(spark, inputPath + "/relation/publication", Relation.class)
|
||||
// .union(Utils.readPath(spark, inputPath + "/relation/dataset", Relation.class))
|
||||
// .union(Utils.readPath(spark, inputPath + "/relation/orp", Relation.class))
|
||||
// .union(Utils.readPath(spark, inputPath + "/relation/software", Relation.class))
|
||||
// .union(Utils.readPath(spark, inputPath + "/relation/contextOrg", Relation.class))
|
||||
// .union(Utils.readPath(spark, inputPath + "/relation/context", Relation.class))
|
||||
// .union(Utils.readPath(spark, inputPath + "/relation/relation", Relation.class));
|
||||
|
||||
// Utils.getValidRelations(relations, Utils.getEntitiesId(spark, outputPath))
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/publication", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/dataset", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/orp", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/software", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/contextOrg", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/context", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/relation/relation", Relation.class)
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/publication", Relation.class),
|
||||
// inputPath + "/relSource/publication");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/dataset", Relation.class),
|
||||
// inputPath + "/relSource/dataset");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/orp", Relation.class),
|
||||
// inputPath + "/relSource/orp");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/software", Relation.class),
|
||||
// inputPath + "/relSource/software");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/contextOrg", Relation.class),
|
||||
// inputPath + "/relSource/contextOrg");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/context", Relation.class),
|
||||
// inputPath + "/relSource/context");
|
||||
// relSource(
|
||||
// inputPath, dumpedIds, Utils
|
||||
// .readPath(spark, inputPath + "/relation/relation", Relation.class),
|
||||
// inputPath + "/relSource/relation");
|
||||
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/publication", Relation.class),
|
||||
// SaveMode.Overwrite);
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/dataset", Relation.class),
|
||||
// SaveMode.Append);
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/orp", Relation.class),
|
||||
// SaveMode.Append);
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/software", Relation.class),
|
||||
// SaveMode.Append);
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/contextOrg", Relation.class),
|
||||
// SaveMode.Append);
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/context", Relation.class),
|
||||
// SaveMode.Append);
|
||||
// relTarget(
|
||||
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/relation", Relation.class),
|
||||
// SaveMode.Append);
|
||||
|
||||
}
|
||||
|
||||
private static void relTarget(String outputPath, Dataset<String> dumpedIds, Dataset<Relation> relJoinSource,
|
||||
SaveMode saveMode) {
|
||||
relJoinSource
|
||||
.joinWith(dumpedIds, relJoinSource.col("target").equalTo(dumpedIds.col("value")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
|
||||
Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.mode(saveMode)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/relation");
|
||||
}
|
||||
|
||||
private static void relSource(String inputPath, Dataset<String> dumpedIds, Dataset<Relation> relations,
|
||||
String outputPath) {
|
||||
|
||||
relations
|
||||
.joinWith(dumpedIds, relations.col("source").equalTo(dumpedIds.col("value")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
|
||||
Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static void write(Dataset<GraphResult> dataSet, String outputPath) {
|
||||
dataSet
|
||||
.map((MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(outputPath);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,639 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.ForeachFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
|
||||
import eu.dnetlib.dhp.oa.model.Container;
|
||||
import eu.dnetlib.dhp.oa.model.Result;
|
||||
import eu.dnetlib.dhp.oa.model.graph.*;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Datasource;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Organization;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Project;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
* Spark Job that fires the dump for the entites
|
||||
*/
|
||||
public class SparkDumpEntitiesJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkDumpEntitiesJob.class);
|
||||
public static final String COMPRESSION = "compression";
|
||||
public static final String GZIP = "gzip";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkDumpEntitiesJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
Optional<String> communityMap = Optional.ofNullable(parser.get("communityMapPath"));
|
||||
String communityMapPath = null;
|
||||
if (communityMap.isPresent())
|
||||
communityMapPath = communityMap.get();
|
||||
|
||||
Class<? extends OafEntity> inputClazz = (Class<? extends OafEntity>) Class.forName(resultClassName);
|
||||
|
||||
run(isSparkSessionManaged, inputPath, outputPath, communityMapPath, inputClazz);
|
||||
|
||||
}
|
||||
|
||||
private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath,
|
||||
Class<? extends OafEntity> inputClazz) {
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
switch (ModelSupport.idPrefixMap.get(inputClazz)) {
|
||||
case "50":
|
||||
String finalCommunityMapPath = communityMapPath;
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
resultDump(
|
||||
spark, inputPath, outputPath, finalCommunityMapPath, inputClazz);
|
||||
});
|
||||
|
||||
break;
|
||||
case "40":
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
projectMap(spark, inputPath, outputPath, inputClazz);
|
||||
|
||||
});
|
||||
break;
|
||||
case "20":
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
organizationMap(spark, inputPath, outputPath, inputClazz);
|
||||
|
||||
});
|
||||
break;
|
||||
case "10":
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
datasourceMap(spark, inputPath, outputPath, inputClazz);
|
||||
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public static <I extends OafEntity> void resultDump(
|
||||
SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
String communityMapPath,
|
||||
Class<I> inputClazz) {
|
||||
|
||||
CommunityMap communityMap = null;
|
||||
if (!StringUtils.isEmpty(communityMapPath))
|
||||
communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
|
||||
CommunityMap finalCommunityMap = communityMap;
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map(
|
||||
(MapFunction<I, GraphResult>) value -> execMap(value, finalCommunityMap),
|
||||
Encoders.bean(GraphResult.class))
|
||||
.filter((FilterFunction<GraphResult>) value -> value != null)
|
||||
.map((MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.text(outputPath);
|
||||
|
||||
}
|
||||
|
||||
private static <I extends OafEntity, O extends Result> O execMap(I value,
|
||||
CommunityMap communityMap) throws NoAvailableEntityTypeException, CardinalityTooHighException {
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
|
||||
if (Boolean.FALSE.equals(odInfo.isPresent())) {
|
||||
return null;
|
||||
}
|
||||
if (Boolean.TRUE.equals(odInfo.get().getDeletedbyinference())
|
||||
|| Boolean.TRUE.equals(odInfo.get().getInvisible())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return (O) ResultMapper.map(value, communityMap, Constants.DUMPTYPE.COMPLETE.getType());
|
||||
|
||||
}
|
||||
|
||||
private static <E extends OafEntity> void datasourceMap(SparkSession spark, String inputPath, String outputPath,
|
||||
Class<E> inputClazz) {
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map(
|
||||
(MapFunction<E, Datasource>) d -> mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) d),
|
||||
Encoders.bean(Datasource.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static <E extends OafEntity> void projectMap(SparkSession spark, String inputPath, String outputPath,
|
||||
Class<E> inputClazz) {
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map(
|
||||
(MapFunction<E, Project>) p -> mapProject((eu.dnetlib.dhp.schema.oaf.Project) p),
|
||||
Encoders.bean(Project.class))
|
||||
.filter((FilterFunction<Project>) p -> p != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static Datasource mapDatasource(eu.dnetlib.dhp.schema.oaf.Datasource d) {
|
||||
if (Boolean.TRUE.equals(d.getDataInfo().getDeletedbyinference()))
|
||||
return null;
|
||||
Datasource datasource = new Datasource();
|
||||
|
||||
datasource.setId(getEntityId(d.getId(), ENTITY_ID_SEPARATOR));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getOriginalId())
|
||||
.ifPresent(
|
||||
oId -> datasource.setOriginalId(oId.stream().filter(Objects::nonNull).collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getPid())
|
||||
.ifPresent(
|
||||
pids -> datasource
|
||||
.setPid(
|
||||
pids
|
||||
.stream()
|
||||
.map(p -> DatasourcePid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDatasourcetype())
|
||||
.ifPresent(
|
||||
dsType -> datasource
|
||||
.setDatasourcetype(DatasourceSchemeValue.newInstance(dsType.getClassid(), dsType.getClassname())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getOpenairecompatibility())
|
||||
.ifPresent(v -> datasource.setOpenairecompatibility(v.getClassname()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getOfficialname())
|
||||
.ifPresent(oname -> datasource.setOfficialname(oname.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getEnglishname())
|
||||
.ifPresent(ename -> datasource.setEnglishname(ename.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getWebsiteurl())
|
||||
.ifPresent(wsite -> datasource.setWebsiteurl(wsite.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getLogourl())
|
||||
.ifPresent(lurl -> datasource.setLogourl(lurl.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDateofvalidation())
|
||||
.ifPresent(dval -> datasource.setDateofvalidation(dval.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDescription())
|
||||
.ifPresent(dex -> datasource.setDescription(dex.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getSubjects())
|
||||
.ifPresent(
|
||||
sbjs -> datasource.setSubjects(sbjs.stream().map(sbj -> sbj.getValue()).collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getOdpolicies())
|
||||
.ifPresent(odp -> datasource.setPolicies(Arrays.asList(odp.getValue())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getOdlanguages())
|
||||
.ifPresent(
|
||||
langs -> datasource
|
||||
.setLanguages(langs.stream().map(lang -> lang.getValue()).collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getOdcontenttypes())
|
||||
.ifPresent(
|
||||
ctypes -> datasource
|
||||
.setContenttypes(ctypes.stream().map(ctype -> ctype.getValue()).collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getReleasestartdate())
|
||||
.ifPresent(rd -> datasource.setReleasestartdate(rd.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getReleaseenddate())
|
||||
.ifPresent(ed -> datasource.setReleaseenddate(ed.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getMissionstatementurl())
|
||||
.ifPresent(ms -> datasource.setMissionstatementurl(ms.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDatabaseaccesstype())
|
||||
.ifPresent(ar -> datasource.setAccessrights(ar.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDatauploadtype())
|
||||
.ifPresent(dut -> datasource.setUploadrights(dut.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDatabaseaccessrestriction())
|
||||
.ifPresent(dar -> datasource.setDatabaseaccessrestriction(dar.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getDatauploadrestriction())
|
||||
.ifPresent(dur -> datasource.setDatauploadrestriction(dur.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getVersioning())
|
||||
.ifPresent(v -> datasource.setVersioning(v.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getCitationguidelineurl())
|
||||
.ifPresent(cu -> datasource.setCitationguidelineurl(cu.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getPidsystems())
|
||||
.ifPresent(ps -> datasource.setPidsystems(ps.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getCertificates())
|
||||
.ifPresent(c -> datasource.setCertificates(c.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getPolicies())
|
||||
.ifPresent(ps -> datasource.setPolicies(ps.stream().map(p -> p.getValue()).collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(d.getJournal())
|
||||
.ifPresent(j -> datasource.setJournal(getContainer(j)));
|
||||
|
||||
// Optional
|
||||
// .ofNullable(d.getMeasures())
|
||||
// .ifPresent(m -> datasource.setIndicators(Utils.getIndicator(d.getMeasures())));
|
||||
|
||||
return datasource;
|
||||
|
||||
}
|
||||
|
||||
private static Container getContainer(Journal j) {
|
||||
Container c = new Container();
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getName())
|
||||
.ifPresent(n -> c.setName(n));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getIssnPrinted())
|
||||
.ifPresent(issnp -> c.setIssnPrinted(issnp));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getIssnOnline())
|
||||
.ifPresent(issno -> c.setIssnOnline(issno));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getIssnLinking())
|
||||
.ifPresent(isnl -> c.setIssnLinking(isnl));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getEp())
|
||||
.ifPresent(ep -> c.setEp(ep));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getIss())
|
||||
.ifPresent(iss -> c.setIss(iss));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getSp())
|
||||
.ifPresent(sp -> c.setSp(sp));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getVol())
|
||||
.ifPresent(vol -> c.setVol(vol));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getEdition())
|
||||
.ifPresent(edition -> c.setEdition(edition));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getConferencedate())
|
||||
.ifPresent(cdate -> c.setConferencedate(cdate));
|
||||
|
||||
Optional
|
||||
.ofNullable(j.getConferenceplace())
|
||||
.ifPresent(cplace -> c.setConferenceplace(cplace));
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
private static Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p) throws DocumentException {
|
||||
if (Boolean.TRUE.equals(p.getDataInfo().getDeletedbyinference()))
|
||||
return null;
|
||||
|
||||
Project project = new Project();
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getId())
|
||||
.ifPresent(id -> project.setId(getEntityId(id, ENTITY_ID_SEPARATOR)));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getWebsiteurl())
|
||||
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getCode())
|
||||
.ifPresent(code -> project.setCode(code.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getAcronym())
|
||||
.ifPresent(acronynim -> project.setAcronym(acronynim.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getTitle())
|
||||
.ifPresent(title -> project.setTitle(title.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getStartdate())
|
||||
.ifPresent(sdate -> project.setStartdate(sdate.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getEnddate())
|
||||
.ifPresent(edate -> project.setEnddate(edate.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getCallidentifier())
|
||||
.ifPresent(cide -> project.setCallidentifier(cide.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getKeywords())
|
||||
.ifPresent(key -> project.setKeywords(key.getValue()));
|
||||
|
||||
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
|
||||
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
|
||||
boolean mandate = false;
|
||||
if (omandate.isPresent()) {
|
||||
if (omandate.get().getValue().equals("true")) {
|
||||
mandate = true;
|
||||
}
|
||||
}
|
||||
if (oecsc39.isPresent()) {
|
||||
if (oecsc39.get().getValue().equals("true")) {
|
||||
mandate = true;
|
||||
}
|
||||
}
|
||||
|
||||
project.setOpenaccessmandateforpublications(mandate);
|
||||
project.setOpenaccessmandatefordataset(false);
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getEcarticle29_3())
|
||||
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true")));
|
||||
|
||||
project
|
||||
.setSubject(
|
||||
Optional
|
||||
.ofNullable(p.getSubjects())
|
||||
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
|
||||
.orElse(new ArrayList<>()));
|
||||
|
||||
Optional
|
||||
.ofNullable(p.getSummary())
|
||||
.ifPresent(summary -> project.setSummary(summary.getValue()));
|
||||
|
||||
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
|
||||
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
|
||||
Optional<Float> ototalcost = Optional.ofNullable(p.getTotalcost());
|
||||
|
||||
if (ocurrency.isPresent()) {
|
||||
if (ofundedamount.isPresent()) {
|
||||
if (ototalcost.isPresent()) {
|
||||
project
|
||||
.setGranted(
|
||||
Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
|
||||
} else {
|
||||
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
project
|
||||
.setH2020programme(
|
||||
Optional
|
||||
.ofNullable(p.getH2020classification())
|
||||
.map(
|
||||
classification -> classification
|
||||
.stream()
|
||||
.map(
|
||||
c -> Programme
|
||||
.newInstance(
|
||||
c.getH2020Programme().getCode(), c.getH2020Programme().getDescription()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(new ArrayList<>()));
|
||||
|
||||
Optional<List<Field<String>>> ofundTree = Optional
|
||||
.ofNullable(p.getFundingtree());
|
||||
List<Funder> funList = new ArrayList<>();
|
||||
if (ofundTree.isPresent()) {
|
||||
for (Field<String> fundingtree : ofundTree.get()) {
|
||||
funList.add(getFunder(fundingtree.getValue()));
|
||||
}
|
||||
}
|
||||
project.setFunding(funList);
|
||||
|
||||
// if (Optional.ofNullable(p.getMeasures()).isPresent()) {
|
||||
// project.setIndicators(Utils.getIndicator(p.getMeasures()));
|
||||
// }
|
||||
return project;
|
||||
}
|
||||
|
||||
public static Funder getFunder(String fundingtree) throws DocumentException {
|
||||
Funder f = new Funder();
|
||||
final Document doc;
|
||||
|
||||
doc = new SAXReader().read(new StringReader(fundingtree));
|
||||
f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
|
||||
f.setName(((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText());
|
||||
f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
|
||||
|
||||
String id = "";
|
||||
|
||||
StringBuilder bld = new StringBuilder();
|
||||
|
||||
int level = 0;
|
||||
List<org.dom4j.Node> nodes = doc.selectNodes("//funding_level_" + level);
|
||||
while (!nodes.isEmpty()) {
|
||||
for (org.dom4j.Node n : nodes) {
|
||||
|
||||
List node = n.selectNodes("./id");
|
||||
id = ((org.dom4j.Node) node.get(0)).getText();
|
||||
id = id.substring(id.indexOf("::") + 2);
|
||||
|
||||
node = n.selectNodes("./description");
|
||||
bld.append(((Node) node.get(0)).getText() + " - ");
|
||||
|
||||
}
|
||||
level += 1;
|
||||
nodes = doc.selectNodes("//funding_level_" + level);
|
||||
}
|
||||
String description = bld.toString();
|
||||
if (!id.equals("")) {
|
||||
Fundings fundings = new Fundings();
|
||||
fundings.setId(id);
|
||||
fundings.setDescription(description.substring(0, description.length() - 3).trim());
|
||||
f.setFunding_stream(fundings);
|
||||
}
|
||||
|
||||
return f;
|
||||
|
||||
}
|
||||
|
||||
private static <E extends OafEntity> void organizationMap(SparkSession spark, String inputPath, String outputPath,
|
||||
Class<E> inputClazz) {
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map(
|
||||
(MapFunction<E, Organization>) o -> mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) o),
|
||||
Encoders.bean(Organization.class))
|
||||
.filter((FilterFunction<Organization>) o -> o != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
|
||||
eu.dnetlib.dhp.schema.oaf.Organization org) {
|
||||
if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference()))
|
||||
return null;
|
||||
if (!Optional.ofNullable(org.getLegalname()).isPresent()
|
||||
&& !Optional.ofNullable(org.getLegalshortname()).isPresent())
|
||||
return null;
|
||||
|
||||
Organization organization = new Organization();
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalshortname())
|
||||
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalname())
|
||||
.ifPresent(value -> organization.setLegalname(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getWebsiteurl())
|
||||
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getAlternativeNames())
|
||||
.ifPresent(
|
||||
value -> organization
|
||||
.setAlternativenames(
|
||||
value
|
||||
.stream()
|
||||
.map(v -> v.getValue())
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getCountry())
|
||||
.ifPresent(
|
||||
value -> {
|
||||
if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
|
||||
organization
|
||||
.setCountry(
|
||||
eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getId())
|
||||
.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getPid())
|
||||
.ifPresent(
|
||||
value -> organization
|
||||
.setPid(
|
||||
value
|
||||
.stream()
|
||||
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
return organization;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,132 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.graph.RelType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
/**
|
||||
* Dumps eu.dnetlib.dhp.schema.oaf.Relation in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation
|
||||
*/
|
||||
public class SparkDumpRelationJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkDumpRelationJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkDumpRelationJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
Optional<String> rs = Optional.ofNullable(parser.get("removeSet"));
|
||||
final Set<String> removeSet = new HashSet<>();
|
||||
if (rs.isPresent()) {
|
||||
Collections.addAll(removeSet, rs.get().split(";"));
|
||||
}
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
dumpRelation(spark, inputPath, outputPath, removeSet);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath, Set<String> removeSet) {
|
||||
Dataset<Relation> relations = Utils.readPath(spark, inputPath, Relation.class);
|
||||
relations
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !removeSet.contains(r.getRelClass())
|
||||
&& !r.getSubRelType().equalsIgnoreCase("resultService"))
|
||||
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
|
||||
relNew
|
||||
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setReltype(
|
||||
RelType
|
||||
.newInstance(
|
||||
relation.getRelClass(),
|
||||
relation.getSubRelType()));
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
DataInfo dInfo = odInfo.get();
|
||||
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
|
||||
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
|
||||
relNew
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
dInfo.getProvenanceaction().getClassname(),
|
||||
dInfo.getTrust()));
|
||||
}
|
||||
}
|
||||
if (Boolean.TRUE.equals(relation.getValidated())) {
|
||||
relNew.setValidated(relation.getValidated());
|
||||
relNew.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
|
||||
return relNew;
|
||||
|
||||
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
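// Illustrative note (not part of the original class): the dump keeps only the entity portion of the OpenAIRE
// identifiers and derives the node type from the two-character id prefix, e.g. with hypothetical values:
//
//   String source = "50|doi_________::abc";                      // raw OpenAIRE id
//   String sourceId = getEntityId(source, ENTITY_ID_SEPARATOR);  // entity part of the id
//   String sourceType = ModelSupport.idPrefixEntity.get("50");   // entity type bound to the "50" prefix
//
// Relations whose relClass belongs to removeSet, or whose subRelType is "resultService", are filtered out.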
@ -1,54 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Spark job that fires the extraction of relations from entities
|
||||
*/
|
||||
public class SparkExtractRelationFromEntities implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkExtractRelationFromEntities.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkExtractRelationFromEntities.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
final String communityMapPath = parser.get("communityMapPath");
|
||||
|
||||
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||
|
||||
Extractor extractor = new Extractor();
|
||||
extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath);
|
||||
|
||||
}
|
||||
|
||||
}
@ -1,188 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelSupport.idPrefixMap;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.communityapi.model.CommunityEntityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.graph.RelType;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
/**
 * Creates new Relations between Context Entities and Organizations whose products are associated to the context. It
 * produces relations such as: organization <-> isRelatedTo <-> context
 */
|
||||
public class SparkOrganizationRelation implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkOrganizationRelation.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkOrganizationRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_organization_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
UtilCommunityAPI queryCommunityAPI = new UtilCommunityAPI();
|
||||
final CommunityEntityMap organizationMap = queryCommunityAPI.getCommunityOrganization();
|
||||
|
||||
final String serializedOrganizationMap = new Gson().toJson(organizationMap);
|
||||
log.info("organization map : {}", serializedOrganizationMap);
|
||||
|
||||
final String communityMapPath = parser.get("communityMapPath");
|
||||
log.info("communityMapPath: {}", communityMapPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
extractRelation(spark, inputPath, organizationMap, outputPath, communityMapPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void extractRelation(SparkSession spark, String inputPath, CommunityEntityMap organizationMap,
|
||||
String outputPath, String communityMapPath) {
|
||||
|
||||
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
|
||||
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
|
||||
|
||||
relationDataset.createOrReplaceTempView("relation");
|
||||
|
||||
List<eu.dnetlib.dhp.oa.model.graph.Relation> relList = new ArrayList<>();
|
||||
|
||||
Dataset<MergedRels> mergedRelsDataset = spark
|
||||
.sql(
|
||||
"SELECT target organizationId, source representativeId " +
|
||||
"FROM relation " +
|
||||
"WHERE datainfo.deletedbyinference = false " +
|
||||
"AND relclass = 'merges' " +
|
||||
"AND substr(source, 1, 2) = '20'")
|
||||
.as(Encoders.bean(MergedRels.class));
|
||||
|
||||
mergedRelsDataset.map((MapFunction<MergedRels, MergedRels>) mergedRels -> {
|
||||
if (organizationMap.containsKey(getEntityId(mergedRels.getOrganizationId(), ENTITY_ID_SEPARATOR))) {
|
||||
return mergedRels;
|
||||
}
|
||||
return null;
|
||||
}, Encoders.bean(MergedRels.class))
|
||||
.filter(Objects::nonNull)
|
||||
.collectAsList()
|
||||
.forEach(getMergedRelsConsumer(organizationMap, relList, communityMap));
|
||||
|
||||
organizationMap
|
||||
.keySet()
|
||||
.forEach(
|
||||
oId -> organizationMap
|
||||
.get(oId)
|
||||
.forEach(community -> {
|
||||
if (communityMap.containsKey(community)) {
|
||||
addRelations(relList, community, oId);
|
||||
}
|
||||
}));
|
||||
|
||||
spark
|
||||
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static Consumer<MergedRels> getMergedRelsConsumer(CommunityEntityMap organizationMap,
|
||||
List<eu.dnetlib.dhp.oa.model.graph.Relation> relList, CommunityMap communityMap) {
|
||||
return mergedRels -> {
|
||||
String oId = getEntityId(mergedRels.getOrganizationId(), ENTITY_ID_SEPARATOR);
|
||||
organizationMap
|
||||
.get(oId)
|
||||
.forEach(community -> {
|
||||
if (communityMap.containsKey(community)) {
|
||||
addRelations(
|
||||
relList, community, getEntityId(mergedRels.getRepresentativeId(), ENTITY_ID_SEPARATOR));
|
||||
}
|
||||
|
||||
});
|
||||
organizationMap.remove(oId);
|
||||
};
|
||||
}
|
||||
|
||||
private static void addRelations(List<eu.dnetlib.dhp.oa.model.graph.Relation> relList, String community,
|
||||
String organization) {
|
||||
|
||||
String id = Utils.getContextId(community);
|
||||
log.info("create relation for organization: {}", organization);
|
||||
relList
|
||||
.add(
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation
|
||||
.newInstance(
|
||||
id, Constants.CONTEXT_ENTITY,
|
||||
organization,
|
||||
ModelSupport.idPrefixEntity.get(idPrefixMap.get(Organization.class)),
|
||||
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
|
||||
Provenance
|
||||
.newInstance(
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.USER_CLAIM,
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)));
|
||||
|
||||
relList
|
||||
.add(
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation
|
||||
.newInstance(
|
||||
organization, ModelSupport.idPrefixEntity.get(idPrefixMap.get(Organization.class)),
|
||||
id, Constants.CONTEXT_ENTITY,
|
||||
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
|
||||
Provenance
|
||||
.newInstance(
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.USER_CLAIM,
|
||||
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)));
|
||||
}
|
||||
|
||||
}
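// Sketch of the output produced by addRelations (derived from the code above, values are illustrative): for each
// pair (community, organization) two symmetric relations are added, both typed isRelatedTo/relationship and both
// with provenance (USER_CLAIM, DEFAULT_TRUST):
//
//   contextId      --isRelatedTo-->  organizationId
//   organizationId --isRelatedTo-->  contextId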
@ -1,101 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.types.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
 * Selects the valid relations among those present in the graph. A relation is valid if it is not deletedbyinference
 * and if both its source and target nodes are present in the graph and are neither deleted by inference nor invisible.
 * To check this, a dataset with the ids of all the entities in the graph is built, and only the relations for which a
 * join with this dataset exists for both the source and the target are kept.
 */
|
||||
|
||||
public class SparkSelectValidRelationsJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSelectValidRelationsJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkSelectValidRelationsJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
selectValidRelation2(spark, inputPath, outputPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void selectValidRelation2(SparkSession spark, String inputPath, String outputPath) {
|
||||
final StructType structureSchema = new StructType()
|
||||
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> df = spark.createDataFrame(new ArrayList<Row>(), structureSchema);
|
||||
List<String> entities = Arrays
|
||||
.asList(
|
||||
"publication", "dataset", "otherresearchproduct", "software", "organization", "project", "datasource");
|
||||
for (String e : entities)
|
||||
df = df
|
||||
.union(
|
||||
spark
|
||||
.read()
|
||||
.schema(structureSchema)
|
||||
.json(inputPath + "/" + e)
|
||||
.filter("dataInfo.deletedbyinference != true and dataInfo.invisible != true"));
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> relations = spark
|
||||
.read()
|
||||
.schema(Encoders.bean(Relation.class).schema())
|
||||
.json(inputPath + "/relation")
|
||||
.filter("dataInfo.deletedbyinference == false");
|
||||
|
||||
relations
|
||||
.join(df, relations.col("source").equalTo(df.col("id")), "leftsemi")
|
||||
.join(df, relations.col("target").equalTo(df.col("id")), "leftsemi")
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
}
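// A rough SQL equivalent of the two leftsemi joins above (sketch, for illustration only; entity_ids is a
// hypothetical name for the union of non-deleted, visible entity ids built in selectValidRelation2):
//
//   SELECT r.* FROM relation r
//   WHERE EXISTS (SELECT 1 FROM entity_ids e WHERE e.id = r.source)
//     AND EXISTS (SELECT 1 FROM entity_ids e WHERE e.id = r.target)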
@ -1,136 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.country;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.MasterDuplicate;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.SparkDumpResult;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.criteria.VerbResolver;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.criteria.VerbResolverFactory;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.selectionconstraints.Param;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.subset.selectionconstraints.SelectionConstraints;
|
||||
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
 * @author miriam.baglioni
 * @Date 27/04/23
 * Selects the results associated with the given country (their ids are read from the prepared resultWithCountry path)
 */
|
||||
public class SparkFindResultWithCountry implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkFindResultWithCountry.class);
|
||||
|
||||
public static final String COMPRESSION = "compression";
|
||||
public static final String GZIP = "gzip";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkFindResultWithCountry.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/result_country_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String resultType = parser.get("resultType");
|
||||
log.info("resultType: {}", resultType);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
final String preparedInfoPath = parser.get("resultWithCountry");
|
||||
|
||||
Class<? extends eu.dnetlib.dhp.schema.oaf.Result> inputClazz = (Class<? extends eu.dnetlib.dhp.schema.oaf.Result>) Class
|
||||
.forName(resultClassName);
|
||||
|
||||
run(
|
||||
isSparkSessionManaged, inputPath, outputPath, inputClazz,
|
||||
resultType, preparedInfoPath);
|
||||
|
||||
}
|
||||
|
||||
private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath,
|
||||
|
||||
Class<? extends eu.dnetlib.dhp.schema.oaf.Result> inputClazz, String resultType, String preparedInfoPath) {
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "/original/" + resultType);
|
||||
|
||||
resultDump(
|
||||
spark, inputPath, outputPath, inputClazz, resultType, preparedInfoPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
public static <I extends eu.dnetlib.dhp.schema.oaf.Result> void resultDump(
|
||||
SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
Class<I> inputClazz,
|
||||
|
||||
String resultType,
|
||||
|
||||
String preparedInfoPath) {
|
||||
|
||||
Dataset<String> resultsWithCountry = spark.read().textFile(preparedInfoPath).distinct();
|
||||
|
||||
Dataset<I> result = Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.filter(
|
||||
(FilterFunction<I>) r -> !r.getDataInfo().getInvisible() && !r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
resultsWithCountry
|
||||
.joinWith(result, resultsWithCountry.col("value").equalTo(result.col("id")))
|
||||
.map((MapFunction<Tuple2<String, I>, I>) t2 -> t2._2(), Encoders.bean(inputClazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/original/" + resultType);
|
||||
|
||||
}
|
||||
|
||||
}
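// Assumption about the preparedInfoPath input: a plain text dataset with one result id per line, as produced by
// SparkFindResultsRelatedToCountry, e.g. (hypothetical id):
//
//   50|doi_________::0123456789abcdef0123456789abcdef
//
// The join above keeps only the non-deleted, visible results whose id appears in that list.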
@ -1,173 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.country;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
 * @author miriam.baglioni
 * @Date 27/04/23
 * Finds the ids of the results that are in relation with another entity having the given country,
 * or that have that country in their country list
 */
|
||||
public class SparkFindResultsRelatedToCountry implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkFindResultsRelatedToCountry.class);
|
||||
|
||||
public static final String COMPRESSION = "compression";
|
||||
public static final String GZIP = "gzip";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkFindResultsRelatedToCountry.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/result_related_country_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String country = parser.get("country");
|
||||
|
||||
run(
|
||||
isSparkSessionManaged, inputPath, outputPath, country);
|
||||
|
||||
}
|
||||
|
||||
private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath,
|
||||
String country) {
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
findRelatedEntities(
|
||||
spark, inputPath, outputPath, country);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
public static <I extends eu.dnetlib.dhp.schema.oaf.Result> void findRelatedEntities(
|
||||
SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
String country) {
|
||||
|
||||
Dataset<Project> projectsInCountry = Utils
|
||||
.readPath(spark, inputPath + "/project", Project.class)
|
||||
.filter((FilterFunction<Project>) p -> isCountryInFunderJurisdiction(p.getFundingtree(), country));
|
||||
|
||||
Dataset<Relation> relsProjectResults = Utils
|
||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equals(ModelConstants.PRODUCES));
|
||||
|
||||
projectsInCountry
|
||||
.joinWith(relsProjectResults, projectsInCountry.col("id").equalTo(relsProjectResults.col("source")))
|
||||
.map((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._2().getTarget(), Encoders.STRING())
|
||||
.write()
|
||||
.option(COMPRESSION, GZIP)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(outputPath);
|
||||
|
||||
Dataset<Organization> organizationsInCountry = Utils
|
||||
.readPath(spark, inputPath + "/organization", Organization.class)
|
||||
.filter(
|
||||
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
|
||||
&& o.getCountry().getClassid().equals(country));
|
||||
|
||||
Dataset<Relation> relsOrganizationResults = Utils
|
||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equals(ModelConstants.IS_AUTHOR_INSTITUTION_OF));
|
||||
|
||||
organizationsInCountry
|
||||
.joinWith(
|
||||
relsOrganizationResults,
|
||||
organizationsInCountry.col("id").equalTo(relsOrganizationResults.col("source")))
|
||||
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getTarget(), Encoders.STRING())
|
||||
.write()
|
||||
.option(COMPRESSION, GZIP)
|
||||
.mode(SaveMode.Append)
|
||||
.text(outputPath);
|
||||
|
||||
selectResultWithCountry(spark, inputPath, outputPath, country, "publication", Publication.class);
|
||||
selectResultWithCountry(
|
||||
spark, inputPath, outputPath, country, "dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
selectResultWithCountry(spark, inputPath, outputPath, country, "software", Software.class);
|
||||
selectResultWithCountry(
|
||||
spark, inputPath, outputPath, country, "otherresearchproduct", OtherResearchProduct.class);
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void selectResultWithCountry(SparkSession spark, String inputPath,
|
||||
String outputPath, String country, String type, Class<R> inputClazz) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/" + type, inputClazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible() &&
|
||||
p.getCountry() != null &&
|
||||
p.getCountry().stream().anyMatch(c -> c.getClassid().equals(country)))
|
||||
.map((MapFunction<R, String>) p -> p.getId(), Encoders.STRING())
|
||||
.write()
|
||||
.option(COMPRESSION, GZIP)
|
||||
.mode(SaveMode.Append)
|
||||
.text(outputPath);
|
||||
}
|
||||
|
||||
private static boolean isCountryInFunderJurisdiction(List<Field<String>> fundingtrees, String country) {
|
||||
try {
|
||||
final SAXReader reader = new SAXReader();
|
||||
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
|
||||
for (Field<String> fundingtree : fundingtrees) {
|
||||
final Document doc = reader.read(new StringReader(fundingtree.getValue()));
|
||||
if (((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText().equals(country)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} catch (DocumentException | SAXException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
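// Illustrative fundingtree fragment accepted by isCountryInFunderJurisdiction for country "NL"
// (hypothetical content; only the first //funder/jurisdiction node is compared with the requested country):
//
//   <fundingtree>
//     <funder>
//       <jurisdiction>NL</jurisdiction>
//     </funder>
//   </fundingtree>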
@ -1,102 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 05/05/23
|
||||
*/
|
||||
public class AuthorResult implements Serializable {
|
||||
private String authorId;
|
||||
private String firstName;
|
||||
private String lastName;
|
||||
private String fullName;
|
||||
private String orcid;
|
||||
private String resultId;
|
||||
private String rank;
|
||||
private Boolean fromOrcid;
|
||||
|
||||
public Boolean getFromOrcid() {
|
||||
return fromOrcid;
|
||||
}
|
||||
|
||||
public void setFromOrcid(Boolean fromOrcid) {
|
||||
this.fromOrcid = fromOrcid;
|
||||
}
|
||||
|
||||
public String getFullName() {
|
||||
return fullName;
|
||||
}
|
||||
|
||||
public void setFullName(String fullName) {
|
||||
this.fullName = fullName;
|
||||
}
|
||||
|
||||
public String getAuthorId() {
|
||||
return authorId;
|
||||
}
|
||||
|
||||
public void setAuthorId(String authorId) {
|
||||
this.authorId = authorId;
|
||||
}
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
|
||||
public String getRank() {
|
||||
return rank;
|
||||
}
|
||||
|
||||
public void setRank(String rank) {
|
||||
this.rank = rank;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return authorId;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.authorId = id;
|
||||
}
|
||||
|
||||
public String getFirstName() {
|
||||
return firstName;
|
||||
}
|
||||
|
||||
public void setFirstName(String firstName) {
|
||||
this.firstName = firstName;
|
||||
}
|
||||
|
||||
public String getLastName() {
|
||||
return lastName;
|
||||
}
|
||||
|
||||
public void setLastName(String lastName) {
|
||||
this.lastName = lastName;
|
||||
}
|
||||
|
||||
public String getOrcid() {
|
||||
return orcid;
|
||||
}
|
||||
|
||||
public void setOrcid(String orcid) {
|
||||
this.orcid = orcid;
|
||||
}
|
||||
|
||||
public void autosetId() {
|
||||
if (orcid != null) {
|
||||
authorId = DHPUtils.md5(orcid);
|
||||
} else {
|
||||
authorId = DHPUtils.md5(resultId + rank);
|
||||
}
|
||||
|
||||
}
|
||||
}
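// Example of autosetId() behaviour (illustrative values):
//   orcid = "0000-0002-1825-0097"               -> authorId = DHPUtils.md5("0000-0002-1825-0097")
//   orcid = null, resultId = "rid", rank = "3"  -> authorId = DHPUtils.md5("rid" + "3")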
@ -1,20 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;

import java.io.Serializable;

import org.apache.commons.lang.StringUtils;

/**
 * @author miriam.baglioni
 * @Date 10/05/23
 */
public class Constants implements Serializable {
	public final static String SEP = "\t";

	public static final String addQuotes(String id) {
		// if (StringUtils.isNotEmpty(id))
		// return "\"" + id + "\"";
		return id;
	}
}
@ -1,96 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.commons.lang3.StringUtils.split;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 09/05/23
|
||||
*/
|
||||
//STEP 1
|
||||
public class DumpCommunities implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class);
|
||||
private final BufferedWriter writer;
|
||||
private final static String HEADER = "id" + Constants.SEP + "name" + Constants.SEP + "acronym" + Constants.SEP
|
||||
+ " description \n";
|
||||
private final transient UtilCommunityAPI queryCommunityAPI;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
DumpCommunities.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String nameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", nameNode);
|
||||
|
||||
final List<String> communities = Arrays.asList(split(parser.get("communities"), ";"));
|
||||
|
||||
final DumpCommunities dc = new DumpCommunities(outputPath, nameNode);
|
||||
|
||||
dc.writeCommunity(communities);
|
||||
|
||||
}
|
||||
|
||||
private void writeCommunity(List<String> communities)
|
||||
throws IOException {
|
||||
writer.write(HEADER);
|
||||
writer.flush();
|
||||
|
||||
for (String community : queryCommunityAPI
|
||||
.getCommunityCsv(communities)) {
|
||||
writer
|
||||
.write(
|
||||
community);
|
||||
writer.write("\n");
|
||||
|
||||
}
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public DumpCommunities(String hdfsPath, String hdfsNameNode) throws Exception {
|
||||
final Configuration conf = new Configuration();
|
||||
queryCommunityAPI = new UtilCommunityAPI();
|
||||
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fileSystem.delete(hdfsWritePath, true);
|
||||
}
|
||||
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
|
||||
|
||||
writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
|
||||
|
||||
}
|
||||
|
||||
}
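// Expected shape of the produced file (assumption): the HEADER row followed by one tab-separated line per
// community returned by UtilCommunityAPI.getCommunityCsv, e.g. (hypothetical row):
//
//   <community id>	<name>	<acronym>	<description>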
@ -1,362 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.commons.lang3.StringUtils.remove;
|
||||
import static org.apache.commons.lang3.StringUtils.split;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collector;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.*;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 04/05/23
|
||||
*/
|
||||
//STEP 3
|
||||
public class SparkDumpResults implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkDumpResults.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkDumpResults.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String resultType = parser.get("resultType");
|
||||
log.info("resultType: {}", resultType);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
|
||||
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
// Utils.removeOutputDir(spark, outputPath);
|
||||
run(spark, inputPath, inputClazz, resultType, workingPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void run(SparkSession spark, String inputPath,
|
||||
Class<R> inputClazz, String resultType, String workingPath) {
|
||||
|
||||
Dataset<String> resultIds = spark.read().textFile(workingPath + "/resultIds");
|
||||
// resultIds.foreach((ForeachFunction<String>) r -> System.out.println(r));
|
||||
Dataset<R> results = Utils
|
||||
.readPath(spark, inputPath + "/" + resultType, inputClazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible());
|
||||
|
||||
resultIds
|
||||
.joinWith(results, resultIds.col("value").equalTo(results.col("id")))
|
||||
.map((MapFunction<Tuple2<String, R>, R>) t2 -> t2._2(), Encoders.bean(inputClazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/" + resultType + "/temp/result");
|
||||
|
||||
// map results
|
||||
results = Utils.readPath(spark, workingPath + "/" + resultType + "/temp/result", inputClazz);
|
||||
results
|
||||
.map(
|
||||
(MapFunction<R, CSVResult>) r -> mapResultInfo(r),
|
||||
Encoders.bean(CSVResult.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(workingPath + "/" + resultType + "/result");
|
||||
|
||||
// map relations between pid and result
|
||||
results
|
||||
.flatMap((FlatMapFunction<R, CSVPid>) r -> {
|
||||
List<CSVPid> pids = new ArrayList<>();
|
||||
if (Optional.ofNullable(r.getPid()).isPresent() && r.getPid().size() > 0) {
|
||||
pids.addAll(mapPid(r.getPid(), r.getId()));
|
||||
}
|
||||
return pids.iterator();
|
||||
}, Encoders.bean(CSVPid.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(workingPath + "/" + resultType + "/result_pid");
|
||||
|
||||
// map authors from the result
|
||||
// for each author in the result:
// if the author has an ORCID, the author id is derived from the ORCID (md5(orcid));
// if there is no ORCID, the id is built as md5(result_id + authorrank); when the rank is missing,
// the author's position within the author list is used instead, always hashed with md5
|
||||
results
|
||||
.flatMap((FlatMapFunction<R, AuthorResult>) r -> {
|
||||
int count = 0;
|
||||
List<AuthorResult> arl = new ArrayList<>();
|
||||
Set<String> authorIds = new HashSet<>();
|
||||
if (Optional.ofNullable(r.getAuthor()).isPresent()) {
|
||||
for (Author a : r.getAuthor()) {
|
||||
count += 1;
|
||||
AuthorResult ar = new AuthorResult();
|
||||
ar.setResultId(r.getId());
|
||||
if (Optional.ofNullable(a.getRank()).isPresent()) {
|
||||
if (a.getRank() > 0) {
|
||||
ar.setRank(String.valueOf(a.getRank()));
|
||||
} else {
|
||||
ar.setRank(String.valueOf(count));
|
||||
}
|
||||
}
|
||||
ar.setFirstName(removeBreaks(a.getName()));
|
||||
ar.setLastName(removeBreaks(a.getSurname()));
|
||||
ar.setFullName(removeBreaks(a.getFullname()));
|
||||
Tuple2<String, Boolean> orcid = getOrcid(a.getPid());
|
||||
if (Optional.ofNullable(orcid).isPresent()) {
|
||||
ar.setOrcid(orcid._1());
|
||||
ar.setFromOrcid(orcid._2());
|
||||
}
|
||||
|
||||
ar.autosetId();
|
||||
|
||||
if (!authorIds.contains(ar.getAuthorId())) {
|
||||
arl.add(ar);
|
||||
authorIds.add(ar.getAuthorId());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return arl.iterator();
|
||||
}, Encoders.bean(AuthorResult.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/" + resultType + "/temp/authorresult");
|
||||
|
||||
Dataset<AuthorResult> authorResult = Utils
|
||||
.readPath(spark, workingPath + "/" + resultType + "/temp/authorresult", AuthorResult.class);
|
||||
// map the relation between author and result
|
||||
authorResult
|
||||
.map(
|
||||
(MapFunction<AuthorResult, CSVRelResAut>) ar -> {
|
||||
CSVRelResAut ret = new CSVRelResAut();
|
||||
ret.setResult_id(ar.getResultId());
|
||||
ret.setAuthor_id(ar.getAuthorId());
|
||||
return ret;
|
||||
},
|
||||
Encoders.bean(CSVRelResAut.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(workingPath + "/" + resultType + "/result_author");
|
||||
|
||||
// map the authors in the working dir. They should not be repeated: if the same author id occurs more than once,
// the entry coming from ORCID, if any, is chosen
|
||||
authorResult
|
||||
.groupByKey((MapFunction<AuthorResult, String>) ar -> ar.getAuthorId(), Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, AuthorResult, CSVAuthor>) (k, it) -> {
|
||||
AuthorResult first = it.next();
|
||||
if (!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid())
|
||||
return getAuthorDump(first);
|
||||
while (it.hasNext()) {
|
||||
AuthorResult ar = it.next();
|
||||
if (ar.getFromOrcid())
|
||||
return getAuthorDump(ar);
|
||||
}
|
||||
return getAuthorDump(first);
|
||||
},
|
||||
Encoders.bean(CSVAuthor.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(workingPath + "/" + resultType + "/author");
|
||||
|
||||
}
|
||||
|
||||
private static List<CSVPid> mapPid(List<StructuredProperty> pid, String resultId) {
|
||||
return pid
|
||||
.stream()
|
||||
.map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase())
|
||||
.distinct()
|
||||
.map(p -> {
|
||||
CSVPid ret = new CSVPid();
|
||||
ret.setId(DHPUtils.md5(p + "@" + resultId));
|
||||
ret.setResult_id(resultId);
|
||||
ret.setPid(split(p, "@")[1]);
|
||||
ret.setType(split(p, "@")[0]);
|
||||
|
||||
return ret;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private static CSVAuthor getAuthorDump(AuthorResult ar) {
|
||||
CSVAuthor ret = new CSVAuthor();
|
||||
ret.setFirstname(ar.getFirstName());
|
||||
|
||||
ret.setId(ar.getAuthorId());
|
||||
ret.setLastname(ar.getLastName());
|
||||
|
||||
ret.setFullname(ar.getFullName());
|
||||
|
||||
if (ar.getOrcid() != null) {
|
||||
ret.setOrcid(ar.getOrcid());
|
||||
ret.setFromOrcid(ar.getFromOrcid());
|
||||
} else {
|
||||
ret.setOrcid("");
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
|
||||
if (!Optional.ofNullable(pid).isPresent())
|
||||
return null;
|
||||
if (pid.size() == 0)
|
||||
return null;
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.TRUE);
|
||||
}
|
||||
}
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String getFieldValue(Field<String> input) {
|
||||
if (input != null &&
|
||||
StringUtils.isNotEmpty(input.getValue())) {
|
||||
return removeBreaks(input.getValue());
|
||||
} else {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private static <R extends Result> CSVResult mapResultInfo(R r) {
|
||||
CSVResult ret = new CSVResult();
|
||||
ret.setId(removeBreaks(r.getId()));
|
||||
ret.setType(removeBreaks(r.getResulttype().getClassid()));
|
||||
ret.setTitle(getTitle(r.getTitle()));
|
||||
ret.setDescription(getAbstract(r.getDescription()));
|
||||
ret.setAccessright(removeBreaks(r.getBestaccessright().getClassid()));
|
||||
ret.setPublication_date(removeBreaks(getFieldValue(r.getDateofacceptance())));
|
||||
ret.setPublisher(removeBreaks(getFieldValue(r.getPublisher())));
|
||||
|
||||
if (Optional.ofNullable(r.getSubject()).isPresent())
|
||||
ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> {
|
||||
if (StringUtils.isNotEmpty(s.getValue()))
|
||||
return removeBreaks(s.getValue().toLowerCase());
|
||||
else
|
||||
return null;
|
||||
}).filter(Objects::nonNull).distinct().collect(Collectors.toList())));
|
||||
else
|
||||
ret.setKeywords("");
|
||||
|
||||
if (Optional.ofNullable(r.getCountry()).isPresent())
|
||||
ret
|
||||
.setCountry(
|
||||
String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList())));
|
||||
else
|
||||
ret.setCountry("");
|
||||
|
||||
if (Optional.ofNullable(r.getLanguage()).isPresent() && StringUtils.isNotEmpty(r.getLanguage().getClassid())) {
|
||||
ret.setLanguage(r.getLanguage().getClassid());
|
||||
} else {
|
||||
ret.setLanguage("");
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static String getAbstract(List<Field<String>> description) {
|
||||
if (description == null)
|
||||
return "";
|
||||
for (Field<String> abs : description) {
|
||||
if (StringUtils.isNotEmpty(abs.getValue())) {
|
||||
return removeBreaks(abs.getValue());
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private static String getTitle(List<StructuredProperty> titles) {
|
||||
String firstTitle = null;
|
||||
for (StructuredProperty title : titles) {
|
||||
if (StringUtils.isEmpty(firstTitle)) {
|
||||
if (StringUtils.isNotEmpty(title.getValue()))
|
||||
firstTitle = removeBreaks(title.getValue());
|
||||
}
|
||||
if (title.getQualifier().getClassid().equals(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())) {
|
||||
if (StringUtils.isNotEmpty(title.getValue()))
|
||||
return removeBreaks(title.getValue());
|
||||
}
|
||||
}
|
||||
if (firstTitle != null) {
|
||||
return removeBreaks(firstTitle);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private static String removeBreaks(String input) {
|
||||
if (StringUtils.isNotEmpty(input))
|
||||
return input
|
||||
.replace("\n", " ")
|
||||
.replace("\t", " ")
|
||||
.replace("\r", " ")
|
||||
// .replace("\\", " ")
|
||||
.replace("\"", " ");
|
||||
|
||||
return input;
|
||||
|
||||
}
|
||||
|
||||
}
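// Illustrative mapPid output for a result "rid" carrying a DOI pid (hypothetical values):
//
//   CSVPid{ id = DHPUtils.md5("doi@10.1234/abcd@rid"), result_id = "rid", pid = "10.1234/abcd", type = "doi" }
//
// The "classid@value" string is lower-cased and deduplicated per result before the identifier is computed.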
@ -1,133 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 10/05/23
|
||||
*/
|
||||
//STEP 4
|
||||
public class SparkMoveOnSigleDir implements Serializable {
|
||||
|
||||
// All the products saved in different directories are put under the same one.
// For the authors a reconciliation step must also be performed, since the same author id can be saved in more than
// one directory
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkMoveOnSigleDir.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
// Utils.removeOutputDir(spark, outputPath);
|
||||
run(spark, outputPath, workingPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void run(SparkSession spark, String outputPath,
|
||||
String workingPath) {
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingPath + "/publication/result", CSVResult.class)
|
||||
.union(Utils.readPath(spark, workingPath + "/dataset/result", CSVResult.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/software/result", CSVResult.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result", CSVResult.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("header", "true")
|
||||
.option("delimiter", Constants.SEP)
|
||||
.option("compression", "gzip")
|
||||
.csv(outputPath + "/result");
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingPath + "/publication/result_pid", CSVPid.class)
|
||||
.union(Utils.readPath(spark, workingPath + "/dataset/result_pid", CSVPid.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/software/result_pid", CSVPid.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_pid", CSVPid.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("header", "true")
|
||||
.option("delimiter", Constants.SEP)
|
||||
.option("compression", "gzip")
|
||||
.csv(outputPath + "/result_pid");
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingPath + "/publication/result_author", CSVRelResAut.class)
|
||||
.union(Utils.readPath(spark, workingPath + "/dataset/result_author", CSVRelResAut.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/software/result_author", CSVRelResAut.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_author", CSVRelResAut.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("header", "true")
|
||||
.option("delimiter", Constants.SEP)
|
||||
.option("compression", "gzip")
|
||||
.csv(outputPath + "/result_author");
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingPath + "/publication/author", CSVAuthor.class)
|
||||
.union(Utils.readPath(spark, workingPath + "/dataset/author", CSVAuthor.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/software/author", CSVAuthor.class))
|
||||
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/author", CSVAuthor.class))
|
||||
.groupByKey((MapFunction<CSVAuthor, String>) r -> r.getId(), Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, CSVAuthor, CSVAuthor>) (k, it) -> it.next(), Encoders.bean(CSVAuthor.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("header", "true")
|
||||
.option("delimiter", Constants.SEP)
|
||||
.option("compression", "gzip")
|
||||
.csv(outputPath + "/author");
|
||||
|
||||
}
|
||||
|
||||
}
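// Note on the author reconciliation above (observation, not part of the original source): groupByKey/mapGroups
// keeps a single CSVAuthor per author id, taking the first entry of each group; entries for the same id coming
// from different result-type directories are therefore collapsed into one row.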
@ -1,227 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVCitation;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 04/05/23
|
||||
*/
|
||||
//STEP 2
|
||||
public class SparkSelectResultsAndDumpRelations implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class);
|
||||
private static String RESULT_COMMUNITY_TABLE = "/result_community";
|
||||
private static String COMMUNITY_RESULT_IDS = "/communityResultIds";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkSelectResultsAndDumpRelations.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
|
||||
List<String> communityList = null;
|
||||
Optional<String> communities = Optional.ofNullable(parser.get("communities"));
|
||||
if (communities.isPresent()) {
|
||||
communityList = Arrays.asList(communities.get().split(";"));
|
||||
}
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
List<String> finalCommunityList = communityList;
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
// Utils.removeOutputDir(spark, outputPath);
|
||||
run(spark, inputPath, outputPath, workingPath, finalCommunityList);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void run(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingPath,
|
||||
List<String> communityList) {
|
||||
|
||||
// select the result ids related to the set of communities considered
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath + "/publication", Publication.class, communityList, workingPath + COMMUNITY_RESULT_IDS);
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + COMMUNITY_RESULT_IDS);
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath + "/software", Software.class, communityList, workingPath + COMMUNITY_RESULT_IDS);
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList,
|
||||
workingPath + COMMUNITY_RESULT_IDS);
|
||||
|
||||
// write the relations result communities
|
||||
writeCommunityResultRelations(
|
||||
spark, inputPath + "/publication", Publication.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
|
||||
writeCommunityResultRelations(
|
||||
spark, inputPath + "/dataset", Dataset.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
|
||||
writeCommunityResultRelations(
|
||||
spark, inputPath + "/software", Software.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
|
||||
writeCommunityResultRelations(
|
||||
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList,
|
||||
outputPath + RESULT_COMMUNITY_TABLE);
|
||||
|
||||
// select the relations with semantics cites
|
||||
org.apache.spark.sql.Dataset<Relation> relations = Utils
|
||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equals(ModelConstants.CITES));
|
||||
|
||||
// select the results target of the selected relations having as source one of the results related to the
|
||||
// communities
|
||||
org.apache.spark.sql.Dataset<String> resultIds = spark
|
||||
.read()
|
||||
.textFile(workingPath + COMMUNITY_RESULT_IDS)
|
||||
.distinct();
|
||||
|
||||
resultIds
|
||||
.joinWith(relations, resultIds.col("value").equalTo(relations.col("source")), "left")
|
||||
.flatMap((FlatMapFunction<Tuple2<String, Relation>, String>) t2 -> {
|
||||
if (Optional.ofNullable(t2._2()).isPresent()) {
|
||||
return Arrays.asList(t2._1(), t2._2().getTarget()).iterator();
|
||||
} else {
|
||||
return Arrays.asList(t2._1()).iterator();
|
||||
}
|
||||
}, Encoders.STRING())
|
||||
.distinct()
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
||||
.text(workingPath + "/resultIds");
|
||||
|
||||
resultIds
|
||||
.joinWith(relations, resultIds.col("value").equalTo(relations.col("source")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<String, Relation>, CSVCitation>) t2 -> mapToCitation(t2._2()),
|
||||
Encoders.bean(CSVCitation.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.option("header", "true")
|
||||
.option("delimiter", Constants.SEP)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.csv(outputPath + "/relation");
|
||||
|
||||
}
|
||||
|
||||
private static CSVCitation mapToCitation(Relation relation) {
|
||||
CSVCitation ret = new CSVCitation();
|
||||
ret.setId(DHPUtils.md5(relation.getSource() + relation.getRelClass().toLowerCase() + relation.getTarget()));
|
||||
ret.setResult_id_cites(relation.getSource());
|
||||
ret.setResult_id_cited(relation.getTarget());
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static <R extends Result> void writeCommunityResultRelations(SparkSession spark, String inputPath,
|
||||
Class<R> clazz, List<String> communityList, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath, clazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||
!p.getDataInfo().getInvisible())
|
||||
.flatMap((FlatMapFunction<R, CSVRELCommunityResult>) p -> {
|
||||
Set<String> inserted = new HashSet<>();
|
||||
List<CSVRELCommunityResult> ret = new ArrayList<>();
|
||||
|
||||
for (String context : p
|
||||
.getContext()
|
||||
.stream()
|
||||
.map(Context::getId)
|
||||
.distinct()
|
||||
.collect(Collectors.toList())) {
|
||||
String cId = context.contains("::")
|
||||
? context.substring(0, context.indexOf("::"))
|
||||
: context;
|
||||
if (communityList.contains(cId) && !inserted.contains(cId)) {
|
||||
CSVRELCommunityResult crc = new CSVRELCommunityResult();
|
||||
crc.setResult_id(p.getId());
|
||||
crc.setCommunity_id(DHPUtils.md5(cId));
|
||||
ret.add(crc);
|
||||
inserted.add(cId);
|
||||
}
|
||||
}
|
||||
return ret.iterator();
|
||||
}, Encoders.bean(CSVRELCommunityResult.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.option("header", "true")
|
||||
.option("delimiter", Constants.SEP)
|
||||
.csv(outputPath);
|
||||
}
|
||||
|
||||
private static <R extends Result> void writeCommunityRelatedIds(SparkSession spark, String inputPath,
|
||||
Class<R> clazz, List<String> communityList, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath, clazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||
!p.getDataInfo().getInvisible() &&
|
||||
isRelatedToCommunities(p, communityList))
|
||||
.map((MapFunction<R, String>) Result::getId, Encoders.STRING())
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.text(outputPath);
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> boolean isRelatedToCommunities(R p, List<String> communityList) {
|
||||
return p
|
||||
.getContext()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
c -> communityList.contains(c.getId()) ||
|
||||
(c.getId().contains("::")
|
||||
&& communityList.contains(c.getId().substring(0, c.getId().indexOf("::")))));
|
||||
}
|
||||
|
||||
}
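For reference, the snippet below is a minimal standalone sketch (not part of this commit) that recomputes the citation identifier built in mapToCitation above for a single citing/cited pair. It assumes DHPUtils.md5 is a plain MD5 hex digest and uses java.security.MessageDigest so it runs on its own; the result ids are invented.

// Hedged sketch: rebuild the id stored in the "id" column of the relation CSV.
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class CitationIdSketch {
	public static void main(String[] args) throws Exception {
		String source = "50|doi_________::aaa"; // hypothetical citing result id
		String target = "50|doi_________::bbb"; // hypothetical cited result id
		String relClass = "Cites";              // ModelConstants.CITES

		// same concatenation used by mapToCitation: source + lower-cased relation class + target
		byte[] digest = MessageDigest
			.getInstance("MD5")
			.digest((source + relClass.toLowerCase() + target).getBytes(StandardCharsets.UTF_8));

		StringBuilder hex = new StringBuilder();
		for (byte b : digest)
			hex.append(String.format("%02x", b & 0xff));

		System.out.println(hex); // assumed to match DHPUtils.md5(...) for the same input
	}
}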
|
|
@ -1,68 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 11/05/23
|
||||
*/
|
||||
public class CSVAuthor implements Serializable {
|
||||
private String id;
|
||||
private String firstname;
|
||||
private String lastname;
|
||||
private String fullname;
|
||||
private String orcid;
|
||||
private Boolean fromOrcid;
|
||||
|
||||
public Boolean getFromOrcid() {
|
||||
return fromOrcid;
|
||||
}
|
||||
|
||||
public void setFromOrcid(Boolean fromOrcid) {
|
||||
this.fromOrcid = fromOrcid;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = Constants.addQuotes(id);
|
||||
}
|
||||
|
||||
public String getFirstname() {
|
||||
return firstname;
|
||||
}
|
||||
|
||||
public void setFirstname(String firstname) {
|
||||
this.firstname = Constants.addQuotes(firstname);
|
||||
}
|
||||
|
||||
public String getLastname() {
|
||||
return lastname;
|
||||
}
|
||||
|
||||
public void setLastname(String lastname) {
|
||||
this.lastname = Constants.addQuotes(lastname);
|
||||
}
|
||||
|
||||
public String getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public void setFullname(String fullname) {
|
||||
this.fullname = Constants.addQuotes(fullname);
|
||||
}
|
||||
|
||||
public String getOrcid() {
|
||||
return orcid;
|
||||
}
|
||||
|
||||
public void setOrcid(String orcid) {
|
||||
this.orcid = Constants.addQuotes(orcid);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,40 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.csv.model;

import java.io.Serializable;

import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;

/**
 * @author miriam.baglioni
 * @Date 11/05/23
 */
public class CSVCitation implements Serializable {
	private String id;
	private String result_id_cites;
	private String result_id_cited;

	public String getId() {
		return id;
	}

	public void setId(String id) {
		this.id = Constants.addQuotes(id);
	}

	public String getResult_id_cites() {
		return result_id_cites;
	}

	public void setResult_id_cites(String result_id_cites) {
		this.result_id_cites = Constants.addQuotes(result_id_cites);
	}

	public String getResult_id_cited() {
		return result_id_cited;
	}

	public void setResult_id_cited(String result_id_cited) {
		this.result_id_cited = Constants.addQuotes(result_id_cited);
	}
}
|
|
@ -1,50 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 11/05/23
|
||||
*/
|
||||
public class CSVPid implements Serializable {
|
||||
|
||||
private String id;
|
||||
private String result_id;
|
||||
private String pid;
|
||||
private String type;
|
||||
|
||||
public String getResult_id() {
|
||||
return result_id;
|
||||
}
|
||||
|
||||
public void setResult_id(String result_id) {
|
||||
this.result_id = Constants.addQuotes(result_id);
|
||||
}
|
||||
|
||||
public String getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(String pid) {
|
||||
this.pid = Constants.addQuotes(pid);
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = Constants.addQuotes(type);
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = Constants.addQuotes(id);
|
||||
}
|
||||
}
|
|
@ -1,31 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.csv.model;

import java.io.Serializable;

import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;

/**
 * @author miriam.baglioni
 * @Date 11/05/23
 */
public class CSVRELCommunityResult implements Serializable {
	private String result_id;
	private String community_id;

	public String getResult_id() {
		return result_id;
	}

	public void setResult_id(String result_id) {
		this.result_id = Constants.addQuotes(result_id);
	}

	public String getCommunity_id() {
		return community_id;
	}

	public void setCommunity_id(String community_id) {
		this.community_id = Constants.addQuotes(community_id);
	}
}
|
|
@ -1,31 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 11/05/23
|
||||
*/
|
||||
public class CSVRelResAut implements Serializable {
|
||||
private String result_id;
|
||||
private String author_id;
|
||||
|
||||
public String getResult_id() {
|
||||
return result_id;
|
||||
}
|
||||
|
||||
public void setResult_id(String result_id) {
|
||||
this.result_id = Constants.addQuotes(result_id);
|
||||
}
|
||||
|
||||
public String getAuthor_id() {
|
||||
return author_id;
|
||||
}
|
||||
|
||||
public void setAuthor_id(String author_id) {
|
||||
this.author_id = Constants.addQuotes(author_id);
|
||||
}
|
||||
}
|
|
@ -1,113 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonGetter;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.annotation.JsonSetter;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import sun.swing.StringUIClientPropertyKey;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 11/05/23
|
||||
*/
|
||||
public class CSVResult implements Serializable {
|
||||
private String id;
|
||||
private String type;
|
||||
private String title;
|
||||
private String description;
|
||||
private String accessright;
|
||||
private String publication_date;
|
||||
private String publisher;
|
||||
private String keywords;
|
||||
private String country;
|
||||
private String language;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = Constants.addQuotes(id);
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = Constants.addQuotes(type);
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = Constants.addQuotes(title);
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = Constants.addQuotes(description);
|
||||
}
|
||||
|
||||
public String getAccessright() {
|
||||
return accessright;
|
||||
}
|
||||
|
||||
public void setAccessright(String accessright) {
|
||||
this.accessright = Constants.addQuotes(accessright);
|
||||
}
|
||||
|
||||
public String getPublication_date() {
|
||||
return publication_date;
|
||||
}
|
||||
|
||||
public void setPublication_date(String publication_date) {
|
||||
this.publication_date = Constants.addQuotes(publication_date);
|
||||
}
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(String publisher) {
|
||||
this.publisher = Constants.addQuotes(publisher);
|
||||
}
|
||||
|
||||
public String getKeywords() {
|
||||
return keywords;
|
||||
}
|
||||
|
||||
public void setKeywords(String keywords) {
|
||||
this.keywords = Constants.addQuotes(keywords);
|
||||
}
|
||||
|
||||
public String getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public void setCountry(String country) {
|
||||
this.country = Constants.addQuotes(country);
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = Constants.addQuotes(language);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,30 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.exceptions;
|
||||
|
||||
public class MyRuntimeException extends RuntimeException {
|
||||
|
||||
public MyRuntimeException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public MyRuntimeException(
|
||||
final String message,
|
||||
final Throwable cause,
|
||||
final boolean enableSuppression,
|
||||
final boolean writableStackTrace) {
|
||||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
|
||||
public MyRuntimeException(final String message, final Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public MyRuntimeException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public MyRuntimeException(final Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,129 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.funderresults;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.amazonaws.transform.SimpleTypeUnmarshallers;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.oa.model.community.Funder;
|
||||
import eu.dnetlib.dhp.oa.model.community.Project;
|
||||
import io.netty.util.internal.StringUtil;
|
||||
|
||||
/**
|
||||
* Splits the dumped results by funder and stores them in a folder named after the funder's namespace prefix (for all funders except the EC;
|
||||
* for the EC the folder name also includes the fundingStream, FP7 or H2020)
|
||||
*/
|
||||
public class SparkDumpFunderResults implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkDumpFunderResults.class);
|
||||
private static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkDumpFunderResults.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
writeResultProjectList(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<CommunityResult> result = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class)
|
||||
.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
|
||||
log.info("Number of result {}", result.count());
|
||||
|
||||
Dataset<String> tmp = result
|
||||
.flatMap((FlatMapFunction<CommunityResult, String>) cr -> cr.getProjects().stream().map(p -> {
|
||||
return getFunderName(p);
|
||||
}).collect(Collectors.toList()).iterator(), Encoders.STRING())
|
||||
.distinct();
|
||||
List<String> funderList = tmp.collectAsList();
|
||||
funderList.stream().parallel().forEach(funder -> {
|
||||
result
|
||||
.filter(
|
||||
(FilterFunction<CommunityResult>) r -> Optional.ofNullable(r.getProjects()).isPresent() &&
|
||||
r.getProjects().stream().anyMatch(p -> getFunderName(p).equals(funder)))
|
||||
.map((MapFunction<CommunityResult, String>) r -> MAPPER.writeValueAsString(r), Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(outputPath + "/" + funder);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static String getFunderName(Project p) {
|
||||
Optional<Funder> ofunder = Optional.ofNullable(p.getFunder());
|
||||
if (ofunder.isPresent()) {
|
||||
String fName = ofunder.get().getShortName();
|
||||
if (StringUtil.isNullOrEmpty(fName))
|
||||
return ofunder.get().getName();
|
||||
if (fName.equalsIgnoreCase("ec")) {
|
||||
fName += "_" + ofunder.get().getFundingStream();
|
||||
}
|
||||
return fName;
|
||||
} else {
|
||||
String fName = p.getId().substring(0, p.getId().indexOf("_")).toUpperCase();
|
||||
if (fName.equalsIgnoreCase("ec")) {
|
||||
if (p.getId().contains("he")) {
|
||||
fName += "_HE";
|
||||
} else if (p.getId().contains("h2020")) {
|
||||
fName += "_H2020";
|
||||
} else {
|
||||
fName += "_FP7";
|
||||
}
|
||||
} else if (fName.equalsIgnoreCase("conicytf")) {
|
||||
fName = "CONICYT";
|
||||
} else if (fName.equalsIgnoreCase("dfgf")) {
|
||||
fName = "DFG";
|
||||
} else if (fName.equalsIgnoreCase("tubitakf")) {
|
||||
fName = "TUBITAK";
|
||||
} else if (fName.equalsIgnoreCase("euenvagency")) {
|
||||
fName = "EEA";
|
||||
}
|
||||
return fName;
|
||||
}
|
||||
}
|
||||
|
||||
}
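As an aside, the folder naming performed by getFunderName above can be exercised in isolation. The sketch below (not part of this commit) reproduces only the fallback branch that derives the name from the project id prefix when no funder object is attached; the project ids are invented for illustration.

// Hedged sketch: fallback folder-name derivation, mirroring the else branch of getFunderName.
public class FunderNameSketch {

	static String folderFor(String projectId) {
		String fName = projectId.substring(0, projectId.indexOf("_")).toUpperCase();
		if (fName.equalsIgnoreCase("ec")) {
			if (projectId.contains("he"))
				fName += "_HE";
			else if (projectId.contains("h2020"))
				fName += "_H2020";
			else
				fName += "_FP7";
		} else if (fName.equalsIgnoreCase("conicytf")) {
			fName = "CONICYT";
		}
		return fName;
	}

	public static void main(String[] args) {
		System.out.println(folderFor("ec__h2020________::0001")); // EC_H2020
		System.out.println(folderFor("nsf_________::0002"));      // NSF
	}
}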
|
|
@ -1,120 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.funderresults;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.ResultProject;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Selects the results linked to projects. The dump is performed only for these results.
|
||||
* The code that performs the dump and extends the dumped results with the project-related information
|
||||
* is the same code used for the dump of the community products.
|
||||
*/
|
||||
public class SparkResultLinkedToProject implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkResultLinkedToProject.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkResultLinkedToProject.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters_link_prj.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
final String resultProjectsPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", resultProjectsPath);
|
||||
|
||||
String communityMapPath = parser.get("communityMapPath");
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
writeResultsLinkedToProjects(
|
||||
communityMapPath, spark, inputClazz, inputPath, outputPath, resultProjectsPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <R extends Result> void writeResultsLinkedToProjects(String communityMapPath, SparkSession spark,
|
||||
Class<R> inputClazz,
|
||||
String inputPath, String outputPath, String resultProjectsPath) {
|
||||
|
||||
Dataset<R> results = Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
!r.getDataInfo().getInvisible());
|
||||
Dataset<ResultProject> resultProjectDataset = Utils
|
||||
.readPath(spark, resultProjectsPath, ResultProject.class);
|
||||
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
results
|
||||
.joinWith(resultProjectDataset, results.col("id").equalTo(resultProjectDataset.col("resultId")))
|
||||
.map((MapFunction<Tuple2<R, ResultProject>, CommunityResult>) t2 -> {
|
||||
CommunityResult cr = (CommunityResult) ResultMapper
|
||||
.map(
|
||||
t2._1(),
|
||||
communityMap, Constants.DUMPTYPE.FUNDER.getType());
|
||||
if (cr != null) {
|
||||
cr.setProjects(t2._2().getProjectsList());
|
||||
}
|
||||
return cr;
|
||||
}, Encoders.bean(CommunityResult.class))
|
||||
.filter(Objects::nonNull)
|
||||
.map(
|
||||
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
|
||||
Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(outputPath);
|
||||
|
||||
}
|
||||
}
|
|
@ -1,270 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.organizationonly;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
|
||||
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
|
||||
import eu.dnetlib.dhp.oa.model.Container;
|
||||
import eu.dnetlib.dhp.oa.model.Provenance;
|
||||
import eu.dnetlib.dhp.oa.model.Result;
|
||||
import eu.dnetlib.dhp.oa.model.graph.*;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Datasource;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Organization;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Project;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Spark job that fires the dump for the entities
|
||||
*/
|
||||
public class SparkDumpOrganizationJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob.class);
|
||||
public static final String COMPRESSION = "compression";
|
||||
public static final String GZIP = "gzip";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
Boolean isSparkSessionManaged = Boolean.TRUE;
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = "/tmp/miriam/organizationsOnly/";
|
||||
log.info("outputPath: {}", outputPath);
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
// Utils.removeOutputDir(spark, outputPath);
|
||||
organizationMap(spark, inputPath, outputPath);
|
||||
// relationMap2(spark, inputPath, outputPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "relation", Relation.class)
|
||||
.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
|
||||
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
|
||||
relNew
|
||||
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setReltype(
|
||||
RelType
|
||||
.newInstance(
|
||||
relation.getRelClass(),
|
||||
relation.getSubRelType()));
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
DataInfo dInfo = odInfo.get();
|
||||
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
|
||||
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
|
||||
relNew
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
dInfo.getProvenanceaction().getClassname(),
|
||||
dInfo.getTrust()));
|
||||
}
|
||||
}
|
||||
if (Boolean.TRUE.equals(relation.getValidated())) {
|
||||
relNew.setValidated(relation.getValidated());
|
||||
relNew.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
|
||||
return relNew;
|
||||
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "relation");
|
||||
}
|
||||
|
||||
private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
|
||||
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
|
||||
Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
|
||||
organization
|
||||
.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
|
||||
.map(
|
||||
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json("/tmp/orgSource");
|
||||
|
||||
rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
|
||||
|
||||
organization
|
||||
.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
|
||||
.map(
|
||||
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json("/tmp/orgSourceTarget");
|
||||
|
||||
Utils
|
||||
.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
|
||||
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
|
||||
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
|
||||
relNew
|
||||
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
|
||||
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
|
||||
|
||||
relNew
|
||||
.setReltype(
|
||||
RelType
|
||||
.newInstance(
|
||||
relation.getRelClass(),
|
||||
relation.getSubRelType()));
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
DataInfo dInfo = odInfo.get();
|
||||
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
|
||||
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
|
||||
relNew
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
dInfo.getProvenanceaction().getClassname(),
|
||||
dInfo.getTrust()));
|
||||
}
|
||||
}
|
||||
if (Boolean.TRUE.equals(relation.getValidated())) {
|
||||
relNew.setValidated(relation.getValidated());
|
||||
relNew.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
|
||||
return relNew;
|
||||
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "relation");
|
||||
}
|
||||
|
||||
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
|
||||
.map(
|
||||
(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
|
||||
Encoders.bean(Organization.class))
|
||||
.filter((FilterFunction<Organization>) o -> o != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.json(outputPath + "/organization");
|
||||
}
|
||||
|
||||
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
|
||||
eu.dnetlib.dhp.schema.oaf.Organization org) {
|
||||
|
||||
Organization organization = new Organization();
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalshortname())
|
||||
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalname())
|
||||
.ifPresent(value -> organization.setLegalname(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getWebsiteurl())
|
||||
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getAlternativeNames())
|
||||
.ifPresent(
|
||||
value -> organization
|
||||
.setAlternativenames(
|
||||
value
|
||||
.stream()
|
||||
.map(v -> v.getValue())
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getCountry())
|
||||
.ifPresent(
|
||||
value -> {
|
||||
if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
|
||||
organization
|
||||
.setCountry(
|
||||
eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getId())
|
||||
.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getPid())
|
||||
.ifPresent(
|
||||
value -> organization
|
||||
.setPid(
|
||||
value
|
||||
.stream()
|
||||
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
return organization;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,87 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.projectssubset;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.model.graph.Project;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class ProjectsSubsetSparkJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(ProjectsSubsetSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
ProjectsSubsetSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
final String projectListPath = parser.get("projectListPath");
|
||||
log.info("projectListPath: {}", projectListPath);
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
getNewProjectList(spark, inputPath, outputPath, projectListPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void getNewProjectList(SparkSession spark, String inputPath, String outputPath,
|
||||
String projectListPath) {
|
||||
Dataset<String> projectList = spark.read().textFile(projectListPath);
|
||||
Dataset<Project> projects;
|
||||
projects = Utils
|
||||
.readPath(spark, inputPath, Project.class)
|
||||
.map((MapFunction<Project, Project>) p -> {
|
||||
p.setId("40|" + p.getId());
|
||||
return p;
|
||||
}, Encoders.bean(Project.class));
|
||||
projects
|
||||
.joinWith(projectList, projects.col("id").equalTo(projectList.col("value")), "left")
|
||||
.map((MapFunction<Tuple2<Project, String>, Project>) t2 -> {
|
||||
if (Optional.ofNullable(t2._2()).isPresent())
|
||||
return null;
|
||||
return t2._1();
|
||||
}, Encoders.bean(Project.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
Utils
|
||||
.readPath(spark, outputPath, Project.class)
|
||||
.map((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.text(projectListPath);
|
||||
}
|
||||
}
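A note on the selection above: the join followed by the null check in getNewProjectList keeps only the projects whose id does not already appear in the project list. The sketch below (not part of this commit) shows the same effect with Spark's built-in left_anti join on toy in-memory data; it is an equivalent formulation, not what the job actually does.

// Hedged sketch: left_anti join as an alternative to join-then-null-check, on invented ids.
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class LeftAntiSketch {
	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("LeftAntiSketch").getOrCreate();

		// project ids already prefixed with "40|", as done in getNewProjectList
		Dataset<Row> projects = spark
			.createDataset(Arrays.asList("40|p1", "40|p2", "40|p3"), Encoders.STRING())
			.toDF("id");
		// ids of projects dumped in previous rounds
		Dataset<Row> alreadyDumped = spark
			.createDataset(Arrays.asList("40|p2"), Encoders.STRING())
			.toDF("value");

		// keep only the projects whose id is NOT in the already dumped list
		projects
			.join(alreadyDumped, projects.col("id").equalTo(alreadyDumped.col("value")), "left_anti")
			.show();

		spark.stop();
	}
}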
|
|
@ -1,241 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.serafeim;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 04/05/23
|
||||
*/
|
||||
//STEP 2
|
||||
public class SparkSelectResultsAndDumpRelations implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class);
|
||||
private static String RESULT_COMMUNITY_TABLE = "/result_community";
|
||||
private static String COMMUNITY_RESULT_IDS = "/communityResultIds";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkSelectResultsAndDumpRelations.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
|
||||
List<String> communityList = null;
|
||||
Optional<String> communities = Optional.ofNullable(parser.get("communities"));
|
||||
if (communities.isPresent()) {
|
||||
communityList = Arrays.asList(communities.get().split(";"));
|
||||
}
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
List<String> finalCommunityList = communityList;
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
run(spark, inputPath, outputPath, workingPath, finalCommunityList);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void run(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingPath,
|
||||
List<String> communityList) {
|
||||
|
||||
// select the result ids related to the set of communities considered
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath, Publication.class, communityList, workingPath, "publication");
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath, Dataset.class, communityList, workingPath, "dataset");
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath, Software.class, communityList, workingPath, "software");
|
||||
writeCommunityRelatedIds(
|
||||
spark, inputPath, OtherResearchProduct.class, communityList,
|
||||
workingPath, "otherresearchproduct");
|
||||
|
||||
// select the relations with semantics cites
|
||||
org.apache.spark.sql.Dataset<Relation> relations = Utils
|
||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equals(ModelConstants.CITES));
|
||||
|
||||
// select the relations having as source one of the results related to the
|
||||
// communities
|
||||
org.apache.spark.sql.Dataset<String> communityResultIds = spark
|
||||
.read()
|
||||
.textFile(workingPath + COMMUNITY_RESULT_IDS)
|
||||
.distinct();
|
||||
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/publication", Publication.class)
|
||||
.filter(
|
||||
(FilterFunction<Publication>) p -> !p.getDataInfo().getDeletedbyinference()
|
||||
&& !p.getDataInfo().getInvisible())
|
||||
.map((MapFunction<Publication, String>) p -> p.getId(), Encoders.STRING())
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/dataset", Dataset.class)
|
||||
.filter(
|
||||
(FilterFunction<Dataset>) p -> !p.getDataInfo().getDeletedbyinference()
|
||||
&& !p.getDataInfo().getInvisible())
|
||||
.map((MapFunction<Dataset, String>) p -> p.getId(), Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/software", Software.class)
|
||||
.filter(
|
||||
(FilterFunction<Software>) p -> !p.getDataInfo().getDeletedbyinference()
|
||||
&& !p.getDataInfo().getInvisible())
|
||||
.map((MapFunction<Software, String>) p -> p.getId(), Encoders.STRING()))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
|
||||
.filter(
|
||||
(FilterFunction<OtherResearchProduct>) p -> !p.getDataInfo().getDeletedbyinference()
|
||||
&& !p.getDataInfo().getInvisible())
|
||||
.map((MapFunction<OtherResearchProduct, String>) p -> p.getId(), Encoders.STRING()))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(workingPath + "/resultIds");
|
||||
|
||||
org.apache.spark.sql.Dataset<String> resultIds = spark.read().textFile(workingPath + "/resultIds");
|
||||
|
||||
org.apache.spark.sql.Dataset<Relation> oksource = communityResultIds
|
||||
.joinWith(relations, communityResultIds.col("value").equalTo(relations.col("source")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<String, Relation>, Relation>) t2 -> t2._2(),
|
||||
Encoders.bean(Relation.class));
|
||||
oksource
|
||||
.joinWith(resultIds, oksource.col("target").equalTo(resultIds.col("value")))
|
||||
.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath + "/relation");
|
||||
|
||||
writeNodes(
|
||||
spark, inputPath + "/publication", Publication.class, outputPath + "/publication",
|
||||
outputPath + "/relation", workingPath);
|
||||
writeNodes(
|
||||
spark, inputPath + "/dataset", Dataset.class, outputPath + "/dataset", outputPath + "/relation",
|
||||
workingPath);
|
||||
writeNodes(
|
||||
spark, inputPath + "/software", Software.class, outputPath + "/software", outputPath + "/relation",
|
||||
workingPath);
|
||||
writeNodes(
|
||||
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class,
|
||||
outputPath + "/otherresearchproduct", outputPath + "/relation", workingPath);
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void writeNodes(SparkSession spark, String inputPath, Class<R> clazz,
|
||||
String outputPath, String relationPath, String workingPath) {
|
||||
org.apache.spark.sql.Dataset<Relation> citingRelations = Utils.readPath(spark, relationPath, Relation.class);
|
||||
org.apache.spark.sql.Dataset<R> result = Utils
|
||||
.readPath(spark, inputPath, clazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||
!p.getDataInfo().getInvisible());
|
||||
|
||||
// take the distinct result id for source and target of the relations
|
||||
citingRelations
|
||||
.flatMap(
|
||||
(FlatMapFunction<Relation, String>) r -> Arrays
|
||||
.asList(r.getSource(), r.getTarget())
|
||||
.iterator(),
|
||||
Encoders.STRING())
|
||||
.distinct()
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(workingPath + "/relationIds");
|
||||
|
||||
org.apache.spark.sql.Dataset<String> relationIds = spark.read().textFile(workingPath + "/relationIds");
|
||||
|
||||
relationIds
|
||||
.joinWith(result, relationIds.col("value").equalTo(result.col("id")))
|
||||
.map((MapFunction<Tuple2<String, R>, R>) t2 -> t2._2(), Encoders.bean(clazz))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static <R extends Result> void writeCommunityRelatedIds(SparkSession spark, String inputPath,
|
||||
Class<R> clazz, List<String> communityList, String outputPath, String resultType) {
|
||||
org.apache.spark.sql.Dataset<R> results = Utils
|
||||
.readPath(spark, inputPath + "/" + resultType, clazz)
|
||||
.filter(
|
||||
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||
!p.getDataInfo().getInvisible() &&
|
||||
isRelatedToCommunities(p, communityList));
|
||||
results
|
||||
.map((MapFunction<R, String>) Result::getId, Encoders.STRING())
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.text(outputPath + COMMUNITY_RESULT_IDS);
|
||||
|
||||
// results
|
||||
// // .repartition(10000)
|
||||
// .write()
|
||||
// .option("compression", "gzip")
|
||||
// .mode(SaveMode.Append)
|
||||
// .json(outputPath + "/" + resultType);
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> boolean isRelatedToCommunities(R p, List<String> communityList) {
|
||||
return p
|
||||
.getContext()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
c -> communityList.contains(c.getId()) ||
|
||||
(c.getId().contains("::")
|
||||
&& communityList.contains(c.getId().substring(0, c.getId().indexOf("::")))));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,173 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.skgif.model.Identifier;
|
||||
import eu.dnetlib.dhp.skgif.model.Prefixes;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class DumpDatasource implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
DumpDatasource.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "Datasources");
|
||||
|
||||
mapDatasource(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
|
||||
&& !d.getDataInfo().getDeletedbyinference())
|
||||
.map((MapFunction<Datasource, eu.dnetlib.dhp.skgif.model.Datasource>) d -> {
|
||||
eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
|
||||
datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
|
||||
datasource
|
||||
.setIdentifiers(
|
||||
d
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
datasource.setName(d.getOfficialname().getValue());
|
||||
datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
|
||||
datasource
|
||||
.setJurisdiction(
|
||||
Optional
|
||||
.ofNullable(d.getJurisdiction())
|
||||
.map(v -> v.getClassid())
|
||||
.orElse(new String()));
|
||||
datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
|
||||
datasource.setVersion_control(d.getVersioncontrol());
|
||||
|
||||
datasource
|
||||
.setData_source_classification(
|
||||
Optional
|
||||
.ofNullable(d.getEoscdatasourcetype())
|
||||
.map(v -> v.getClassname())
|
||||
.orElse(new String()));
|
||||
datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
|
||||
datasource.setThematic(d.getThematic());
|
||||
datasource
|
||||
.setResearch_product_access_policy(
|
||||
Optional
|
||||
.ofNullable(d.getDatabaseaccesstype())
|
||||
.map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
|
||||
.orElse(new ArrayList<>()));
|
||||
datasource
|
||||
.setResearch_product_metadata_access_policy(
|
||||
Optional
|
||||
.ofNullable(d.getResearchproductmetadataaccesspolicies())
|
||||
.map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
|
||||
.orElse(new ArrayList<>()));
|
||||
return datasource;
|
||||
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "Datasource");
|
||||
}
|
||||
|
||||
private static List<String> getResearchProductAccessPolicy(List<String> value) {
|
||||
|
||||
return value
|
||||
.stream()
|
||||
.map(v -> getResearchProductAccessPolicy(v))
|
||||
.filter(Objects::nonNull)
|
||||
.map(v -> v.get(0))
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static List<String> getResearchProductAccessPolicy(String value) {
|
||||
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
|
||||
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
|
||||
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
|
||||
switch (value) {
|
||||
case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
|
||||
return Arrays.asList("open access");
|
||||
case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
|
||||
return Arrays.asList("restricted access");
|
||||
case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
|
||||
return Arrays.asList("metadata only access");
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> getEoscProductType(List<String> researchentitytypes) {
|
||||
|
||||
List<String> eoscProductType = new ArrayList<>();
|
||||
if (researchentitytypes != null) {
|
||||
|
||||
if (researchentitytypes.contains("Software"))
|
||||
eoscProductType.add("Research Software");
|
||||
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
|
||||
eoscProductType.add("Research Literature");
|
||||
if (researchentitytypes.contains("Research Data"))
|
||||
eoscProductType.add("Research Data");
|
||||
if (researchentitytypes.contains("Organization") ||
|
||||
researchentitytypes.contains("Organizations") ||
|
||||
researchentitytypes.contains("Services") ||
|
||||
researchentitytypes.contains("Projects"))
|
||||
eoscProductType.add("Other research product");
|
||||
}
|
||||
return eoscProductType;
|
||||
}
|
||||
}
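For clarity, the database access type mapping used by getResearchProductAccessPolicy above boils down to three fixed labels. The sketch below (not part of this commit) exercises that mapping standalone; unmapped values return null and are filtered out by the list variant of the method.

// Hedged sketch: access-right value to label mapping, mirrored from the switch above.
import java.util.Arrays;
import java.util.List;

public class AccessPolicySketch {

	static List<String> map(String value) {
		switch (value) {
			case "open": // https://vocabularies.coar-repositories.org/access_rights/c_abf2/
				return Arrays.asList("open access");
			case "restricted": // https://vocabularies.coar-repositories.org/access_rights/c_16ec/
				return Arrays.asList("restricted access");
			case "closed": // https://vocabularies.coar-repositories.org/access_rights/c_14cb/
				return Arrays.asList("metadata only access");
			default:
				return null;
		}
	}

	public static void main(String[] args) {
		System.out.println(map("open"));      // [open access]
		System.out.println(map("closed"));    // [metadata only access]
		System.out.println(map("embargoed")); // null, filtered out by the list variant
	}
}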
|
|
@ -0,0 +1,209 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.skgif.model.Grant;
|
||||
import eu.dnetlib.dhp.skgif.model.Identifier;
|
||||
import eu.dnetlib.dhp.skgif.model.Prefixes;
|
||||
import eu.dnetlib.dhp.skgif.model.RelationType;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 22/02/24
|
||||
*/
|
||||
public class DumpGrant implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
DumpGrant.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "Grant");
|
||||
|
||||
mapGrants(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void mapGrants(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<Project> projects = Utils
|
||||
.readPath(spark, inputPath + "project", Project.class)
|
||||
.filter(
|
||||
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||
!p.getDataInfo().getInvisible());
|
||||
Dataset<Relation> relations = Utils
|
||||
.readPath(spark, inputPath + "relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
!r.getDataInfo().getInvisible() &&
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label));
|
||||
projects
|
||||
.joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left")
|
||||
.groupByKey((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._1().getId(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, Grant>) (k, v) -> {
|
||||
Grant g = new Grant();
|
||||
Tuple2<Project, Relation> first = v.next();
|
||||
g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k));
|
||||
g.setIdentifiers(getProjectIdentifier(first._1()));
|
||||
g.setTitle(first._1().getTitle().getValue());
|
||||
g
|
||||
.setSummary(
|
||||
Optional
|
||||
.ofNullable(first._1().getSummary())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(new String()));
|
||||
g
|
||||
.setAcronym(
|
||||
Optional
|
||||
.ofNullable(first._1().getAcronym())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(new String()));
|
||||
g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue()));
|
||||
// * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
|
||||
g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue()));
|
||||
g
|
||||
.setCurrency(
|
||||
Optional
|
||||
.ofNullable(first._1().getCurrency())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(new String()));
|
||||
g
|
||||
.setFunded_amount(
|
||||
Optional
|
||||
.ofNullable(first._1().getFundedamount())
|
||||
.orElse(null));
|
||||
g
|
||||
.setKeywords(
|
||||
first
|
||||
._1()
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.map(s -> s.getValue())
|
||||
.collect(Collectors.toList()));
|
||||
g
|
||||
.setStart_date(
|
||||
Optional
|
||||
.ofNullable(first._1().getStartdate())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(new String()));
|
||||
g
|
||||
.setEnd_date(
|
||||
Optional
|
||||
.ofNullable(first._1().getEnddate())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(new String()));
|
||||
g
|
||||
.setWebsite(
|
||||
Optional
|
||||
.ofNullable(first._1().getWebsiteurl())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(new String()));
|
||||
if (Optional.ofNullable(first._2()).isPresent()) {
|
||||
List<String> relevantOrganizations = new ArrayList<>();
|
||||
relevantOrganizations.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource()));
|
||||
v
|
||||
.forEachRemaining(
|
||||
t2 -> relevantOrganizations
|
||||
.add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource())));
|
||||
g.setBeneficiaries(relevantOrganizations);
|
||||
}
|
||||
return g;
|
||||
}, Encoders.bean(Grant.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "Grant");
|
||||
}
|
||||
|
||||
private static String getFundingStream(String fundingtree) throws DocumentException {
|
||||
final Document doc;
|
||||
|
||||
doc = new SAXReader().read(new StringReader(fundingtree));
|
||||
if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
|
||||
doc.selectNodes("//funding_level_0").size() > 0)
|
||||
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
|
||||
return new String();
|
||||
|
||||
}
|
||||
|
||||
private static String getFunderName(String fundingtree) throws DocumentException {
|
||||
final Document doc;
|
||||
|
||||
doc = new SAXReader().read(new StringReader(fundingtree));
|
||||
// f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
|
||||
return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
|
||||
// f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
|
||||
|
||||
}
|
||||
|
||||
private static List<Identifier> getProjectIdentifier(Project project) throws DocumentException {
|
||||
List<Identifier> identifiers = new ArrayList<>();
|
||||
if (project.getPid().size() > 0)
|
||||
project
|
||||
.getPid()
|
||||
.stream()
|
||||
.forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())));
|
||||
identifiers
|
||||
.add(
|
||||
Identifier
|
||||
.newInstance(
|
||||
getFunderName(project.getFundingtree().get(0).getValue()), project.getCode().getValue()));
|
||||
return identifiers;
|
||||
|
||||
}
|
||||
}
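A hedged, self-contained sketch of the dom4j parsing used by getFunderName and getFundingStream above; the funding tree payload and the class name are illustrative only, the real XML comes from Project.getFundingtree().

import java.io.StringReader;
import java.util.List;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;

public class FundingTreeParsingSketch {

	public static void main(String[] args) throws DocumentException {
		// illustrative payload only
		String fundingtree = "<fundingtree>"
			+ "<funder><name>European Commission</name></funder>"
			+ "<funding_level_0>H2020-EU.1.1.</funding_level_0>"
			+ "</fundingtree>";

		Document doc = new SAXReader().read(new StringReader(fundingtree));

		// funder name from //funder/name, funding stream from //funding_level_0
		String funder = ((org.dom4j.Node) doc.selectNodes("//funder/name").get(0)).getText();

		List<?> nodes = doc.selectNodes("//funding_level_0");
		String stream = (nodes == null || nodes.isEmpty())
			? ""
			: ((org.dom4j.Node) nodes.get(0)).getText();

		System.out.println(funder + " / " + stream);
	}
}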
|
|
@ -0,0 +1,134 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.skgif.model.Identifier;
|
||||
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
|
||||
import eu.dnetlib.dhp.skgif.model.Prefixes;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class DumpOrganization implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
DumpOrganization.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/dump_organization_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "Organization");
|
||||
|
||||
mapOrganization(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
|
||||
organizations
|
||||
.filter(
|
||||
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
|
||||
&& !o.getDataInfo().getInvisible())
|
||||
.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
|
||||
eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
|
||||
organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
|
||||
organization
|
||||
.setCountry(
|
||||
Optional
|
||||
.ofNullable(o.getCountry().getClassid())
|
||||
.orElse(new String()));
|
||||
organization
|
||||
.setName(
|
||||
Optional
|
||||
.ofNullable(o.getLegalname().getValue())
|
||||
.orElse(new String()));
|
||||
organization
|
||||
.setShort_name(
|
||||
Optional
|
||||
.ofNullable(o.getLegalshortname())
|
||||
.map(v -> v.getValue())
|
||||
.orElse(new String()));
|
||||
organization
|
||||
.setIdentifiers(
|
||||
o
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
organization
|
||||
.setOther_names(
|
||||
o
|
||||
.getAlternativeNames()
|
||||
.stream()
|
||||
.map(a -> a.getValue())
|
||||
.collect(Collectors.toList()));
|
||||
organization.setType(getOrganizationType(o));
|
||||
return organization;
|
||||
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "Organization");
|
||||
}
|
||||
|
||||
private static String getOrganizationType(Organization o) {
|
||||
if (Optional.ofNullable(o.getEcenterprise()).isPresent()
|
||||
&& o.getEcenterprise().getValue().equalsIgnoreCase("true"))
|
||||
return OrganizationTypes.COMPANY.label;
|
||||
if (Optional.ofNullable(o.getEchighereducation()).isPresent()
|
||||
&& o.getEchighereducation().getValue().equalsIgnoreCase("true"))
|
||||
return OrganizationTypes.EDUCATION.label;
|
||||
if (Optional.ofNullable(o.getEcresearchorganization()).isPresent()
|
||||
&& o.getEcresearchorganization().getValue().equalsIgnoreCase("true"))
|
||||
return OrganizationTypes.EDUCATION.label;
|
||||
if (Optional.ofNullable(o.getEcnonprofit()).isPresent()
|
||||
&& o.getEcnonprofit().getValue().equalsIgnoreCase("true"))
|
||||
return OrganizationTypes.NONPROFIT.label;
|
||||
|
||||
return OrganizationTypes.OTHER.label;
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,345 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.skgif.model.*;
|
||||
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 06/02/24
|
||||
*/
|
||||
public class DumpResult implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpResult.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
DumpResult.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/dump_result_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, workingDir + "aggrelation");
|
||||
|
||||
mapResult(spark, inputPath, workingDir, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
// for each result we emit its id + the journal (if any) + the instance + the hostedBy of the instance
|
||||
public static <R extends Result> void mapResult(SparkSession spark, String inputPath,
|
||||
String workingDir, String outputPath) {
|
||||
|
||||
// selection of the relevant relations from the result type to the other entities. Only the semantically relevant ones are
|
||||
// considered
|
||||
selectRelations(spark, inputPath, workingDir);
|
||||
|
||||
// merge of relations and manifestation for the same result
|
||||
getRelationAndManifestation(spark, workingDir, inputPath);
|
||||
|
||||
// dump of the result and enrichment with relevant information for relations and manifestations
|
||||
dumpResult(spark, inputPath, workingDir, outputPath);
|
||||
|
||||
}
|
||||
|
||||
private static void getRelationAndManifestation(SparkSession spark, String workingDir, String inputPath) {
|
||||
Dataset<RelationPerProduct> aggRelations = Utils
|
||||
.readPath(spark, workingDir + "aggrelation", RelationPerProduct.class);
|
||||
aggRelations.count();
|
||||
ModelSupport.entityTypes
|
||||
.keySet()
|
||||
.stream()
|
||||
.filter(ModelSupport::isResult)
|
||||
.forEach(e -> {
|
||||
Utils.removeOutputDir(spark, workingDir + e.name() + "/partialresearchproduct");
|
||||
|
||||
Dataset<Datasource> datasource = Utils
|
||||
.readPath(spark, inputPath + "/datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEoscdatasourcetype()).isPresent() &&
|
||||
d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
|
||||
|
||||
Dataset<EmitPerManifestation> man = Utils
|
||||
.readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);
|
||||
|
||||
Dataset<PartialResearchProduct> partialResearchProduct = man
|
||||
.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<EmitPerManifestation, Datasource>, String>) t2 -> t2._1().getResultId(),
|
||||
Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, Tuple2<EmitPerManifestation, Datasource>, PartialResearchProduct>) (
|
||||
k, v) -> {
|
||||
PartialResearchProduct prp = new PartialResearchProduct();
|
||||
prp.setResultId(k);
|
||||
List<Manifestation> manifestationList = new ArrayList<>();
|
||||
while (v.hasNext())
|
||||
manifestationList.add(getManifestation(v.next()));
|
||||
prp.setManifestations(manifestationList);
|
||||
return prp;
|
||||
}, Encoders.bean(PartialResearchProduct.class));
|
||||
partialResearchProduct
|
||||
.joinWith(
|
||||
aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")),
|
||||
"left")
|
||||
.map(
|
||||
(MapFunction<Tuple2<PartialResearchProduct, RelationPerProduct>, PartialResearchProduct>) t2 -> {
|
||||
PartialResearchProduct prp = t2._1();
|
||||
if (Optional.ofNullable(t2._2()).isPresent()) {
|
||||
prp.setRelated_products(t2._2().getRelatedProduct());
|
||||
prp.setRelevant_organizations(t2._2().getOrganizations());
|
||||
prp.setFunding(t2._2().getFunding());
|
||||
}
|
||||
return prp;
|
||||
}, Encoders.bean(PartialResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/partialresearchproduct");
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private static Manifestation getManifestation(Tuple2<EmitPerManifestation, Datasource> t2) {
|
||||
|
||||
// if the left side of the join is present we have the biblio and the venue
|
||||
// if it is not, we only have the other values
|
||||
EmitPerManifestation epm = t2._1();
|
||||
Manifestation manifestation = new Manifestation();
|
||||
manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
|
||||
manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
|
||||
if (Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
|
||||
manifestation
|
||||
.setDates(
|
||||
Arrays
|
||||
.asList(
|
||||
Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
|
||||
if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
|
||||
switch (epm.getInstance().getRefereed().getClassid()) {
|
||||
case "0000":
|
||||
manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
|
||||
break;
|
||||
case "0001":
|
||||
manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
|
||||
break;
|
||||
case "0002":
|
||||
manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
|
||||
break;
|
||||
}
|
||||
|
||||
manifestation.setMetadata_curation("unavailable");
|
||||
if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
|
||||
switch (epm.getInstance().getAccessright().getClassid()) {
|
||||
case "OPEN":
|
||||
case "OPEN DATA":
|
||||
case "OPEN SOURCE":
|
||||
manifestation.setAccess_right(AccessRight.OPEN.label);
|
||||
break;
|
||||
case "CLOSED":
|
||||
manifestation.setAccess_right(AccessRight.CLOSED.label);
|
||||
break;
|
||||
case "RESTRICTED":
|
||||
manifestation.setAccess_right(AccessRight.RESTRICTED.label);
|
||||
break;
|
||||
case "EMBARGO":
|
||||
case "12MONTHS":
|
||||
case "6MONTHS":
|
||||
manifestation.setAccess_right(AccessRight.EMBARGO.label);
|
||||
break;
|
||||
default:
|
||||
manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
|
||||
|
||||
}
|
||||
|
||||
manifestation
|
||||
.setLicence(
|
||||
Optional
|
||||
.ofNullable(epm.getInstance().getLicense())
|
||||
.map(value -> value.getValue())
|
||||
.orElse(null));
|
||||
if (Optional.ofNullable(epm.getInstance().getUrl()).isPresent() && epm.getInstance().getUrl().size() > 0)
|
||||
manifestation
|
||||
.setUrl(epm.getInstance().getUrl().get(0));
|
||||
else
|
||||
manifestation.setUrl(null);
|
||||
|
||||
if (Optional.ofNullable(epm.getInstance().getPid()).isPresent() && epm.getInstance().getPid().size() > 0) {
|
||||
manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
|
||||
}
|
||||
if (Optional.ofNullable(t2._2()).isPresent()) {
|
||||
manifestation.setBiblio(getBiblio(epm));
|
||||
if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent())
|
||||
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnPrinted()));
|
||||
else if (Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent())
|
||||
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline()));
|
||||
}
|
||||
manifestation
|
||||
.setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()));
|
||||
|
||||
return manifestation;
|
||||
}
|
||||
|
||||
private static Biblio getBiblio(EmitPerManifestation epm) {
|
||||
Biblio biblio = new Biblio();
|
||||
biblio.setEdition(epm.getJournal().getEdition());
|
||||
biblio.setIssue(epm.getJournal().getIss());
|
||||
biblio.setPublisher(epm.getPublisher());
|
||||
biblio.setVolume(epm.getJournal().getVol());
|
||||
biblio.setEnd_page(epm.getJournal().getEp());
|
||||
biblio.setStart_page(epm.getJournal().getSp());
|
||||
return biblio;
|
||||
}
|
||||
|
||||
private static <R extends Result> void dumpResult(SparkSession spark, String inputPath, String workingDir,
|
||||
String outputPath) {
|
||||
ModelSupport.entityTypes
|
||||
.keySet()
|
||||
.parallelStream()
|
||||
.filter(ModelSupport::isResult)
|
||||
.forEach(e -> {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
Utils.removeOutputDir(spark, workingDir + e.name() + "/researchproduct");
|
||||
Dataset<R> results = Utils.readPath(spark, inputPath + e.name(), resultClazz);
|
||||
Dataset<PartialResearchProduct> prr = Utils
|
||||
.readPath(spark, workingDir + e.name() + "/partialresearchproduct", PartialResearchProduct.class);
|
||||
|
||||
results
|
||||
.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
|
||||
.map((MapFunction<Tuple2<R, PartialResearchProduct>, ResearchProduct>) t2 -> {
|
||||
ResearchProduct rp = ResultMapper.map(t2._1());
|
||||
if (Optional.ofNullable(t2._2()).isPresent()) {
|
||||
if (Optional.ofNullable(t2._2().getRelated_products()).isPresent())
|
||||
rp.setRelated_products(t2._2().getRelated_products());
|
||||
if (Optional.ofNullable(t2._2().getFunding()).isPresent())
|
||||
rp.setFunding(t2._2().getFunding());
|
||||
if (Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent())
|
||||
rp.setRelevant_organizations(t2._2().getRelevant_organizations());
|
||||
if (Optional.ofNullable(t2._2().getManifestations()).isPresent())
|
||||
rp.setManifestations(t2._2().getManifestations());
|
||||
}
|
||||
return rp;
|
||||
}, Encoders.bean(ResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/researchproduct");
|
||||
|
||||
});
|
||||
Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
|
||||
for (EntityType e : ModelSupport.entityTypes.keySet()) {
|
||||
if (ModelSupport.isResult(e))
|
||||
researchProducts = researchProducts
|
||||
.union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class));
|
||||
}
|
||||
researchProducts
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "ResearchProduct");
|
||||
|
||||
}
|
||||
|
||||
private static void selectRelations(SparkSession spark, String inputPath, String workingDir) {
|
||||
Dataset<Relation> relation = Utils
|
||||
.readPath(
|
||||
spark,
|
||||
inputPath + "relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
!r.getDataInfo().getInvisible())
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> r
|
||||
.getRelClass()
|
||||
.equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) ||
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) ||
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) ||
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label) ||
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.PART.label) ||
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) ||
|
||||
r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label));
|
||||
|
||||
relation
|
||||
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Relation, RelationPerProduct>) (k, v) -> {
|
||||
RelationPerProduct rpp = new RelationPerProduct();
|
||||
rpp.setResultId(k);
|
||||
Map<String, List<String>> remainingRelations = new HashMap<>();
|
||||
while (v.hasNext()) {
|
||||
Relation rel = v.next();
|
||||
String target = rel.getTarget();
|
||||
String relClass = rel.getRelClass();
|
||||
switch (rel.getRelClass().toLowerCase()) {
|
||||
case "hasauthorinstitution":
|
||||
rpp.getOrganizations().add(Utils.getIdentifier(Prefixes.ORGANIZATION, target));
|
||||
break;
|
||||
case "isproducedby":
|
||||
rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT, target));
|
||||
break;
|
||||
default:
|
||||
if (!remainingRelations.containsKey(relClass))
|
||||
remainingRelations.put(relClass, new ArrayList<>());
|
||||
remainingRelations
|
||||
.get(relClass)
|
||||
.add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target));
|
||||
}
|
||||
}
|
||||
for (String key : remainingRelations.keySet())
|
||||
rpp.getRelatedProduct().add(Relations.newInstance(key, remainingRelations.get(key)));
|
||||
return rpp;
|
||||
}, Encoders.bean(RelationPerProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + "/aggrelation");
|
||||
}
|
||||
|
||||
}
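A simplified, non-Spark sketch of the bucketing performed in selectRelations above: affiliation relations feed the organizations list, funding relations the funding list, and any other relation class is grouped as a related product. Plain placeholder strings stand in for the Utils.getIdentifier values.

import java.util.*;

public class RelationBucketingSketch {

	public static void main(String[] args) {
		// relClass -> target id pairs for a single result (illustrative values)
		List<String[]> relations = Arrays.asList(
			new String[] { "hasAuthorInstitution", "org-1" },
			new String[] { "isProducedBy", "grant-1" },
			new String[] { "IsSupplementedBy", "prod-2" },
			new String[] { "Cites", "prod-3" });

		List<String> organizations = new ArrayList<>();
		List<String> funding = new ArrayList<>();
		Map<String, List<String>> remainingRelations = new HashMap<>();

		for (String[] rel : relations) {
			String relClass = rel[0];
			String target = rel[1];
			switch (relClass.toLowerCase()) {
				case "hasauthorinstitution":
					organizations.add(target);
					break;
				case "isproducedby":
					funding.add(target);
					break;
				default:
					remainingRelations.computeIfAbsent(relClass, k -> new ArrayList<>()).add(target);
			}
		}

		System.out.println(organizations); // [org-1]
		System.out.println(funding); // [grant-1]
		System.out.println(remainingRelations); // {IsSupplementedBy=[prod-2], Cites=[prod-3]}
	}
}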
|
|
@ -0,0 +1,179 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.skgif.model.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 21/02/24
|
||||
*/
|
||||
public class DumpVenue implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpVenue.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
DumpVenue.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/dump_datasource_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "Venue");
|
||||
|
||||
mapVenue(spark, inputPath, outputPath, workingDir);
|
||||
});
|
||||
}
|
||||
|
||||
private static void mapVenue(SparkSession spark, String inputPath, String outputPath, String workingDir) {
|
||||
Dataset<EmitPerManifestation> manifestationDataset = Utils
|
||||
.readPath(spark, workingDir + "datasourcePublisher", EmitPerManifestation.class);
|
||||
Dataset<Datasource> datasourceDataset = Utils
|
||||
.readPath(spark, inputPath + "datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
|
||||
&& !d.getDataInfo().getDeletedbyinference()
|
||||
&& d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
|
||||
datasourceDataset
|
||||
.joinWith(
|
||||
manifestationDataset, datasourceDataset.col("id").equalTo(manifestationDataset.col("hostedby.key")),
|
||||
"left")
|
||||
.map((MapFunction<Tuple2<Datasource, EmitPerManifestation>, Venue>) t2 -> {
|
||||
Venue venue = new Venue();
|
||||
Datasource d = t2._1();
|
||||
if (Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent())
|
||||
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted()));
|
||||
else if (Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent())
|
||||
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline()));
|
||||
venue.setIdentifiers(getVenueIdentifier(d.getJournal()));
|
||||
venue.setName(d.getOfficialname().getValue());
|
||||
venue.setType(VenueType.JOURNAL.label);
|
||||
if (Optional.ofNullable(t2._2()).isPresent())
|
||||
venue.setPublisher(t2._2().getPublisher());
|
||||
venue.setAcronym(null);
|
||||
venue.setSeries(null);
|
||||
venue.setIs_currently_full_oa(null);
|
||||
venue.setCreation_date(null);
|
||||
venue.setContributions(null);
|
||||
return venue;
|
||||
}, Encoders.bean(Venue.class))
|
||||
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + "Venues");
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingDir + "Venues", Venue.class)
|
||||
.groupByKey((MapFunction<Venue, String>) v -> v.getLocal_identifier(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Venue, Venue>) (k, v) -> v.next(), Encoders.bean(Venue.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "Venues");
|
||||
}
|
||||
|
||||
private static List<Identifier> getVenueIdentifier(Journal journal) {
|
||||
List<Identifier> identifiers = new ArrayList<>();
|
||||
if (Optional.ofNullable((journal.getIssnOnline())).isPresent())
|
||||
identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline()));
|
||||
if (Optional.ofNullable(journal.getIssnPrinted()).isPresent())
|
||||
identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted()));
|
||||
if (Optional.ofNullable(journal.getIssnLinking()).isPresent())
|
||||
identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking()));
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
private static List<String> getResearchProductAccessPolicy(List<String> value) {
|
||||
|
||||
return value
|
||||
.stream()
|
||||
.map(v -> getResearchProductAccessPolicy(v))
|
||||
.filter(Objects::nonNull)
|
||||
.map(v -> v.get(0))
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static List<String> getResearchProductAccessPolicy(String value) {
|
||||
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
|
||||
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
|
||||
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
|
||||
switch (value) {
|
||||
case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
|
||||
return Arrays.asList("open access");
|
||||
case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
|
||||
return Arrays.asList("restricted access");
|
||||
case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
|
||||
return Arrays.asList("metadata only access");
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> getEoscProductType(List<String> researchentitytypes) {
|
||||
|
||||
List<String> eoscProductType = new ArrayList<>();
|
||||
if (researchentitytypes != null) {
|
||||
|
||||
if (researchentitytypes.contains("Software"))
|
||||
eoscProductType.add("Research Software");
|
||||
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
|
||||
eoscProductType.add("Research Literature");
|
||||
if (researchentitytypes.contains("Research Data"))
|
||||
eoscProductType.add("Research Data");
|
||||
if (researchentitytypes.contains("Organization") ||
|
||||
researchentitytypes.contains("Organizations") ||
|
||||
researchentitytypes.contains("Services") ||
|
||||
researchentitytypes.contains("Projects"))
|
||||
eoscProductType.add("Other research product");
|
||||
}
|
||||
return eoscProductType;
|
||||
}
|
||||
}
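A minimal sketch of the venue identifier construction in getVenueIdentifier above: each available ISSN flavour is emitted with its own scheme (eissn for the online ISSN, issn for the printed one, lissn for the linking one). Plain strings stand in for the Journal and Identifier model classes, and the ISSNs are illustrative.

import java.util.ArrayList;
import java.util.List;

public class VenueIdentifierSketch {

	static List<String> venueIdentifiers(String issnOnline, String issnPrinted, String issnLinking) {
		List<String> identifiers = new ArrayList<>();
		if (issnOnline != null)
			identifiers.add("eissn:" + issnOnline);
		if (issnPrinted != null)
			identifiers.add("issn:" + issnPrinted);
		if (issnLinking != null)
			identifiers.add("lissn:" + issnLinking);
		return identifiers;
	}

	public static void main(String[] args) {
		System.out.println(venueIdentifiers("1234-5678", "8765-4321", null)); // [eissn:1234-5678, issn:8765-4321]
	}
}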
|
|
@ -0,0 +1,281 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.skgif.model.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 06/02/24
|
||||
*/
|
||||
public class EmitFromResults implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(EmitFromResults.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
EmitFromResults.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/emit_biblio_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
emitFromResult(spark, inputPath, outputPath, workingDir);
|
||||
});
|
||||
}
|
||||
|
||||
// for each result we emit its id + the journal (if any) + the instance + the hostedBy of the instance
|
||||
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
emitManifestation(spark, inputPath, workingDir);
|
||||
emitPerson(spark, inputPath, outputPath, workingDir);
|
||||
emitTopic(spark, inputPath, outputPath, workingDir);
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void emitTopic(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||
if (ModelSupport.isResult(e)) {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
Utils
|
||||
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||
.filter((FilterFunction<R>) r -> Optional.ofNullable(r.getSubject()).isPresent())
|
||||
.flatMap(
|
||||
(FlatMapFunction<R, Topic>) r -> r
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(
|
||||
s -> s.getQualifier().getClassid().equalsIgnoreCase("fos")
|
||||
|| s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
|
||||
.map(s -> {
|
||||
Topic t = new Topic();
|
||||
t
|
||||
.setLocal_identifier(
|
||||
Utils
|
||||
.getIdentifier(
|
||||
Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
|
||||
t
|
||||
.setIdentifiers(
|
||||
Arrays
|
||||
.asList(
|
||||
Identifier.newInstance(s.getQualifier().getClassid(), s.getValue())));
|
||||
t.setName(s.getValue());
|
||||
return t;
|
||||
})
|
||||
.collect(Collectors.toList())
|
||||
.iterator(),
|
||||
Encoders.bean(Topic.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/topic");
|
||||
}
|
||||
});
|
||||
Dataset<Topic> topics = spark.emptyDataset(Encoders.bean(Topic.class));
|
||||
|
||||
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
|
||||
if (ModelSupport.isResult(entityType))
|
||||
topics = topics.union(Utils.readPath(spark, workingDir + entityType.name() + "/topic", Topic.class));
|
||||
}
|
||||
topics
|
||||
.groupByKey((MapFunction<Topic, String>) p -> p.getLocal_identifier(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Topic, Topic>) (k, v) -> v.next(), Encoders.bean(Topic.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/Topic");
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void emitPerson(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||
if (ModelSupport.isResult(e)) {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
Utils
|
||||
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||
.flatMap((FlatMapFunction<R, Persons>) r -> {
|
||||
List<Persons> authors = new ArrayList<>();
|
||||
|
||||
if (Optional.ofNullable(r.getAuthor()).isPresent() && r.getAuthor().size() > 0) {
|
||||
int count = 0;
|
||||
for (Author a : r.getAuthor()) {
|
||||
count += 1;
|
||||
Persons p = new Persons();
|
||||
p.setFamily_name(a.getSurname());
|
||||
p.setGiven_name(a.getName());
|
||||
String identifier = new String();
|
||||
if (Optional.ofNullable(a.getPid()).isPresent()) {
|
||||
Tuple2<String, Boolean> orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils
|
||||
.getOrcid(a.getPid());
|
||||
if (orcid != null) {
|
||||
identifier = Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2());
|
||||
if (orcid._2())
|
||||
p
|
||||
.setIdentifiers(
|
||||
Arrays.asList(Identifier.newInstance("orcid", orcid._1())));
|
||||
else
|
||||
p
|
||||
.setIdentifiers(
|
||||
Arrays
|
||||
.asList(Identifier.newInstance("inferred_orcid", orcid._1())));
|
||||
} else {
|
||||
if (Optional.ofNullable(a.getRank()).isPresent()) {
|
||||
identifier = Utils
|
||||
.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + a.getRank());
|
||||
} else {
|
||||
identifier = Utils
|
||||
.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + count);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
p.setLocal_identifier(identifier);
|
||||
authors.add(p);
|
||||
}
|
||||
|
||||
}
|
||||
return authors.iterator();
|
||||
}, Encoders.bean(Persons.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/person");
|
||||
}
|
||||
});
|
||||
Dataset<Persons> persons = spark.emptyDataset(Encoders.bean(Persons.class));
|
||||
|
||||
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
|
||||
if (ModelSupport.isResult(entityType))
|
||||
persons = persons
|
||||
.union(Utils.readPath(spark, workingDir + entityType.name() + "/person", Persons.class));
|
||||
}
|
||||
persons
|
||||
.groupByKey((MapFunction<Persons, String>) p -> p.getLocal_identifier(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Persons, Persons>) (k, v) -> v.next(), Encoders.bean(Persons.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/Persons");
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void emitManifestation(SparkSession spark, String inputPath, String workingDir) {
|
||||
Dataset<Datasource> datasource = Utils
|
||||
.readPath(spark, inputPath + "datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEosctype()).isPresent() &&
|
||||
d.getEosctype().getClassname().equalsIgnoreCase("Journal archive"));
|
||||
|
||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||
if (ModelSupport.isResult(e)) {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
// Dataset<EmitPerManifestation> emitformanifestation =
|
||||
Utils
|
||||
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||
.flatMap((FlatMapFunction<R, EmitPerManifestation>) p -> p.getInstance().stream().map(i -> {
|
||||
EmitPerManifestation epb = new EmitPerManifestation();
|
||||
epb.setResultId(p.getId());
|
||||
epb.setInstance(i);
|
||||
epb.setHostedBy(i.getHostedby().getKey());
|
||||
epb
|
||||
.setPublisher(
|
||||
Optional
|
||||
.ofNullable(p.getPublisher())
|
||||
.map(v -> v.getValue())
|
||||
.orElse(new String()));
|
||||
if (p.getClass() == Publication.class) {
|
||||
epb.setJournal(((Publication) p).getJournal());
|
||||
}
|
||||
return epb;
|
||||
}).collect(Collectors.toList()).iterator(), Encoders.bean(EmitPerManifestation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/manifestation");
|
||||
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Dataset<EmitPerManifestation> emitPerManifestationDataset = Utils
|
||||
.readPath(
|
||||
spark, workingDir + "software/manifestation", EmitPerManifestation.class)
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingDir + "dataset/manifestation", EmitPerManifestation.class))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingDir + "publication/manifestation", EmitPerManifestation.class))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingDir + "otherresearchproduct/manifestation", EmitPerManifestation.class));
|
||||
|
||||
emitPerManifestationDataset
|
||||
.groupByKey((MapFunction<EmitPerManifestation, String>) p -> p.getHostedBy(), Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, EmitPerManifestation, EmitPerManifestation>) (k, v) -> v.next(),
|
||||
Encoders.bean(EmitPerManifestation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + "/datasourcePublisher");
|
||||
}
|
||||
|
||||
}
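A simplified sketch of the person identifier decision taken in emitPerson above: an ORCID (authoritative or inferred) yields a stable person identifier, otherwise a temporary identifier is derived from the result id plus the author's rank or position. The prefixes and the hash below are placeholders for Prefixes.* and DHPUtils.md5.

import java.util.Optional;

public class PersonIdentifierSketch {

	static String personId(String orcid, Boolean authoritative, String resultId, Integer rank, int position) {
		if (orcid != null)
			// stable identifier when an ORCID is available
			return "PERSON::" + Integer.toHexString((orcid + authoritative).hashCode());
		// temporary identifier otherwise, based on the result id and the author's rank (or position)
		return "TEMPORARY_PERSON::" + Integer.toHexString(
			(resultId + String.valueOf(Optional.ofNullable(rank).orElse(position))).hashCode());
	}

	public static void main(String[] args) {
		System.out.println(personId("0000-0002-1825-0097", Boolean.TRUE, null, null, 0));
		System.out.println(personId(null, null, "50|doi_________::abc", 2, 1));
	}
}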
|
|
@ -0,0 +1,176 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoAllowedTypeException;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoTitleFoundException;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.skgif.model.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class ResultMapper implements Serializable {
|
||||
|
||||
public static <E extends Result> ResearchProduct map(
|
||||
E input)
|
||||
throws Exception {
|
||||
|
||||
ResearchProduct out = new ResearchProduct();
|
||||
|
||||
Optional<Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
try {
|
||||
out.setLocal_identifier(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, input.getId()));
|
||||
mapPid(out, input);
|
||||
mapTitle(out, input);
|
||||
mapAbstract(out, input);
|
||||
mapType(out, input);
|
||||
mapTopic(out, input);
|
||||
mapContribution(out, input);
|
||||
|
||||
// the manifestations will be added by extending the result, together with the relations to funders, organizations and other results
|
||||
return out;
|
||||
} catch (ClassCastException cce) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapContribution(ResearchProduct out, E input) {
|
||||
if (Optional.ofNullable(input.getAuthor()).isPresent()) {
|
||||
int count = 0;
|
||||
List<Contribution> contributionList = new ArrayList<>();
|
||||
for (Author a : input.getAuthor()) {
|
||||
count += 1;
|
||||
Contribution contribution = new Contribution();
|
||||
Tuple2<String, Boolean> orcid = Utils.getOrcid(a.getPid());
|
||||
if (orcid != null) {
|
||||
contribution.setPerson(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2()));
|
||||
} else {
|
||||
if (Optional.ofNullable(a.getRank()).isPresent()) {
|
||||
contribution
|
||||
.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + a.getRank()));
|
||||
} else {
|
||||
contribution.setPerson(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + count));
|
||||
}
|
||||
|
||||
}
|
||||
if (Optional.ofNullable(a.getRank()).isPresent()) {
|
||||
contribution.setRank(a.getRank());
|
||||
}
|
||||
|
||||
contributionList.add(contribution);
|
||||
}
|
||||
out.setContributions(contributionList);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapTopic(ResearchProduct out, E input) {
|
||||
if (Optional.ofNullable(input.getSubject()).isPresent()) {
|
||||
out
|
||||
.setTopics(
|
||||
input
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(
|
||||
s -> s.getQualifier().getClassid().equalsIgnoreCase("fos") ||
|
||||
s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
|
||||
.map(s -> {
|
||||
ResultTopic topic = new ResultTopic();
|
||||
topic
|
||||
.setTopic(
|
||||
Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
|
||||
if (Optional.ofNullable(s.getDataInfo()).isPresent()) {
|
||||
Provenance provenance = new Provenance();
|
||||
provenance.setTrust(Double.valueOf(s.getDataInfo().getTrust()));
|
||||
provenance.setType(s.getDataInfo().getInferenceprovenance());
|
||||
topic.setProvenance(provenance);
|
||||
}
|
||||
|
||||
return topic;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapType(ResearchProduct out, E input) throws NoAllowedTypeException {
|
||||
switch (input.getResulttype().getClassid()) {
|
||||
case "publication":
|
||||
out.setProduct_type(ResearchTypes.LITERATURE.label);
|
||||
break;
|
||||
case "dataset":
|
||||
out.setProduct_type(ResearchTypes.RESEARCH_DATA.label);
|
||||
break;
|
||||
case "software":
|
||||
out.setProduct_type(ResearchTypes.RESEARCH_SOFTWARE.label);
|
||||
break;
|
||||
case "other":
|
||||
out.setProduct_type(ResearchTypes.OTHER.label);
|
||||
break;
|
||||
default:
|
||||
throw new ClassCastException("Result type not present or not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapPid(ResearchProduct out, Result input) {
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setIdentifiers(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
p -> {
|
||||
Identifier identifier = new Identifier();
|
||||
identifier.setValue(p.getValue());
|
||||
identifier.setScheme(p.getQualifier().getClassid());
|
||||
return identifier;
|
||||
})
|
||||
.collect(Collectors.toList())));
|
||||
}
|
||||
|
||||
private static void mapTitle(ResearchProduct out, Result input) throws NoTitleFoundException {
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
|
||||
return;
|
||||
}
|
||||
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static void mapAbstract(ResearchProduct out, Result input) {
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setAbstracts(Collections.singletonMap("none", descriptionList));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.skgif.model.Prefixes;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class Utils implements Serializable {
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private Utils() {
|
||||
}
|
||||
|
||||
public static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
|
||||
if (!Optional.ofNullable(pid).isPresent())
|
||||
return null;
|
||||
if (pid.size() == 0)
|
||||
return null;
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.TRUE);
|
||||
}
|
||||
}
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static String getIdentifier(Prefixes entity, String id) {
|
||||
return entity.label + DHPUtils.md5(id);
|
||||
|
||||
}
|
||||
}
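A hedged usage sketch of the helpers above, assuming the modules of this patch are on the classpath: a local identifier is the entity prefix label followed by the md5 of the graph identifier, so the same input always produces the same SKG-IF identifier (the datasource id below is illustrative).

import eu.dnetlib.dhp.oa.graph.dump.skgif.Utils;
import eu.dnetlib.dhp.skgif.model.Prefixes;

public class LocalIdentifierSketch {

	public static void main(String[] args) {
		// the same datasource key always maps to the same local identifier
		String a = Utils.getIdentifier(Prefixes.DATASOURCE, "10|doajarticles::example");
		String b = Utils.getIdentifier(Prefixes.DATASOURCE, "10|doajarticles::example");
		System.out.println(a.equals(b)); // true
	}
}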
|
|
@ -0,0 +1,60 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.skgif.model.Biblio;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 15/02/24
|
||||
*/
|
||||
public class EmitPerManifestation implements Serializable {
|
||||
private String resultId;
|
||||
private String hostedBy;
|
||||
private Journal journal;
|
||||
private Instance instance;
|
||||
private String publisher;
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(String publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
|
||||
public String getHostedBy() {
|
||||
return hostedBy;
|
||||
}
|
||||
|
||||
public void setHostedBy(String hostedBy) {
|
||||
this.hostedBy = hostedBy;
|
||||
}
|
||||
|
||||
public Journal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
public void setJournal(Journal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
public Instance getInstance() {
|
||||
return instance;
|
||||
}
|
||||
|
||||
public void setInstance(Instance instance) {
|
||||
this.instance = instance;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||
|
||||
import eu.dnetlib.dhp.skgif.model.ResearchProduct;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class PartialResearchProduct extends ResearchProduct {
|
||||
private String resultId;
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.skgif.model.Relations;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class RelationPerProduct implements Serializable {
|
||||
|
||||
private String resultId;
|
||||
private List<String> organizations;
|
||||
private List<String> funding;
|
||||
private List<Relations> relatedProduct;
|
||||
|
||||
public RelationPerProduct() {
|
||||
organizations = new ArrayList<>();
|
||||
funding = new ArrayList<>();
|
||||
relatedProduct = new ArrayList<>();
|
||||
}
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
|
||||
public List<String> getOrganizations() {
|
||||
return organizations;
|
||||
}
|
||||
|
||||
public void setOrganizations(List<String> organizations) {
|
||||
this.organizations = organizations;
|
||||
}
|
||||
|
||||
public List<String> getFunding() {
|
||||
return funding;
|
||||
}
|
||||
|
||||
public void setFunding(List<String> funding) {
|
||||
this.funding = funding;
|
||||
}
|
||||
|
||||
public List<Relations> getRelatedProduct() {
|
||||
return relatedProduct;
|
||||
}
|
||||
|
||||
public void setRelatedProduct(List<Relations> relatedProduct) {
|
||||
this.relatedProduct = relatedProduct;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif.exception;

public class NoAllowedTypeException extends Exception {
    public NoAllowedTypeException() {
        super();
    }

    public NoAllowedTypeException(
        final String message,
        final Throwable cause,
        final boolean enableSuppression,
        final boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }

    public NoAllowedTypeException(final String message, final Throwable cause) {
        super(message, cause);
    }

    public NoAllowedTypeException(final String message) {
        super(message);
    }

    public NoAllowedTypeException(final Throwable cause) {
        super(cause);
    }

}

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif.exception;

public class NoTitleFoundException extends Exception {
    public NoTitleFoundException() {
        super();
    }

    public NoTitleFoundException(
        final String message,
        final Throwable cause,
        final boolean enableSuppression,
        final boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }

    public NoTitleFoundException(final String message, final Throwable cause) {
        super(message, cause);
    }

    public NoTitleFoundException(final String message) {
        super(message);
    }

    public NoTitleFoundException(final Throwable cause) {
        super(cause);
    }

}

@ -1,29 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.subset;

import java.io.Serializable;

/**
 * @author miriam.baglioni
 * @Date 21/07/22
 */
public class MasterDuplicate implements Serializable {
    private String duplicate;
    private String master;

    public String getDuplicate() {
        return duplicate;
    }

    public void setDuplicate(String duplicate) {
        this.duplicate = duplicate;
    }

    public String getMaster() {
        return master;
    }

    public void setMaster(String master) {
        this.master = master;
    }
}

@ -1,97 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.subset;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.function.Function;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.mongodb.DBCursor;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

public class ReadMasterDuplicateFromDB {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static final String QUERY = "SELECT id as master, duplicate FROM dsm_dedup_services; ";

    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    ReadMasterDuplicateFromDB.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/oa/graph/dump/datasourcemaster_parameters.json")));

        parser.parseArgument(args);

        final String dbUrl = parser.get("postgresUrl");
        final String dbUser = parser.get("postgresUser");
        final String dbPassword = parser.get("postgresPassword");
        final String hdfsPath = parser.get("hdfsPath");
        final String hdfsNameNode = parser.get("hdfsNameNode");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsNameNode);

        FileSystem fileSystem = FileSystem.get(conf);
        Path hdfsWritePath = new Path(hdfsPath);
        FSDataOutputStream fsDataOutputStream = fileSystem.create(hdfsWritePath);

        execute(dbUrl, dbUser, dbPassword, fsDataOutputStream);

    }

    private static void execute(String dbUrl, String dbUser, String dbPassword, FSDataOutputStream fos) {
        try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
                dbClient.processResults(QUERY, rs -> writeMap(datasourceMasterMap(rs), writer));
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static MasterDuplicate datasourceMasterMap(ResultSet rs) {
        try {
            MasterDuplicate dm = new MasterDuplicate();
            String duplicate = rs.getString("duplicate");
            dm.setDuplicate(OafMapperUtils.createOpenaireId(10, duplicate, true));
            String master = rs.getString("master");
            dm.setMaster(OafMapperUtils.createOpenaireId(10, master, true));

            return dm;

        } catch (final SQLException e) {
            throw new RuntimeException(e);
        }
    }

    protected static void writeMap(final MasterDuplicate dm, BufferedWriter writer) {
        try {
            writer.write(OBJECT_MAPPER.writeValueAsString(dm));
            writer.newLine();
        } catch (final IOException e) {
            throw new RuntimeException(e);
        }
    }

}

@ -1,199 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.subset;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.subset.criteria.VerbResolver;
import eu.dnetlib.dhp.oa.graph.dump.subset.criteria.VerbResolverFactory;
import eu.dnetlib.dhp.oa.graph.dump.subset.selectionconstraints.Param;
import eu.dnetlib.dhp.oa.graph.dump.subset.selectionconstraints.SelectionConstraints;
import eu.dnetlib.dhp.oa.model.graph.*;
import eu.dnetlib.dhp.schema.oaf.*;

/**
 * Spark Job that fires the dump for the entities
 */
public class SparkDumpResult implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SparkDumpResult.class);
    private static final VerbResolver resolver = VerbResolverFactory.newInstance();
    public static final String COMPRESSION = "compression";
    public static final String GZIP = "gzip";

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkDumpResult.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultType = parser.get("resultType");
        log.info("resultType: {}", resultType);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        Optional<String> pathString = Optional.ofNullable(parser.get("pathMap"));
        HashMap<String, String> pathMap = null;
        if (pathString.isPresent()) {
            pathMap = new Gson().fromJson(parser.get("pathMap"), HashMap.class);
            log.info("pathMap: {}", new Gson().toJson(pathMap));
        }

        final Optional<String> parameter = Optional.ofNullable(parser.get("selectionCriteria"));
        SelectionConstraints selectionConstraints = null;
        if (parameter.isPresent()) {
            selectionConstraints = new ObjectMapper().readValue(parameter.get(), SelectionConstraints.class);
            selectionConstraints.addResolver(resolver);
        }

        Class<? extends eu.dnetlib.dhp.schema.oaf.Result> inputClazz = (Class<? extends eu.dnetlib.dhp.schema.oaf.Result>) Class
            .forName(resultClassName);

        run(
            isSparkSessionManaged, inputPath, outputPath, pathMap, selectionConstraints, inputClazz,
            resultType);

    }

    private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath,
        HashMap<String, String> pathMap, SelectionConstraints selectionConstraints,
        Class<? extends eu.dnetlib.dhp.schema.oaf.Result> inputClazz, String resultType) {
        SparkConf conf = new SparkConf();

        HashMap<String, String> finalPathMap = pathMap;
        SelectionConstraints finalSelectionConstraints = selectionConstraints;
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath + "/original/" + resultType);
                Utils.removeOutputDir(spark, outputPath + "/dump/" + resultType);
                resultDump(
                    spark, inputPath, outputPath, inputClazz, finalPathMap,
                    finalSelectionConstraints, resultType);
            });

    }

    public static <I extends eu.dnetlib.dhp.schema.oaf.Result> void resultDump(
        SparkSession spark,
        String inputPath,
        String outputPath,
        Class<I> inputClazz,
        Map<String, String> pathMap,
        SelectionConstraints selectionConstraints,
        String resultType) {

        Utils
            .readPath(spark, inputPath, inputClazz)
            .map(
                (MapFunction<I, I>) value -> filterResult(
                    value, pathMap, selectionConstraints, inputClazz, resultType),
                Encoders.bean(inputClazz))
            .filter(Objects::nonNull)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "/original/" + resultType);

        Utils
            .readPath(spark, outputPath + "/original/" + resultType, inputClazz)
            .map(
                (MapFunction<I, GraphResult>) value -> (GraphResult) ResultMapper
                    .map(
                        value, null,
                        Constants.DUMPTYPE.COMPLETE.getType()),
                Encoders.bean(GraphResult.class))
            .map((MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
            .write()
            .mode(SaveMode.Overwrite)
            .option(COMPRESSION, GZIP)
            .text(outputPath + "/dump/" + resultType);

    }

    private static <I extends eu.dnetlib.dhp.schema.oaf.Result> I filterResult(I value, Map<String, String> pathMap,
        SelectionConstraints selectionConstraints, Class<I> inputClazz,
        String resultType) {
        Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());

        if (Boolean.FALSE.equals(odInfo.isPresent())) {
            return null;
        }
        if (Boolean.TRUE.equals(odInfo.get().getDeletedbyinference())
            || Boolean.TRUE.equals(odInfo.get().getInvisible())) {
            return null;
        }

        if (!isCompatible(value.getResulttype().getClassid(), resultType)) {
            return null;
        }

        if (selectionConstraints != null) {
            Param param = new Param();
            String json = new Gson().toJson(value, inputClazz);
            DocumentContext jsonContext = JsonPath.parse(json);

            for (String key : pathMap.keySet()) {
                try {
                    param.insert(key, jsonContext.read(pathMap.get(key)));
                } catch (com.jayway.jsonpath.PathNotFoundException e) {
                    param.insert(key, new ArrayList<>());
                }
            }
            if (!selectionConstraints.verifyCriteria(param)) {
                return null;
            }
        }

        return value;
    }

    private static boolean isCompatible(String classid, String resultType) {
        return (classid.equals(resultType) || (classid.equals("other") && resultType.equals("otherresearchproduct")));
    }

}

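For reference, the pathMap parameter parsed by the job above is a JSON object mapping a short key to the JSONPath that filterResult evaluates on the Gson-serialized record. A hypothetical example of such a value, shown only to illustrate the expected shape (the key, the path and the example class are invented):

import java.util.HashMap;

import com.google.gson.Gson;

public class PathMapExample {
    public static void main(String[] args) {
        // Illustrative only: the parsing mirrors what SparkDumpResult.main() does
        // with parser.get("pathMap"); the key/path pair is made up.
        String pathMapJson = "{\"subject\": \"$.subject[*].value\"}";
        HashMap<String, String> pathMap = new Gson().fromJson(pathMapJson, HashMap.class);
        System.out.println(pathMap);
    }
}
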
@ -1,328 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.subset;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import javax.print.attribute.standard.MediaSize;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Function1;
import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 11/11/22
 */
public class SparkSelectSubset implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SparkSelectSubset.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkSelectSubset.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        Optional<String> rs = Optional.ofNullable(parser.get("removeSet"));
        final Set<String> removeSet = new HashSet<>();
        if (rs.isPresent()) {
            Collections.addAll(removeSet, rs.get().split(";"));
        }

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                selectSubset(spark, inputPath, outputPath, removeSet);
            });

    }

    private static void selectSubset(SparkSession spark, String inputPath, String outputPath, Set<String> removeSet) {
        Dataset<Relation> relation = Utils
            .readPath(spark, inputPath + "/relation", Relation.class)
            .filter(
                (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference()
                    && !removeSet.contains(r.getRelClass()));

        Dataset<String> resultIds = Utils
            .readPath(spark, outputPath + "/original/publication", Publication.class)
            .map((MapFunction<Publication, String>) p -> p.getId(), Encoders.STRING())
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
                    .map((MapFunction<eu.dnetlib.dhp.schema.oaf.Dataset, String>) d -> d.getId(), Encoders.STRING()))
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/software", Software.class)
                    .map((MapFunction<Software, String>) s -> s.getId(), Encoders.STRING()))
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/otherresearchproduct", OtherResearchProduct.class)
                    .map((MapFunction<OtherResearchProduct, String>) o -> o.getId(), Encoders.STRING()));

        // select result -> result relations
        Dataset<Relation> relResultResult = relation
            .joinWith(resultIds, relation.col("source").equalTo(resultIds.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));

        relResultResult
            .joinWith(resultIds, relResultResult.col("target").equalTo(resultIds.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputPath + "/original/relation");

        // save the relations among other entities and the results
        Dataset<String> otherIds = Utils
            .readPath(spark, inputPath + "/organization", Organization.class)
            .filter((FilterFunction<Organization>) e -> !e.getDataInfo().getDeletedbyinference())
            .map((MapFunction<Organization, String>) o -> o.getId(), Encoders.STRING())
            .union(
                Utils
                    .readPath(spark, inputPath + "/project", Project.class)
                    .filter((FilterFunction<Project>) e -> !e.getDataInfo().getDeletedbyinference())
                    .map((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING()))
            .union(
                Utils
                    .readPath(spark, inputPath + "/datasource", Datasource.class)
                    .filter((FilterFunction<Datasource>) e -> !e.getDataInfo().getDeletedbyinference())
                    .map((MapFunction<Datasource, String>) d -> d.getId(), Encoders.STRING()));

        Dataset<Relation> relResultOther = relation
            .joinWith(resultIds, relation.col("source").equalTo(resultIds.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));

        relResultOther
            .joinWith(otherIds, relResultOther.col("target").equalTo(otherIds.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
            .write()
            .mode(SaveMode.Append)
            .option("compression", "gzip")
            .json(outputPath + "/original/relation");

        Dataset<Relation> relOtherResult = relation
            .joinWith(resultIds, relation.col("target").equalTo(resultIds.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));

        relOtherResult
            .joinWith(otherIds, relOtherResult.col("source").equalTo(otherIds.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
            .write()
            .mode(SaveMode.Append)
            .option("compression", "gzip")
            .json(outputPath + "/original/relation");

        Dataset<String> relAll = Utils
            .readPath(spark, outputPath + "/original/relation", Relation.class)
            .flatMap(
                (FlatMapFunction<Relation, String>) r -> Arrays.asList(r.getSource(), r.getTarget()).iterator(),
                Encoders.STRING())
            .distinct();

        // Save the entities in relations with at least one result
        Dataset<Organization> organization = Utils
            .readPath(spark, inputPath + "/organization", Organization.class)
            .filter((FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference());
        organization
            .joinWith(relAll, organization.col("id").equalTo(relAll.col("value")))
            .map(
                (MapFunction<Tuple2<Organization, String>, Organization>) t2 -> t2._1(),
                Encoders.bean(Organization.class))
            .groupByKey((MapFunction<Organization, String>) v -> v.getId(), Encoders.STRING())
            .mapGroups(
                (MapGroupsFunction<String, Organization, Organization>) (k, it) -> it.next(),
                Encoders.bean(Organization.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "/original/organization");

        Dataset<Datasource> datasource = Utils
            .readPath(spark, inputPath + "/datasource", Datasource.class)
            .filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getDeletedbyinference());
        datasource
            .joinWith(relAll, datasource.col("id").equalTo(relAll.col("value")))
            .map((MapFunction<Tuple2<Datasource, String>, Datasource>) t2 -> t2._1(), Encoders.bean(Datasource.class))
            .groupByKey((MapFunction<Datasource, String>) v -> v.getId(), Encoders.STRING())
            .mapGroups(
                (MapGroupsFunction<String, Datasource, Datasource>) (k, it) -> it.next(),
                Encoders.bean(Datasource.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "/original/datasource");

        // plus we need to dump all the datasource in collectedfrom hostedby
        Dataset<String> cfhb_orig = Utils
            .readPath(spark, outputPath + "/original/publication", Publication.class)
            .flatMap(
                (FlatMapFunction<Publication, String>) p -> {
                    List<String> ret = new ArrayList<>();
                    p.getInstance().stream().forEach(i -> {
                        if (Optional.ofNullable(i.getHostedby()).isPresent()
                            && Optional.ofNullable(i.getHostedby().getKey()).isPresent())
                            ret.add(i.getHostedby().getKey());
                    });
                    if (Optional.ofNullable(p.getCollectedfrom()).isPresent()) {
                        p.getCollectedfrom().stream().forEach(cf -> {
                            if (Optional.ofNullable(cf.getKey()).isPresent())
                                ret.add(cf.getKey());
                        });
                    }
                    return ret.iterator();
                }, Encoders.STRING())
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
                    .flatMap(
                        (FlatMapFunction<eu.dnetlib.dhp.schema.oaf.Dataset, String>) p -> {
                            List<String> ret = new ArrayList<>();
                            p.getInstance().stream().forEach(i -> {
                                if (Optional.ofNullable(i.getHostedby()).isPresent()
                                    && Optional.ofNullable(i.getHostedby().getKey()).isPresent())
                                    ret.add(i.getHostedby().getKey());
                            });
                            if (Optional.ofNullable(p.getCollectedfrom()).isPresent()) {
                                p.getCollectedfrom().stream().forEach(cf -> {
                                    if (Optional.ofNullable(cf.getKey()).isPresent())
                                        ret.add(cf.getKey());
                                });
                            }
                            return ret.iterator();
                        }, Encoders.STRING()))
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/software", Software.class)
                    .flatMap(
                        (FlatMapFunction<Software, String>) p -> {
                            List<String> ret = new ArrayList<>();
                            p.getInstance().stream().forEach(i -> {
                                if (Optional.ofNullable(i.getHostedby()).isPresent()
                                    && Optional.ofNullable(i.getHostedby().getKey()).isPresent())
                                    ret.add(i.getHostedby().getKey());
                            });
                            if (Optional.ofNullable(p.getCollectedfrom()).isPresent()) {
                                p.getCollectedfrom().stream().forEach(cf -> {
                                    if (Optional.ofNullable(cf.getKey()).isPresent())
                                        ret.add(cf.getKey());
                                });
                            }
                            return ret.iterator();
                        }, Encoders.STRING()))
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/otherresearchproduct", OtherResearchProduct.class)
                    .flatMap(
                        (FlatMapFunction<OtherResearchProduct, String>) p -> {
                            List<String> ret = new ArrayList<>();
                            p.getInstance().stream().forEach(i -> {
                                if (Optional.ofNullable(i.getHostedby()).isPresent()
                                    && Optional.ofNullable(i.getHostedby().getKey()).isPresent())
                                    ret.add(i.getHostedby().getKey());
                            });
                            if (Optional.ofNullable(p.getCollectedfrom()).isPresent()) {
                                p.getCollectedfrom().stream().forEach(cf -> {
                                    if (Optional.ofNullable(cf.getKey()).isPresent())
                                        ret.add(cf.getKey());
                                });
                            }
                            return ret.iterator();
                        }, Encoders.STRING()))
            .filter((FilterFunction<String>) s -> !s.equals(ModelConstants.UNKNOWN_REPOSITORY.getKey()))
            .distinct();

        datasource
            .joinWith(cfhb_orig, datasource.col("id").equalTo(cfhb_orig.col("value")))
            .map((MapFunction<Tuple2<Datasource, String>, Datasource>) t2 -> t2._1(), Encoders.bean(Datasource.class))
            .write()
            .mode(SaveMode.Append)
            .option("compression", "gzip")
            .json(outputPath + "/original/datasource");

        Dataset<Project> project = Utils
            .readPath(spark, inputPath + "/project", Project.class)
            .filter((FilterFunction<Project>) d -> !d.getDataInfo().getDeletedbyinference());
        project
            .joinWith(relAll, project.col("id").equalTo(relAll.col("value")))
            .map((MapFunction<Tuple2<Project, String>, Project>) t2 -> t2._1(), Encoders.bean(Project.class))
            .groupByKey((MapFunction<Project, String>) v -> v.getId(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, Project, Project>) (k, it) -> it.next(), Encoders.bean(Project.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "/original/project");

        // save the relations among entities different from the result
        Dataset<String> selectedIDs = Utils
            .readPath(spark, outputPath + "/original/project", Project.class)
            .map((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING())
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/organization", Organization.class)
                    .map((MapFunction<Organization, String>) o -> o.getId(), Encoders.STRING()))
            .union(
                Utils
                    .readPath(spark, outputPath + "/original/datasource", Datasource.class)
                    .map((MapFunction<Datasource, String>) d -> d.getId(), Encoders.STRING()));

        Dataset<Relation> relOtherOther = relation
            .joinWith(selectedIDs, relation.col("source").equalTo(selectedIDs.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));

        relOtherOther
            .joinWith(selectedIDs, relOtherOther.col("target").equalTo(selectedIDs.col("value")))
            .map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
            .write()
            .mode(SaveMode.Append)
            .option("compression", "gzip")
            .json(outputPath + "/original/relation");

    }

}

@ -1,133 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.subset;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.graph.ResearchCommunity;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 15/11/22
 */
public class SparkSelectValidContext implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SparkSelectValidContext.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkSelectValidContext.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/input_select_context.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String contextPath = parser.get("contextPath");
        log.info("contextPath: {}", contextPath);

        final String communityMapPath = parser.get("communityMapPath");
        log.info("communityMapPath: {}", communityMapPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                selectValidContext(spark, inputPath, contextPath, communityMapPath, outputPath);
            });

    }

    private static void selectValidContext(SparkSession spark, String inputPath, String contextPath,
        String communityMapPath, String outputPath) {
        List<String> keys = Arrays
            .asList(Utils.getCommunityMap(spark, communityMapPath).keySet().stream().toArray(String[]::new));

        Dataset<String> context = getFilter(spark, inputPath + "/publication", keys, Publication.class)
            .union(getFilter(spark, inputPath + "/dataset", keys, eu.dnetlib.dhp.schema.oaf.Dataset.class))
            .union(getFilter(spark, inputPath + "/software", keys, Software.class))
            .union(getFilter(spark, inputPath + "/otherresearchproduct", keys, OtherResearchProduct.class))
            .distinct();

        context.foreach((ForeachFunction<String>) c -> System.out.println(c));

        Dataset<ResearchCommunity> researchCommunity = Utils.readPath(spark, contextPath, ResearchCommunity.class);

        researchCommunity
            .joinWith(context, researchCommunity.col("acronym").equalTo(context.col("value")))
            .map(
                (MapFunction<Tuple2<ResearchCommunity, String>, ResearchCommunity>) t2 -> t2._1(),
                Encoders.bean(ResearchCommunity.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);

    }

    private static <I extends Result> Dataset<String> getFilter(SparkSession spark, String inputPath,
        List<String> keys, Class<I> inputClazz) {

        return Utils
            .readPath(spark, inputPath, inputClazz)
            .filter((FilterFunction<I>) r -> isPresentContext(r))
            .flatMap(
                (FlatMapFunction<I, String>) r -> r
                    .getContext()
                    .stream()
                    .map(c -> extract(c.getId(), keys))
                    .collect(Collectors.toList())
                    .iterator(),
                Encoders.STRING())
            .filter(Objects::nonNull);

    }

    private static <I extends Result> boolean isPresentContext(I r) {
        return Optional.ofNullable(r.getContext()).isPresent();
    }

    private static String extract(String c, List<String> keySet) {
        if (keySet.contains(c))
            return c;
        if (c.contains(":") && keySet.contains(c.substring(0, c.indexOf(":"))))
            return c.substring(0, c.indexOf(":"));
        return null;
    }
}

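As an invented illustration of the extract helper above: with community keys such as dh-ch and enermaps, the context id enermaps::project is reduced to enermaps (the prefix before the first colon), dh-ch is kept as is because it matches a key exactly, and an id matching no key is mapped to null and then discarded by the final non-null filter in getFilter.
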
Some files were not shown because too many files have changed in this diff.