Compare commits

...

24 Commits

Author SHA1 Message Date
Miriam Baglioni 292c69d819 [SKG-IF-EOSC] added new step in the resumeFrom 2024-04-04 12:23:53 +02:00
Miriam Baglioni 059b275a06 [SKG-IF-EOSC] fixed issue in selecting relevant eosc results. Applied static mapping from old eoscDsId and new eoscDsId 2024-03-29 11:46:46 +01:00
Miriam Baglioni 4c7e24df81 [SKG-IF-EOSC] added id for the eosc datasource at the level of the materialization 2024-03-27 09:45:13 +01:00
Miriam Baglioni 8fe452b61d [SKG-IF denormalized] refactoring 2024-03-26 11:45:59 +01:00
Miriam Baglioni 9f9ade077b [SKG-IF] changed the implementation to reduce the number of needed joins. Reduced the memory usage by defining specific flat beans 2024-03-18 09:30:05 +01:00
Miriam Baglioni 3c4c4e8ce0 [SKG-IF] tries to make the process finish. need to change the strategy 2024-03-16 08:44:10 +01:00
Miriam Baglioni 98bec3d2d2 [SKG-IF] changed workflow parametrization to avoid OOM error 2024-03-14 15:54:12 +01:00
Miriam Baglioni 3126907d09 [SKG-IF] fixing issue in deserialization 2024-03-14 15:27:43 +01:00
Miriam Baglioni 187b91a699 [SKG-IF] fixing issue in deserialization 2024-03-14 13:02:47 +01:00
Miriam Baglioni b176bbef1d [SKG-IF] fixing issue in deserialization 2024-03-14 10:11:45 +01:00
Miriam Baglioni e8f19ad003 [SKG-IF] selection of subset of relevant results from the set provided via input 2024-03-13 15:22:56 +01:00
Miriam Baglioni 2811e2ebd7 [SKG-IF] denormalization fixing issue and new properties 2024-03-12 14:58:42 +01:00
Miriam Baglioni cb9a081236 [SKG-IF] denormalization fixing issue and new properties 2024-03-12 14:51:14 +01:00
Miriam Baglioni d6a8db5202 [SKG-IF] denormalization fixing issue 2024-03-11 13:37:23 +01:00
Miriam Baglioni 0f40ed6b11 [SKG-IF] denormalization fixing issue and adding new field to mingrant 2024-03-11 09:56:40 +01:00
Miriam Baglioni a6a6922f11 [SKG-IF] added first implementation for denormalization 2024-03-04 16:28:52 +01:00
Miriam Baglioni 7b715b2bb8 - 2024-03-04 08:45:29 +01:00
Miriam Baglioni 752fd896e4 [SKG-IF] refactoring and fixing issues 2024-03-01 09:35:15 +01:00
Miriam Baglioni 0c887ca015 [SKG-IF] mapping to version latest in date 27 february 2024 2024-02-27 12:35:34 +01:00
Miriam Baglioni ebde629d49 merging with master 2024-02-20 09:57:37 +01:00
Miriam Baglioni e2b9989199 - 2024-02-20 09:57:33 +01:00
Miriam Baglioni c3be9a7b14 [SKG-IF] - 2024-02-07 15:33:12 +01:00
Miriam Baglioni 9a8a9ac7df extend peer review 2023-09-27 11:50:43 +02:00
Miriam Baglioni b1b48a90dc first entities 2023-09-18 08:59:02 +02:00
651 changed files with 9730 additions and 13359 deletions

View File

@@ -0,0 +1,16 @@
package eu.dnetlib.dhp.skgif.model;
/**
* @author miriam.baglioni
* @Date 04/09/23
*/
public enum AccessRight {
OPEN("open"), CLOSED("closed"), EMBARGO("embargo"), RESTRICTED("restricted"), UNAVAILABLE("unavailable");
public final String label;
private AccessRight(String label) {
this.label = label;
}
}
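
The label field carries the value expected in the SKG-IF JSON. A minimal, illustrative helper (not part of this change set, assumed to live alongside the enum in the model package) showing how a serialized label could be resolved back to the enum constant:

import java.util.Arrays;
import java.util.Optional;

public class AccessRightLookup {
	// Reverse lookup from the serialized label to the enum constant; labels are unique.
	public static Optional<AccessRight> fromLabel(String label) {
		return Arrays
			.stream(AccessRight.values())
			.filter(a -> a.label.equalsIgnoreCase(label))
			.findFirst();
	}
}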

View File

@@ -0,0 +1,42 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 05/09/23
*/
public class Affiliation implements Serializable {
private String organization;
@JsonProperty("start_date")
private String start_date;
@JsonProperty("end_date")
private String end_date;
public String getOrganization() {
return organization;
}
public void setOrganization(String organization) {
this.organization = organization;
}
public String getStart_date() {
return start_date;
}
public void setStart_date(String start_date) {
this.start_date = start_date;
}
public String getEnd_date() {
return end_date;
}
public void setEnd_date(String end_date) {
this.end_date = end_date;
}
}

View File

@@ -0,0 +1,87 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Biblio implements Serializable {
private String issue;
@JsonProperty("start_page")
private String start_page;
@JsonProperty("end_page")
private String end_page;
private String volume;
private String edition;
private String number;
private String publisher;
private String series;
public String getIssue() {
return issue;
}
public void setIssue(String issue) {
this.issue = issue;
}
public String getStart_page() {
return start_page;
}
public void setStart_page(String start_page) {
this.start_page = start_page;
}
public String getEnd_page() {
return end_page;
}
public void setEnd_page(String end_page) {
this.end_page = end_page;
}
public String getVolume() {
return volume;
}
public void setVolume(String volume) {
this.volume = volume;
}
public String getEdition() {
return edition;
}
public void setEdition(String edition) {
this.edition = edition;
}
public String getNumber() {
return number;
}
public void setNumber(String number) {
this.number = number;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getSeries() {
return series;
}
public void setSeries(String series) {
this.series = series;
}
}

View File

@@ -0,0 +1,51 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Contribution implements Serializable {
private MinPerson person;
@JsonProperty("declared_affiliations")
private List<String> declared_affiliation;
private List<String> roles;
private Integer rank;
public MinPerson getPerson() {
return person;
}
public void setPerson(MinPerson person) {
this.person = person;
}
public List<String> getDeclared_affiliation() {
return declared_affiliation;
}
public void setDeclared_affiliation(List<String> declared_affiliation) {
this.declared_affiliation = declared_affiliation;
}
public List<String> getRoles() {
return roles;
}
public void setRoles(List<String> roles) {
this.roles = roles;
}
public Integer getRank() {
return rank;
}
public void setRank(Integer rank) {
this.rank = rank;
}
}

View File

@@ -0,0 +1,40 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class Contributor implements Serializable {
private MinPerson person; // I would not map it because we have only information regarding the person (if any)
// associated to the leading organization
private String organization; // contributors.person
private String role;// private
public MinPerson getPerson() {
return person;
}
public void setPerson(MinPerson person) {
this.person = person;
}
public String getOrganization() {
return organization;
}
public void setOrganization(String organization) {
this.organization = organization;
}
public String getRole() {
return role;
}
public void setRole(String role) {
this.role = role;
}
}

View File

@@ -0,0 +1,163 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class Datasource implements Serializable {
private String local_identifier;// id
private List<Identifier> identifiers; // .schema pid.qualifier.classid;identifiers.value pid.value
private String name; // officialname.value
private String submission_policy_url;// submissionpolicyurl
private String preservation_policy_url;// preservationpolicyurl
private Boolean version_control;// versioncontrol bool
private List<PersistentIdentitySystems> persistent_identity_systems;// . product_type researchentitytype list type
// to be remapped to the eosc types
// persistent_identity_systems. pid_scheme pidsystems.value when not null. It can be a string with multiple values
private String jurisdiction;// jurisdiction.classname
private String data_source_classification;// eoscdatasourcetype.classname
private List<String> research_product_type;// researchentitytype list type to be remapped to the eosc types
private Boolean thematic;// thematic bool
private List<Licence> research_product_license; // .name not mappable listresearch_product_license.url not mappable
private List<String> research_product_access_policy;// "databaseaccesstype if open => open access
// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) " list
private List<Licence> research_product_metadata_license; // .name not mappable list
// research_product_metadata_license.url not mappable
private List<String> research_product_metadata_access_policy;// researchproductmetadataccesspolicies list with the
// same mapping of research_product_access_policy
private List<MinOrganization> organization;
public List<MinOrganization> getOrganization() {
return organization;
}
public void setOrganization(List<MinOrganization> organization) {
this.organization = organization;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getSubmission_policy_url() {
return submission_policy_url;
}
public void setSubmission_policy_url(String submission_policy_url) {
this.submission_policy_url = submission_policy_url;
}
public String getPreservation_policy_url() {
return preservation_policy_url;
}
public void setPreservation_policy_url(String preservation_policy_url) {
this.preservation_policy_url = preservation_policy_url;
}
public Boolean getVersion_control() {
return version_control;
}
public void setVersion_control(Boolean version_control) {
this.version_control = version_control;
}
public List<PersistentIdentitySystems> getPersistent_identity_systems() {
return persistent_identity_systems;
}
public void setPersistent_identity_systems(List<PersistentIdentitySystems> persistent_identity_systems) {
this.persistent_identity_systems = persistent_identity_systems;
}
public String getJurisdiction() {
return jurisdiction;
}
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public String getData_source_classification() {
return data_source_classification;
}
public void setData_source_classification(String data_source_classification) {
this.data_source_classification = data_source_classification;
}
public List<String> getResearch_product_type() {
return research_product_type;
}
public void setResearch_product_type(List<String> research_product_type) {
this.research_product_type = research_product_type;
}
public Boolean getThematic() {
return thematic;
}
public void setThematic(Boolean thematic) {
this.thematic = thematic;
}
public List<Licence> getResearch_product_license() {
return research_product_license;
}
public void setResearch_product_license(List<Licence> research_product_license) {
this.research_product_license = research_product_license;
}
public List<String> getResearch_product_access_policy() {
return research_product_access_policy;
}
public void setResearch_product_access_policy(List<String> research_product_access_policy) {
this.research_product_access_policy = research_product_access_policy;
}
public List<Licence> getResearch_product_metadata_license() {
return research_product_metadata_license;
}
public void setResearch_product_metadata_license(List<Licence> research_product_metadata_license) {
this.research_product_metadata_license = research_product_metadata_license;
}
public List<String> getResearch_product_metadata_access_policy() {
return research_product_metadata_access_policy;
}
public void setResearch_product_metadata_access_policy(List<String> research_product_metadata_access_policy) {
this.research_product_metadata_access_policy = research_product_metadata_access_policy;
}
}
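
The comment on research_product_access_policy above describes a mapping from the datasource databaseaccesstype onto the COAR access-rights vocabulary. A minimal sketch of that mapping, assuming the input is the plain classid string; the exact output values used by the workflow may differ from the labels chosen here:

public class AccessPolicyMapper {
	// open       -> open access          (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
	// restricted -> restricted access    (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
	// closed     -> metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
	public static String map(String databaseAccessType) {
		if (databaseAccessType == null)
			return null;
		switch (databaseAccessType.toLowerCase()) {
			case "open":
				return "open access";
			case "restricted":
				return "restricted access";
			case "closed":
				return "metadata only access";
			default:
				return null;
		}
	}
}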

View File

@@ -0,0 +1,36 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Dates implements Serializable {
private String value;
private String type;
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public static Dates newInstance(String value, String type) {
Dates d = new Dates();
d.value = value;
d.type = type;
return d;
}
}

View File

@@ -0,0 +1,164 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class Grant implements Serializable {
private String local_identifier;// id
private List<Identifier> identifiers;// .schema pid.qualifier.classid identifiers.value pid.value
// identifiers.schema funder acronym to be used the xpath //fundingtree/funder/shortname
// identifiers.value project.code
private String title;// title.value
@JsonProperty(value = "abstract")
private String summary;// summary.value
private String acronym; // acronym.value
private String funder;// fundingtree to be used the xpath //funder/name
private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
private String currency;// currency.value
private Float funded_amount;// ' fundedamount.value
private List<String> keywords;// subject.value
private String start_date;// startdate.value
private String end_date;// enddate.value
private String website;// websiteurl.value
private List<MinOrganization> beneficiaries;// organization.id for the organizations in the relation with semantic
// class
// isParticipant produces the list of organization internal identifiers
private List<Contributor> contributors;//
private String grantCode;
public String getGrantCode() {
return grantCode;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getFunder() {
return funder;
}
public void setFunder(String funder) {
this.funder = funder;
}
public String getFunding_stream() {
return funding_stream;
}
public void setFunding_stream(String funding_stream) {
this.funding_stream = funding_stream;
}
public String getCurrency() {
return currency;
}
public void setCurrency(String currency) {
this.currency = currency;
}
public Float getFunded_amount() {
return funded_amount;
}
public void setFunded_amount(Float funded_amount) {
this.funded_amount = funded_amount;
}
public List<String> getKeywords() {
return keywords;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
public String getStart_date() {
return start_date;
}
public void setStart_date(String start_date) {
this.start_date = start_date;
}
public String getEnd_date() {
return end_date;
}
public void setEnd_date(String end_date) {
this.end_date = end_date;
}
public String getWebsite() {
return website;
}
public void setWebsite(String website) {
this.website = website;
}
public List<MinOrganization> getBeneficiaries() {
return beneficiaries;
}
public void setBeneficiaries(List<MinOrganization> beneficiaries) {
this.beneficiaries = beneficiaries;
}
public List<Contributor> getContributors() {
return contributors;
}
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
public void setGrantCode(String value) {
grantCode = value;
}
}
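
The comments on funder and funding_stream point at XPath expressions over the project fundingtree. A self-contained sketch of that extraction with the JDK XPath API; the actual workflow may use a different XML library, and the expressions are taken verbatim from the comments above:

import java.io.StringReader;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;

public class FundingTreeParser {
	private static String evaluate(String fundingTreeXml, String expression) throws Exception {
		Document doc = DocumentBuilderFactory
			.newInstance()
			.newDocumentBuilder()
			.parse(new InputSource(new StringReader(fundingTreeXml)));
		return XPathFactory.newInstance().newXPath().evaluate(expression, doc);
	}

	public static String funderName(String fundingTreeXml) throws Exception {
		return evaluate(fundingTreeXml, "//funder/name");
	}

	public static String funderShortName(String fundingTreeXml) throws Exception {
		return evaluate(fundingTreeXml, "//funder/shortname");
	}
}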

View File

@@ -0,0 +1,37 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Identifier implements Serializable {
private String scheme;
private String value;
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public static Identifier newInstance(String scheme, String value) {
Identifier i = new Identifier();
i.value = value;
i.scheme = scheme;
return i;
}
}

View File

@@ -0,0 +1,11 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class Licence implements Serializable {
}

View File

@@ -0,0 +1,148 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Manifestation implements Serializable {
@JsonProperty("product_local_type")
private String product_local_type;
@JsonProperty("product_local_type_schema")
private String product_local_type_schema;
private List<Dates> dates;
@JsonProperty("peer_review")
private String peer_review;
@JsonProperty("metadata_curation")
private String metadata_curation;
private String url;
private String pid;
@JsonProperty("access_right")
private String access_right;
private String licence;
@JsonProperty("licance_schema")
private String licence_schema;
private Biblio biblio;
private MinVenue venue;
private List<String> eoscId;
public List<String> getEoscId() {
return eoscId;
}
public void setEoscId(List<String> eoscId) {
this.eoscId = eoscId;
}
@JsonProperty("hosting_datasource")
private MinVenue hosting_datasource;
public String getProduct_local_type() {
return product_local_type;
}
public void setProduct_local_type(String product_local_type) {
this.product_local_type = product_local_type;
}
public String getProduct_local_type_schema() {
return product_local_type_schema;
}
public void setProduct_local_type_schema(String product_local_type_schema) {
this.product_local_type_schema = product_local_type_schema;
}
public List<Dates> getDates() {
return dates;
}
public void setDates(List<Dates> dates) {
this.dates = dates;
}
public String getPeer_review() {
return peer_review;
}
public void setPeer_review(String peer_review) {
this.peer_review = peer_review;
}
public String getMetadata_curation() {
return metadata_curation;
}
public void setMetadata_curation(String metadata_curation) {
this.metadata_curation = metadata_curation;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public String getLicence() {
return licence;
}
public void setLicence(String licence) {
this.licence = licence;
}
public String getLicence_schema() {
return licence_schema;
}
public void setLicence_schema(String licence_schema) {
this.licence_schema = licence_schema;
}
public Biblio getBiblio() {
return biblio;
}
public void setBiblio(Biblio biblio) {
this.biblio = biblio;
}
public MinVenue getVenue() {
return venue;
}
public void setVenue(MinVenue venue) {
this.venue = venue;
}
public MinVenue getHosting_datasource() {
return hosting_datasource;
}
public void setHosting_datasource(MinVenue hosting_datasource) {
this.hosting_datasource = hosting_datasource;
}
}
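
The access_right, peer_review and metadata_curation fields are plain strings; judging from the enums introduced in this change set, they are meant to hold the corresponding enum labels. An illustrative snippet (not part of the commit, values are made up, classes assumed to sit in the same model package) populating a Manifestation that way:

import java.util.Arrays;

public class ManifestationExample {
	public static Manifestation example() {
		Manifestation m = new Manifestation();
		m.setProduct_local_type("Article");
		m.setAccess_right(AccessRight.OPEN.label);
		m.setPeer_review(PeerReview.PEER_REVIEWED.label);
		m.setMetadata_curation(MetadataCuration.UNAVAILABLE.label);
		m.setDates(Arrays.asList(Dates.newInstance("2024-03-01", "publishing")));
		return m;
	}
}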

View File

@@ -0,0 +1,16 @@
package eu.dnetlib.dhp.skgif.model;
/**
* @author miriam.baglioni
* @Date 04/09/23
*/
public enum MetadataCuration {
YES("yes"), NO("no"), UNAVAILABLE("unavailable");
public final String label;
private MetadataCuration(String label) {
this.label = label;
}
}

View File

@@ -0,0 +1,47 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 04/03/24
*/
public class MinGrant implements Serializable {
private String local_identifier;
private String funder;
private String code;
private String title;
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public String getFunder() {
return funder;
}
public void setFunder(String funder) {
this.funder = funder;
}
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
}

View File

@@ -0,0 +1,73 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 04/03/24
*/
public class MinOrganization implements Serializable {
private String local_identifier;
private String name;
private String ror;
private String isni;
private String fundRef;
private String rinGold;
private String wikidata;
public String getWikidata() {
return wikidata;
}
public void setWikidata(String wikidata) {
this.wikidata = wikidata;
}
public String getFundRef() {
return fundRef;
}
public String getRinGold() {
return rinGold;
}
public void setRinGold(String rinGold) {
this.rinGold = rinGold;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getRor() {
return ror;
}
public void setRor(String ror) {
this.ror = ror;
}
public String getIsni() {
return isni;
}
public void setIsni(String isni) {
this.isni = isni;
}
public void setFundRef(String value) {
this.fundRef = value;
}
}

View File

@@ -0,0 +1,38 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 04/03/24
*/
public class MinPerson implements Serializable {
private String local_identifier;
private String full_name;
private String orcid;
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public String getFull_name() {
return full_name;
}
public void setFull_name(String full_name) {
this.full_name = full_name;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
}

View File

@@ -0,0 +1,65 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 04/03/24
*/
public class MinProduct implements Serializable {
private String local_identifier;
private String title;
private String doi;
private String pmcid;
private String arxivid;
private String pmid;
public String getPmid() {
return pmid;
}
public void setPmid(String pmid) {
this.pmid = pmid;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getPmcid() {
return pmcid;
}
public void setPmcid(String pmcid) {
this.pmcid = pmcid;
}
public String getArxivid() {
return arxivid;
}
public void setArxivid(String arxivid) {
this.arxivid = arxivid;
}
}

View File

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 04/03/24
*/
public class MinTopic implements Serializable {
private String local_identifier;
private String value;
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@@ -0,0 +1,36 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 04/03/24
*/
public class MinVenue implements Serializable {
private String local_identifier;
private String name;
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public static MinVenue newInstance(String local_identifier, String name) {
MinVenue minVenue = new MinVenue();
minVenue.local_identifier = local_identifier;
minVenue.name = name;
return minVenue;
}
}

View File

@@ -0,0 +1,85 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class Organization implements Serializable {
private String local_identifier; // id
private List<Identifier> identifiers; // pid.qualifier.classid; pid.value list
private String name; // legalname.value
private String short_name; // legalshortname.value
private List<String> other_names;// alternative_names.value list
private String website;// websiteurl.value
private String country; // country.classid
private String type; // map relevant types from the ec* fields of organisations. If no match, default to "other"
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getShort_name() {
return short_name;
}
public void setShort_name(String short_name) {
this.short_name = short_name;
}
public List<String> getOther_names() {
return other_names;
}
public void setOther_names(List<String> other_names) {
this.other_names = other_names;
}
public String getWebsite() {
return website;
}
public void setWebsite(String website) {
this.website = website;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
}

View File

@@ -0,0 +1,17 @@
package eu.dnetlib.dhp.skgif.model;
public enum OrganizationTypes {
ARCHIVE("archive"),
COMPANY("company"),
EDUCATION("education"), FACILITY("facility"), GOVERNMENT("government"), HEALTHCARE("healthcare"), NONPROFIT(
"nonprofit"), FUNDER("funder"), OTHER("other");
public final String label;
private OrganizationTypes(String label) {
this.label = label;
}
}
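
The type comment in Organization above says the value is derived from the ec* fields of the OpenAIRE organisation, defaulting to "other". A rough sketch of such a mapping; the boolean parameters stand in for those flags, and both their names and the decision table are illustrative rather than taken from the actual workflow:

public class OrganizationTypeMapper {
	public static String map(boolean ecHigherEducation, boolean ecNonProfit, boolean ecEnterprise) {
		// illustrative decision table only; the real mapping is defined elsewhere in the workflow
		if (ecHigherEducation)
			return OrganizationTypes.EDUCATION.label;
		if (ecNonProfit)
			return OrganizationTypes.NONPROFIT.label;
		if (ecEnterprise)
			return OrganizationTypes.COMPANY.label;
		return OrganizationTypes.OTHER.label;
	}
}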

View File

@@ -0,0 +1,17 @@
package eu.dnetlib.dhp.skgif.model;
/**
* @author miriam.baglioni
* @Date 04/09/23
*/
public enum PeerReview {
PEER_REVIEWED("peer-reviewed"), NON_PEER_REVIEWED("not peer-reviewed"), DOUBLE_BLIND("double-blind"), SINGLE_BLIND(
"single-blind"), UNAVAILABLE("unavailable"), OPEN("open peer review");
public final String label;
private PeerReview(String label) {
this.label = label;
}
}

View File

@@ -0,0 +1,11 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class PersistentIdentitySystems implements Serializable {
}

View File

@@ -0,0 +1,82 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 05/09/23
*/
public class Persons implements Serializable {
@JsonProperty("local_identifier")
private String local_identifier;
private List<Identifier> identifiers;
@JsonProperty("given_name")
private String given_name;
@JsonProperty("family_name")
private String family_name;
private String agent;
@JsonProperty("declared_affiliations")
private List<Affiliation> declared_affiliations;
private String fullname;
public String getFullname() {
return fullname;
}
public void setFullname(String fullname) {
this.fullname = fullname;
}
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getGiven_name() {
return given_name;
}
public void setGiven_name(String given_name) {
this.given_name = given_name;
}
public String getFamily_name() {
return family_name;
}
public void setFamily_name(String family_name) {
this.family_name = family_name;
}
public String getAgent() {
return agent;
}
public void setAgent(String agent) {
this.agent = agent;
}
public List<Affiliation> getDeclared_affiliations() {
return declared_affiliations;
}
public void setDeclared_affiliations(List<Affiliation> declared_affiliations) {
this.declared_affiliations = declared_affiliations;
}
}

View File

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public enum Prefixes implements Serializable {
RESEARCH_PRODUCT("product_____::"),
ORGANIZATION("organization::"),
GRANT("grant_______::"),
PERSON(
"person______::"),
TEMPORARY_PERSON("temp_person_::"),
DATASOURCE("datasource__::"), TOPIC("temp_topic__::"), VENUE("temp_venue__::");
public final String label;
private Prefixes(String label) {
this.label = label;
}
}
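
These prefixes appear to be the fixed part of the local_identifier values used across the beans. A sketch of composing such an identifier under the assumption that the suffix is an MD5 of the original OpenAIRE id; the real composition rule lives in the dump utilities, not in this file:

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class LocalIdentifierExample {
	// Assumption: prefix label followed by the hex MD5 of the original identifier.
	public static String compose(Prefixes prefix, String openaireId) throws Exception {
		MessageDigest md = MessageDigest.getInstance("MD5");
		StringBuilder suffix = new StringBuilder();
		for (byte b : md.digest(openaireId.getBytes(StandardCharsets.UTF_8)))
			suffix.append(String.format("%02x", b));
		return prefix.label + suffix;
	}
}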

View File

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Provenance implements Serializable {
private String type;
private double trust;
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public double getTrust() {
return trust;
}
public void setTrust(double trust) {
this.trust = trust;
}
}

View File

@@ -0,0 +1,23 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 05/09/23
*/
public enum RelationType implements Serializable {
RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
"hasAuthorInstitution"), DATASOURCE_PROVIDED_BY_ORGANIZATION(
"isProvidedBy"), PROJECT_HAS_PARTICIPANT_ORGANIZATION("hasParticipant"), SUPPLEMENT(
"IsSupplementedBy"), DOCUMENTS(
"IsDocumentedBy"), PART("IsPartOf"), VERSION(
"IsNewVersionOf"), CITATION("Cites"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant");
public final String label;
private RelationType(String label) {
this.label = label;
}
}

View File

@@ -0,0 +1,41 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Relations implements Serializable {
@JsonProperty("relation_type")
private String relation_type;
@JsonProperty("product_list")
private List<MinProduct> product_list;
public static Relations newInstance(String relClass, List<MinProduct> target) {
Relations r = new Relations();
r.relation_type = relClass;
r.product_list = target;
return r;
}
public String getRelation_type() {
return relation_type;
}
public void setRelation_type(String relation_type) {
this.relation_type = relation_type;
}
public List<MinProduct> getProduct_list() {
return product_list;
}
public void setProduct_list(List<MinProduct> product_list) {
this.product_list = product_list;
}
}
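
A short illustrative snippet (not part of the commit) showing how Relations, RelationType and MinProduct fit together when grouping related products under a relation label:

import java.util.Arrays;

public class RelationsExample {
	// Wraps a cited product under the "Cites" relation label.
	public static Relations citation(MinProduct cited) {
		return Relations.newInstance(RelationType.CITATION.label, Arrays.asList(cited));
	}
}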

View File

@@ -0,0 +1,118 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class ResearchProduct implements Serializable {
@JsonProperty("local_identifier")
private String local_identifier;
private List<Identifier> identifiers;
private Map<String, List<String>> titles;
private Map<String, List<String>> abstracts;
@JsonProperty("product_type")
private String product_type;
private List<ResultTopic> topics;
private List<Contribution> contributions;
private List<Manifestation> manifestations;
@JsonProperty("relevant_organizations")
private List<MinOrganization> relevant_organizations;
private List<MinGrant> funding;
@JsonProperty("related_products")
private List<Relations> related_products;
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public Map<String, List<String>> getTitles() {
return titles;
}
public void setTitles(Map<String, List<String>> titles) {
this.titles = titles;
}
public Map<String, List<String>> getAbstracts() {
return abstracts;
}
public void setAbstracts(Map<String, List<String>> abstracts) {
this.abstracts = abstracts;
}
public String getProduct_type() {
return product_type;
}
public void setProduct_type(String product_type) {
this.product_type = product_type;
}
public List<ResultTopic> getTopics() {
return topics;
}
public void setTopics(List<ResultTopic> topics) {
this.topics = topics;
}
public List<Contribution> getContributions() {
return contributions;
}
public void setContributions(List<Contribution> contributions) {
this.contributions = contributions;
}
public List<Manifestation> getManifestations() {
return manifestations;
}
public void setManifestations(List<Manifestation> manifestations) {
this.manifestations = manifestations;
}
public List<MinOrganization> getRelevant_organizations() {
return relevant_organizations;
}
public void setRelevant_organizations(List<MinOrganization> relevant_organizations) {
this.relevant_organizations = relevant_organizations;
}
public List<MinGrant> getFunding() {
return funding;
}
public void setFunding(List<MinGrant> funding) {
this.funding = funding;
}
public List<Relations> getRelated_products() {
return related_products;
}
public void setRelated_products(List<Relations> related_products) {
this.related_products = related_products;
}
}
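
Since the beans rely on @JsonProperty to control the SKG-IF field names, serialization is expected to go through Jackson. A minimal sketch with illustrative values, assuming com.fasterxml.jackson.databind is on the classpath and using "none" as a placeholder language key for the titles map:

import java.util.Arrays;
import java.util.Collections;
import com.fasterxml.jackson.databind.ObjectMapper;

public class ResearchProductSerializationExample {
	public static String toJson() throws Exception {
		ResearchProduct rp = new ResearchProduct();
		rp.setLocal_identifier("product_____::example");
		rp.setIdentifiers(Arrays.asList(Identifier.newInstance("doi", "10.1234/example")));
		rp.setTitles(Collections.singletonMap("none", Arrays.asList("An example title")));
		rp.setProduct_type(ResearchTypes.LITERATURE.label);
		return new ObjectMapper().writeValueAsString(rp);
	}
}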

View File

@@ -0,0 +1,17 @@
package eu.dnetlib.dhp.skgif.model;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public enum ResearchTypes {
LITERATURE("literature"), RESEARCH_DATA("research data"), RESEARCH_SOFTWARE("research software"), OTHER("other");
public final String label;
private ResearchTypes(String label) {
this.label = label;
}
}

View File

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 16/02/24
*/
public class ResultTopic implements Serializable {
private MinTopic topic;
private Provenance provenance;
public MinTopic getTopic() {
return topic;
}
public void setTopic(MinTopic topic) {
this.topic = topic;
}
public Provenance getProvenance() {
return provenance;
}
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
}
}

View File

@@ -0,0 +1,39 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class Topic implements Serializable {
private String local_identifier;
private List<Identifier> identifiers;
private String name;
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}

View File

@@ -0,0 +1,103 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 27/02/24
*/
public class Venue implements Serializable {
private String local_identifier;
private List<Identifier> identifiers;
private String name;
private String acronym;
private String type;
private String publisher;
private String series;
private Boolean is_currently_full_oa;
private String creation_date;
private List<VenueContribution> contributions;
public String getLocal_identifier() {
return local_identifier;
}
public void setLocal_identifier(String local_identifier) {
this.local_identifier = local_identifier;
}
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getSeries() {
return series;
}
public void setSeries(String series) {
this.series = series;
}
public Boolean getIs_currently_full_oa() {
return is_currently_full_oa;
}
public void setIs_currently_full_oa(Boolean is_currently_full_oa) {
this.is_currently_full_oa = is_currently_full_oa;
}
public String getCreation_date() {
return creation_date;
}
public void setCreation_date(String creation_date) {
this.creation_date = creation_date;
}
public List<VenueContribution> getContributions() {
return contributions;
}
public void setContributions(List<VenueContribution> contributions) {
this.contributions = contributions;
}
}

View File

@@ -0,0 +1,31 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 27/02/24
*/
public class VenueContribution implements Serializable {
private String person;
private List<String> roles;
public String getPerson() {
return person;
}
public void setPerson(String person) {
this.person = person;
}
public List<String> getRoles() {
return roles;
}
public void setRoles(List<String> roles) {
this.roles = roles;
}
}

View File

@@ -0,0 +1,16 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
public enum VenueIdentifierType implements Serializable {
EISSN("eissn"), ISSN("issn"), LISSN("lissn"), ISBN("isbn"), OPENDOAR(
"opendoar"), R3DATA("re3data.org"), FAIRSHARING("fairsharing");
public final String label;
private VenueIdentifierType(String label) {
this.label = label;
}
}

View File

@@ -0,0 +1,16 @@
package eu.dnetlib.dhp.skgif.model;
import java.io.Serializable;
public enum VenueType implements Serializable {
REPOSITORY("repository"), JOURNAL("journal"), CONFERENCE("conference"), BOOK("book"), OTHER(
"other"), UNKNOWN("unknown");
public final String label;
private VenueType(String label) {
this.label = label;
}
}

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.common;
+package eu.dnetlib.dhp.common;
import java.io.BufferedInputStream;
import java.io.IOException;

View File

@@ -15,7 +15,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.oa.common.MakeTarArchive;
+import eu.dnetlib.dhp.common.MakeTarArchive;
public class MakeTar implements Serializable {

View File

@@ -1,793 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump;
import static eu.dnetlib.dhp.oa.graph.dump.Constants.*;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.*;
import eu.dnetlib.dhp.oa.model.AccessRight;
import eu.dnetlib.dhp.oa.model.Author;
import eu.dnetlib.dhp.oa.model.GeoLocation;
import eu.dnetlib.dhp.oa.model.Instance;
import eu.dnetlib.dhp.oa.model.OpenAccessColor;
import eu.dnetlib.dhp.oa.model.OpenAccessRoute;
import eu.dnetlib.dhp.oa.model.Result;
import eu.dnetlib.dhp.oa.model.Subject;
import eu.dnetlib.dhp.oa.model.community.CfHbKeyValue;
import eu.dnetlib.dhp.oa.model.community.CommunityInstance;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.oa.model.community.Context;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class ResultMapper implements Serializable {
private static final String NULL = "null";
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in, Map<String, String> communityMap, String dumpType)
throws NoAvailableEntityTypeException, CardinalityTooHighException {
Result out;
if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
out = new GraphResult();
} else {
out = new CommunityResult();
}
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
try {
addTypeSpecificInformation(out, input, ort.get());
mapAuthor(out, input);
mapAccessRight(out, input);
mapContributor(out, input);
mapCountry(out, input);
mapCoverage(out, input);
out.setDateofcollection(input.getDateofcollection());
out.setGreen(input.getIsGreen());
out.setInDiamondJournal(input.getIsInDiamondJournal());
out.setPubliclyFunded(input.getPubliclyFunded());
mapOpenAccessColor(out, input);
mapDescription(out, input);
mapEmbargo(out, input);
mapFormat(out, input);
out.setId(getEntityId(input.getId(), ENTITY_ID_SEPARATOR));
mapOriginalId(out, input);
mapInstance(dumpType, out, input);
mapLanguage(out, input);
mapLastUpdateTimestamp(out, input);
mapTitle(out, input);
mapPid(out, input);
mapDateOfAcceptance(out, input);
mapPublisher(out, input);
mapSource(out, input);
mapSubject(out, input);
out.setType(input.getResulttype().getClassid());
mapMeasure(out, input);
if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
mapCollectedfrom((CommunityResult) out, input);
mapContext(communityMap, (CommunityResult) out, input);
}
} catch (ClassCastException cce) {
return null;
}
}
return out;
}
private static void mapOpenAccessColor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getOpenAccessColor()).isPresent())
switch (input.getOpenAccessColor()) {
case bronze:
out.setOpenAccessColor(OpenAccessColor.bronze);
break;
case gold:
out.setOpenAccessColor(OpenAccessColor.gold);
break;
case hybrid:
out.setOpenAccessColor(OpenAccessColor.hybrid);
break;
}
}
private static void mapContext(Map<String, String> communityMap, CommunityResult out,
eu.dnetlib.dhp.schema.oaf.Result input) {
Set<String> communities = communityMap.keySet();
List<Context> contextList = Optional
.ofNullable(
input
.getContext())
.map(
value -> value
.stream()
.map(c -> {
String communityId = c.getId();
if (communityId.contains("::")) {
communityId = communityId.substring(0, communityId.indexOf("::"));
}
if (communities.contains(communityId)) {
Context context = new Context();
context.setCode(communityId);
context.setLabel(communityMap.get(communityId));
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
if (dataInfo.isPresent()) {
List<Provenance> provenance = new ArrayList<>();
provenance
.addAll(
dataInfo
.get()
.stream()
.map(
di -> Optional
.ofNullable(di.getProvenanceaction())
.map(
provenanceaction -> Provenance
.newInstance(
provenanceaction.getClassname(),
di.getTrust()))
.orElse(null))
.filter(Objects::nonNull)
.collect(Collectors.toSet()));
try {
context.setProvenance(getUniqueProvenance(provenance));
} catch (NoAvailableEntityTypeException e) {
e.printStackTrace();
}
}
return context;
}
return null;
})
.filter(Objects::nonNull)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
if (!contextList.isEmpty()) {
Set<Integer> hashValue = new HashSet<>();
List<Context> remainigContext = new ArrayList<>();
contextList.forEach(c -> {
if (!hashValue.contains(c.hashCode())) {
remainigContext.add(c);
hashValue.add(c.hashCode());
}
});
out.setContext(remainigContext);
}
}
private static void mapCollectedfrom(CommunityResult out, eu.dnetlib.dhp.schema.oaf.Result input) {
out
.setCollectedfrom(
input
.getCollectedfrom()
.stream()
.map(cf -> CfHbKeyValue.newInstance(getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), cf.getValue()))
.collect(Collectors.toList()));
}
private static void mapMeasure(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
if (Optional.ofNullable(input.getMeasures()).isPresent() && input.getMeasures().size() > 0) {
out.setIndicators(Utils.getIndicator(input.getMeasures()));
}
}
private static void mapSubject(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
List<Subject> subjectList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.stream()
// .filter(
// s -> !((s.getQualifier().getClassid().equalsIgnoreCase("fos") &&
// Optional.ofNullable(s.getDataInfo()).isPresent()
// && Optional.ofNullable(s.getDataInfo().getProvenanceaction()).isPresent() &&
// s.getDataInfo().getProvenanceaction().getClassid().equalsIgnoreCase("subject:fos"))
// ||
// (s.getQualifier().getClassid().equalsIgnoreCase("sdg") &&
// Optional.ofNullable(s.getDataInfo()).isPresent()
// && Optional.ofNullable(s.getDataInfo().getProvenanceaction()).isPresent() &&
// s
// .getDataInfo()
// .getProvenanceaction()
// .getClassid()
// .equalsIgnoreCase("subject:sdg"))))
.filter(s -> !s.getValue().equalsIgnoreCase(NULL))
.forEach(s -> subjectList.add(getSubject(s))));
out.setSubjects(subjectList);
}
private static void mapSource(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getSource())
.ifPresent(
value -> out.setSource(value.stream().map(Field::getValue).collect(Collectors.toList())));
}
private static void mapPublisher(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<Field<String>> oStr;
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
out.setPublisher(oStr.get().getValue());
}
}
private static void mapDateOfAcceptance(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<Field<String>> oStr;
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
out.setPublicationdate(oStr.get().getValue());
}
}
private static void mapPid(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> out
.setPid(
value
.stream()
.map(
p -> ResultPid
.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
}
private static void mapTitle(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
}
private static void mapLastUpdateTimestamp(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
out.setLastupdatetimestamp(oLong.get());
}
}
private static void mapLanguage(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
Qualifier language = oL.get();
out.setLanguage(Language.newInstance(language.getClassid(), language.getClassname()));
}
}
private static void mapInstance(String dumpType, Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
.ofNullable(input.getInstance());
if (oInst.isPresent()) {
if (DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
((GraphResult) out)
.setInstance(
oInst.get().stream().map(ResultMapper::getGraphInstance).collect(Collectors.toList()));
} else {
((CommunityResult) out)
.setInstance(
oInst
.get()
.stream()
.map(ResultMapper::getCommunityInstance)
.collect(Collectors.toList()));
}
}
}
private static void mapOriginalId(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
out.setOriginalId(new ArrayList<>());
Optional
.ofNullable(input.getOriginalId())
.ifPresent(
v -> out
.setOriginalId(
input
.getOriginalId()
.stream()
.filter(s -> !s.startsWith("50|"))
.collect(Collectors.toList())));
}
private static void mapFormat(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
}
private static void mapEmbargo(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
}
private static void mapDescription(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
}
private static void mapCoverage(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
}
private static void mapCountry(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
ResultCountry country = new ResultCountry();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
}
private static void mapContributor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
}
private static void mapAccessRight(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
// I do not map Access Right UNKNOWN or OTHER
Optional<Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent() && Constants.ACCESS_RIGHTS_COAR_MAP.containsKey(oar.get().getClassid())) {
String code = Constants.ACCESS_RIGHTS_COAR_MAP.get(oar.get().getClassid());
out
.setBestaccessright(
BestAccessRight
.newInstance(
code,
Constants.COAR_CODE_LABEL_MAP.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
private static void mapAuthor(Result out, eu.dnetlib.dhp.schema.oaf.Result input) {
Optional
.ofNullable(input.getAuthor())
.ifPresent(
ats -> out.setAuthor(ats.stream().map(ResultMapper::getAuthor).collect(Collectors.toList())));
}
private static void addTypeSpecificInformation(Result out, eu.dnetlib.dhp.schema.oaf.Result input,
eu.dnetlib.dhp.schema.oaf.Qualifier ort) throws NoAvailableEntityTypeException {
switch (ort.getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
Container c = new Container();
c.setConferencedate(j.getConferencedate());
c.setConferenceplace(j.getConferenceplace());
c.setEdition(j.getEdition());
c.setEp(j.getEp());
c.setIss(j.getIss());
c.setIssnLinking(j.getIssnLinking());
c.setIssnOnline(j.getIssnOnline());
c.setIssnPrinted(j.getIssnPrinted());
c.setName(j.getName());
c.setSp(j.getSp());
c.setVol(j.getVol());
out.setContainer(c);
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
}
break;
case "dataset":
Dataset id = (Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out
.setGeolocation(
Optional
.ofNullable(id.getGeolocation())
.map(
igl -> igl
.stream()
.filter(Objects::nonNull)
.map(gli -> {
GeoLocation gl = new GeoLocation();
gl.setBox(gli.getBox());
gl.setPlace(gli.getPlace());
gl.setPoint(gli.getPoint());
return gl;
})
.collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
break;
case "software":
Software is = (Software) input;
Optional
.ofNullable(is.getCodeRepositoryUrl())
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
Optional
.ofNullable(is.getDocumentationUrl())
.ifPresent(
value -> out
.setDocumentationUrl(
value
.stream()
.map(Field::getValue)
.collect(Collectors.toList())));
Optional
.ofNullable(is.getProgrammingLanguage())
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
break;
case "other":
OtherResearchProduct ir = (OtherResearchProduct) input;
out
.setContactgroup(
Optional
.ofNullable(ir.getContactgroup())
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
.orElse(null));
out
.setContactperson(
Optional
.ofNullable(ir.getContactperson())
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
.orElse(null));
out
.setTool(
Optional
.ofNullable(ir.getTool())
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
break;
default:
throw new NoAvailableEntityTypeException();
}
}
private static Instance getGraphInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
Instance instance = new Instance();
setCommonValue(i, instance);
return instance;
}
private static CommunityInstance getCommunityInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
CommunityInstance instance = new CommunityInstance();
setCommonValue(i, instance);
if (Optional.ofNullable(i.getCollectedfrom()).isPresent() &&
Optional.ofNullable(i.getCollectedfrom().getKey()).isPresent() &&
StringUtils.isNotBlank(i.getCollectedfrom().getKey()))
instance
.setCollectedfrom(
CfHbKeyValue
.newInstance(
getEntityId(i.getCollectedfrom().getKey(), ENTITY_ID_SEPARATOR),
i.getCollectedfrom().getValue()));
if (Optional.ofNullable(i.getHostedby()).isPresent() &&
Optional.ofNullable(i.getHostedby().getKey()).isPresent() &&
StringUtils.isNotBlank(i.getHostedby().getKey()))
instance
.setHostedby(
CfHbKeyValue
.newInstance(
getEntityId(i.getHostedby().getKey(), ENTITY_ID_SEPARATOR), i.getHostedby().getValue()));
return instance;
}
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
Optional<eu.dnetlib.dhp.schema.oaf.AccessRight> opAr = Optional.ofNullable(i.getAccessright());
if (opAr.isPresent() && Constants.ACCESS_RIGHTS_COAR_MAP.containsKey(opAr.get().getClassid())) {
String code = Constants.ACCESS_RIGHTS_COAR_MAP.get(opAr.get().getClassid());
instance
.setAccessright(
AccessRight
.newInstance(
code,
Constants.COAR_CODE_LABEL_MAP.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
if (opAr.get().getOpenAccessRoute() != null) {
switch (opAr.get().getOpenAccessRoute()) {
case hybrid:
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.hybrid);
break;
case gold:
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.gold);
break;
case green:
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.green);
break;
case bronze:
instance.getAccessright().setOpenAccessRoute(OpenAccessRoute.bronze);
break;
}
}
}
Optional
.ofNullable(i.getPid())
.ifPresent(
pid -> instance
.setPid(
pid
.stream()
.map(p -> ResultPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
Optional
.ofNullable(i.getAlternateIdentifier())
.ifPresent(
ai -> instance
.setAlternateIdentifier(
ai
.stream()
.map(p -> AlternateIdentifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
Optional
.ofNullable(i.getLicense())
.ifPresent(value -> instance.setLicense(value.getValue()));
Optional
.ofNullable(i.getDateofacceptance())
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
Optional
.ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getClassname()));
Optional
.ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
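// The article processing charge is dumped only when both the amount and the currency are present and non blank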
Optional<Field<String>> oPca = Optional.ofNullable(i.getProcessingchargeamount());
Optional<Field<String>> oPcc = Optional.ofNullable(i.getProcessingchargecurrency());
if (oPca.isPresent() && oPcc.isPresent()) {
Field<String> pca = oPca.get();
Field<String> pcc = oPcc.get();
if (!pca.getValue().trim().equals("") && !pcc.getValue().trim().equals("")) {
APC apc = new APC();
apc.setCurrency(oPcc.get().getValue());
apc.setAmount(oPca.get().getValue());
instance.setArticleprocessingcharge(apc);
}
}
}
private static List<Provenance> getUniqueProvenance(List<Provenance> provenance)
throws NoAvailableEntityTypeException {
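// For each provenance type (harvested, inferred, user claim) keep only the entry with the highest trust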
Provenance iProv = new Provenance();
Provenance hProv = new Provenance();
Provenance lProv = new Provenance();
for (Provenance p : provenance) {
switch (p.getProvenance()) {
case Constants.HARVESTED:
hProv = getHighestTrust(hProv, p);
break;
case Constants.INFERRED:
iProv = getHighestTrust(iProv, p);
// To be removed as soon as the new beta run has been done
// this fixes the issue of the trust not being set during bulktagging
if (StringUtils.isEmpty(iProv.getTrust())) {
iProv.setTrust(Constants.DEFAULT_TRUST);
}
break;
case Constants.USER_CLAIM:
lProv = getHighestTrust(lProv, p);
break;
default:
throw new NoAvailableEntityTypeException();
}
}
return Arrays
.asList(iProv, hProv, lProv)
.stream()
.filter(p -> !StringUtils.isEmpty(p.getProvenance()))
.collect(Collectors.toList());
}
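// Returns the provenance with the greater trust; when only one of the two trust values is set, that one wins.
// Note: trust values are compared lexicographically as strings, which assumes they are serialized in a uniform format (assumption, not verified here)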
private static Provenance getHighestTrust(Provenance hProv, Provenance p) {
if (StringUtils.isNoneEmpty(hProv.getTrust(), p.getTrust()))
return hProv.getTrust().compareTo(p.getTrust()) > 0 ? hProv : p;
return (StringUtils.isEmpty(p.getTrust()) && !StringUtils.isEmpty(hProv.getTrust())) ? hProv : p;
}
private static Subject getSubject(StructuredProperty s) {
Subject subject = new Subject();
subject.setSubject(SubjectSchemeValue.newInstance(s.getQualifier().getClassid(), s.getValue()));
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
if (di.isPresent()) {
Provenance p = new Provenance();
p.setProvenance(di.get().getProvenanceaction().getClassname());
if (!s.getQualifier().getClassid().equalsIgnoreCase("fos") &&
!s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
p.setTrust(di.get().getTrust());
subject.setProvenance(p);
}
return subject;
}
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional<List<StructuredProperty>> oPids = Optional
.ofNullable(oa.getPid());
if (oPids.isPresent()) {
AuthorPid pid = getOrcid(oPids.get());
if (pid != null) {
a.setPid(pid);
}
}
return a;
}
private static AuthorPid getAuthorPid(StructuredProperty pid) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return AuthorPid
.newInstance(
AuthorPidSchemeValue
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue()),
Provenance
.newInstance(
di.get().getProvenanceaction().getClassname(),
di.get().getTrust()));
} else {
return AuthorPid
.newInstance(
AuthorPidSchemeValue
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue())
);
}
}
private static AuthorPid getOrcid(List<StructuredProperty> p) {
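// Selects the author pid to dump: if a single ORCID-like pid is present it is used, otherwise an authoritative orcid
// is preferred over an orcid_pending; returns null when no unambiguous choice can be made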
List<StructuredProperty> pidList = p
.stream()
.filter(
pid -> pid.getQualifier().getClassid().equals(ModelConstants.ORCID) ||
pid.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING))
.collect(Collectors.toList());
if (pidList.size() == 1) {
return getAuthorPid(pidList.get(0));
}
List<StructuredProperty> orcid = pidList
.stream()
.filter(
ap -> ap
.getQualifier()
.getClassid()
.equals(ModelConstants.ORCID))
.collect(Collectors.toList());
if (orcid.size() == 1) {
return getAuthorPid(orcid.get(0));
}
orcid = pidList
.stream()
.filter(
ap -> ap
.getQualifier()
.getClassid()
.equals(ModelConstants.ORCID_PENDING))
.collect(Collectors.toList());
if (orcid.size() == 1) {
return getAuthorPid(orcid.get(0));
}
return null;
}
}

View File

@ -1,91 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* This class connects with the community APIs for production. It saves the information about the
* context that will guide the dump of the results. The information saved is a HashMap. The key is the id of a community
* - research infrastructure/initiative, the value is the label of the research community - research
* infrastructure/initiative.
*/
public class SaveCommunityMap implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SaveCommunityMap.class);
private final transient UtilCommunityAPI queryInformationSystem;
private final transient BufferedWriter writer;
public SaveCommunityMap(String hdfsPath, String hdfsNameNode) throws IOException {
final Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
Path hdfsWritePath = new Path(hdfsPath);
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
queryInformationSystem = new UtilCommunityAPI();
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
}
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SaveCommunityMap.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_cm_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String nameNode = parser.get("nameNode");
log.info("nameNode: {}", nameNode);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final Boolean singleCommunity = Optional
.ofNullable(parser.get("singleDeposition"))
.map(Boolean::valueOf)
.orElse(false);
final String community_id = Optional.ofNullable(parser.get("communityId")).orElse(null);
final SaveCommunityMap scm = new SaveCommunityMap(outputPath, nameNode);
scm.saveCommunityMap(singleCommunity, community_id);
}
private void saveCommunityMap(boolean singleCommunity, String communityId)
throws IOException {
final String communityMapString = Utils.OBJECT_MAPPER
.writeValueAsString(queryInformationSystem.getCommunityMap(singleCommunity, communityId));
log.info("communityMap {} ", communityMapString);
writer
.write(
communityMapString);
writer.close();
}
}
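A minimal illustration of the file written by this job: a single JSON serialization of the CommunityMap, one entry per community (the identifiers and labels below are hypothetical, not taken from the production APIs):
{"community-id":"Community label","other-community-id":"Other community label"}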

View File

@ -1,203 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump;
import static eu.dnetlib.dhp.utils.DHPUtils.MAPPER;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.communityapi.model.*;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.complete.ContextInfo;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
import eu.dnetlib.dhp.utils.DHPUtils;
public class UtilCommunityAPI {
private static final Logger log = LoggerFactory.getLogger(UtilCommunityAPI.class);
public CommunityMap getCommunityMap(boolean singleCommunity, String communityId)
throws IOException {
if (singleCommunity)
return getMap(Arrays.asList(getCommunity(communityId)));
return getMap(getValidCommunities());
}
private CommunityMap getMap(List<CommunityModel> communities) {
final CommunityMap map = new CommunityMap();
communities.forEach(c -> map.put(c.getId(), c.getName()));
return map;
}
public List<String> getCommunityCsv(List<String> comms) {
return comms.stream().map(c -> {
try {
CommunityModel community = getCommunity(c);
StringBuilder builder = new StringBuilder();
builder.append(DHPUtils.md5(community.getId()));
builder.append(Constants.SEP);
builder.append(community.getName());
builder.append(Constants.SEP);
builder.append(community.getId());
builder.append(Constants.SEP);
builder
.append(
community.getDescription());
return builder.toString();
} catch (IOException e) {
throw new RuntimeException(e);
}
}).collect(Collectors.toList());
}
private List<CommunityModel> getValidCommunities() throws IOException {
ObjectMapper mapper = new ObjectMapper();
return mapper
.readValue(eu.dnetlib.dhp.communityapi.QueryCommunityAPI.communities(), CommunitySummary.class)
.stream()
.filter(
community -> (community.getStatus().equals("all") || community.getStatus().equalsIgnoreCase("public"))
&&
(community.getType().equals("ri") || community.getType().equals("community")))
.collect(Collectors.toList());
}
private CommunityModel getCommunity(String id) throws IOException {
ObjectMapper mapper = new ObjectMapper();
return mapper
.readValue(eu.dnetlib.dhp.communityapi.QueryCommunityAPI.community(id), CommunityModel.class);
}
public List<ContextInfo> getContextInformation() throws IOException {
return getValidCommunities()
.stream()
.map(c -> getContext(c))
.collect(Collectors.toList());
}
public ContextInfo getContext(CommunityModel c) {
ContextInfo cinfo = new ContextInfo();
cinfo.setId(c.getId());
cinfo.setDescription(c.getDescription());
CommunityModel cm = null;
try {
cm = getCommunity(c.getId());
} catch (IOException e) {
throw new RuntimeException(e);
}
cinfo.setSubject(new ArrayList<>());
cinfo.getSubject().addAll(cm.getSubjects());
cinfo.setZenodocommunity(c.getZenodoCommunity());
cinfo.setType(c.getType());
return cinfo;
}
public List<ContextInfo> getContextRelation() throws IOException {
return getValidCommunities().stream().map(c -> {
ContextInfo cinfo = new ContextInfo();
cinfo.setId(c.getId());
cinfo.setDatasourceList(getDatasourceList(c.getId()));
cinfo.setProjectList(getProjectList(c.getId()));
return cinfo;
}).collect(Collectors.toList());
}
private List<String> getDatasourceList(String id) {
List<String> datasourceList = new ArrayList<>();
try {
new ObjectMapper()
.readValue(
eu.dnetlib.dhp.communityapi.QueryCommunityAPI.communityDatasource(id),
DatasourceList.class)
.stream()
.forEach(ds -> {
if (Optional.ofNullable(ds.getOpenaireId()).isPresent()) {
datasourceList.add(ds.getOpenaireId());
}
});
} catch (IOException e) {
throw new RuntimeException(e);
}
return datasourceList;
}
private List<String> getProjectList(String id) {
int page = -1;
int size = 100;
ContentModel cm = null;
ArrayList<String> projectList = new ArrayList<>();
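// page through the community projects endpoint, 'size' items at a time, until the API reports the last page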
do {
page++;
try {
cm = new ObjectMapper()
.readValue(
eu.dnetlib.dhp.communityapi.QueryCommunityAPI
.communityProjects(
id, String.valueOf(page), String.valueOf(size)),
ContentModel.class);
if (cm.getContent().size() > 0) {
cm.getContent().forEach(p -> {
if (Optional.ofNullable(p.getOpenaireId()).isPresent())
projectList.add(p.getOpenaireId());
});
}
} catch (IOException e) {
throw new RuntimeException(e);
}
} while (!cm.getLast());
return projectList;
}
/**
* it returns for each organization the list of associated communities
*/
public CommunityEntityMap getCommunityOrganization() throws IOException {
CommunityEntityMap organizationMap = new CommunityEntityMap();
getValidCommunities()
.forEach(community -> {
String id = community.getId();
try {
List<String> associatedOrgs = MAPPER
.readValue(
eu.dnetlib.dhp.communityapi.QueryCommunityAPI.communityPropagationOrganization(id),
OrganizationList.class);
associatedOrgs.forEach(o -> {
if (!organizationMap
.keySet()
.contains(o))
organizationMap.put(o, new ArrayList<>());
organizationMap.get(o).add(community.getId());
});
} catch (IOException e) {
throw new RuntimeException(e);
}
});
return organizationMap;
}
}

View File

@ -1,200 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump;
import static eu.dnetlib.dhp.oa.graph.dump.Constants.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.jetbrains.annotations.NotNull;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.complete.Constants;
import eu.dnetlib.dhp.oa.model.Indicator;
import eu.dnetlib.dhp.oa.model.Score;
import eu.dnetlib.dhp.oa.model.UsageCounts;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.oa.model.graph.Relation;
import eu.dnetlib.dhp.oa.model.graph.ResearchCommunity;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class Utils {
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final String ENTITY_ID_SEPARATOR = "|";
private Utils() {
}
public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
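// Builds the identifier used for context entities in the dump: the fixed context namespace prefix followed by the md5 of the context id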
public static String getContextId(String id) {
return String
.format(
"%s::%s", Constants.CONTEXT_NS_PREFIX,
DHPUtils.md5(id));
}
public static CommunityMap getCommunityMap(SparkSession spark, String communityMapPath) {
return new Gson().fromJson(spark.read().textFile(communityMapPath).collectAsList().get(0), CommunityMap.class);
}
public static CommunityMap readCommunityMap(FileSystem fileSystem, String communityMapPath) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(fileSystem.open(new Path(communityMapPath))));
StringBuilder sb = new StringBuilder();
try {
String line;
while ((line = br.readLine()) != null) {
sb.append(line);
}
} finally {
br.close();
}
return new Gson().fromJson(sb.toString(), CommunityMap.class);
}
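// Strips the prefix preceding the separator from an OpenAIRE identifier.
// Illustrative example (the identifier below is made up): getEntityId("50|doi_________::ab12cd", "|") returns "doi_________::ab12cd"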
public static String getEntityId(String id, String separator) {
return id.substring(id.indexOf(separator) + 1);
}
public static Dataset<String> getEntitiesId(SparkSession spark, String inputPath) {
Dataset<String> dumpedIds = Utils
.readPath(spark, inputPath + "/publication", GraphResult.class)
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING())
.union(
Utils
.readPath(spark, inputPath + "/dataset", GraphResult.class)
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/software", GraphResult.class)
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/otherresearchproduct", GraphResult.class)
.map((MapFunction<GraphResult, String>) r -> r.getId(), Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/organization", eu.dnetlib.dhp.oa.model.graph.Organization.class)
.map(
(MapFunction<eu.dnetlib.dhp.oa.model.graph.Organization, String>) o -> o.getId(),
Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.oa.model.graph.Project.class)
.map(
(MapFunction<eu.dnetlib.dhp.oa.model.graph.Project, String>) o -> o.getId(), Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/datasource", eu.dnetlib.dhp.oa.model.graph.Datasource.class)
.map(
(MapFunction<eu.dnetlib.dhp.oa.model.graph.Datasource, String>) o -> o.getId(),
Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/communities_infrastructures", ResearchCommunity.class)
.map((MapFunction<ResearchCommunity, String>) c -> c.getId(), Encoders.STRING()));
return dumpedIds;
}
public static Dataset<Relation> getValidRelations(Dataset<Relation> relations,
Dataset<String> entitiesIds) {
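// Keeps only the relations whose source AND target identifiers appear among the dumped entity ids (two successive joins, first on the source, then on the target)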
Dataset<Tuple2<String, Relation>> relationSource = relations
.map(
(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getSource(), r),
Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
Dataset<Tuple2<String, Relation>> relJoinSource = relationSource
.joinWith(entitiesIds, relationSource.col("_1").equalTo(entitiesIds.col("value")))
.map(
(MapFunction<Tuple2<Tuple2<String, Relation>, String>, Tuple2<String, Relation>>) t2 -> new Tuple2<>(
t2._1()._2().getTarget(), t2._1()._2()),
Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)));
return relJoinSource
.joinWith(entitiesIds, relJoinSource.col("_1").equalTo(entitiesIds.col("value")))
.map(
(MapFunction<Tuple2<Tuple2<String, Relation>, String>, Relation>) t2 -> t2._1()._2(),
Encoders.bean(Relation.class));
}
public static Indicator getIndicator(List<Measure> measures) {
Indicator i = new Indicator();
for (eu.dnetlib.dhp.schema.oaf.Measure m : measures) {
switch (m.getId()) {
case USAGE_COUNT_DOWNLOADS:
getUsageCounts(i).setDownloads(m.getUnit().get(0).getValue());
break;
case USAGE_COUNT_VIEWS:
getUsageCounts(i).setViews(m.getUnit().get(0).getValue());
break;
default:
getImpactMeasure(i).add(getScore(m.getId(), m.getUnit()));
break;
}
}
return i;
}
@NotNull
private static UsageCounts getUsageCounts(Indicator i) {
if (i.getUsageCounts() == null) {
i.setUsageCounts(new UsageCounts());
}
return i.getUsageCounts();
}
@NotNull
private static List<Score> getImpactMeasure(Indicator i) {
if (i.getBipIndicators() == null) {
i.setBipIndicators(new ArrayList<>());
}
return i.getBipIndicators();
}
private static Score getScore(String indicator, List<KeyValue> unit) {
Score s = new Score();
s.setIndicator(indicator);
for (KeyValue u : unit) {
if (u.getKey().equals("score")) {
s.setScore(u.getValue());
} else {
s.setClazz(u.getValue());
}
}
return s;
}
}

View File

@ -1,8 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.HashMap;
public class CommunityMap extends HashMap<String, String> implements Serializable {
}

View File

@ -1,75 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.oa.model.community.Context;
/**
* This class splits the dumped results according to the research community - research initiative/infrastructure they
* are related to. The information about the community is found in the element "context.id" in the result. Since the
* context found in a result can be associated not only to communities, a community Map is provided: it
* will guide the splitting process. Note the repartition(1) just before writing the results related to a community:
* this is a choice due to uploading constraints (just one file for each community). As soon as a better solution is
* in place, the repartition can be removed.
*/
public class CommunitySplit implements Serializable {
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath) {
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
execSplit(spark, inputPath, outputPath, communityMap);
});
}
private static void execSplit(SparkSession spark, String inputPath, String outputPath,
CommunityMap communities) {
Dataset<CommunityResult> result = Utils
.readPath(spark, inputPath + "/publication", CommunityResult.class)
.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
.union(Utils.readPath(spark, inputPath + "/orp", CommunityResult.class))
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
communities
.keySet()
.stream()
.parallel()
.forEach(c -> {
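// keep only the results whose context list contains the community code c and write them, gzip compressed, in a folder named after the community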
result
.filter(
(FilterFunction<CommunityResult>) r -> Optional.ofNullable(r.getContext()).isPresent() &&
r.getContext().stream().anyMatch(con -> con.getCode().equals(c)))
.map(
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(outputPath + "/" + c.replace(" ", "_"));
});
}
}

View File

@ -1,28 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.oa.model.community.Project;
public class ResultProject implements Serializable {
private String resultId;
private List<Project> projectsList;
public String getResultId() {
return resultId;
}
public void setResultId(String resultId) {
this.resultId = resultId;
}
public List<Project> getProjectsList() {
return projectsList;
}
public void setProjectsList(List<Project> projectsList) {
this.projectsList = projectsList;
}
}

View File

@ -1,155 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Spark action to trigger the dump of results associated to a research community - research initiative/infrastructure. The
* actual dump is performed via the class DumpProducts, which is also used for the entire graph dump
*/
public class SparkDumpCommunityProducts implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpCommunityProducts.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
String communityMapPath = Optional
.ofNullable(parser.get("communityMapPath"))
.orElse(null);
String dumpType = Optional
.ofNullable(parser.get("dumpType"))
.orElse(null);
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
resultDump(
spark, inputPath, outputPath, communityMapPath, inputClazz, dumpType);
});
}
public static <I extends OafEntity> void resultDump(
SparkSession spark,
String inputPath,
String outputPath,
String communityMapPath,
Class<I> inputClazz,
String dumpType) {
CommunityMap communityMap = null;
if (!StringUtils.isEmpty(communityMapPath))
communityMap = Utils.getCommunityMap(spark, communityMapPath);
CommunityMap finalCommunityMap = communityMap;
Utils
.readPath(spark, inputPath, inputClazz)
.map(
(MapFunction<I, CommunityResult>) value -> execMap(value, finalCommunityMap, dumpType),
Encoders.bean(CommunityResult.class))
.filter((FilterFunction<CommunityResult>) value -> value != null)
.map(
(MapFunction<CommunityResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(outputPath);
}
private static <I extends OafEntity, O extends eu.dnetlib.dhp.oa.model.Result> O execMap(I value,
CommunityMap communityMap, String dumpType) throws NoAvailableEntityTypeException, CardinalityTooHighException {
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
if (Boolean.FALSE.equals(odInfo.isPresent())) {
return null;
}
if (Boolean.TRUE.equals(odInfo.get().getDeletedbyinference())
|| Boolean.TRUE.equals(odInfo.get().getInvisible())) {
return null;
}
if (StringUtils.isEmpty(dumpType)) {
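// community dump: keep the result only if at least one of its contexts matches a community in the map; context ids of the form "community::subcategory" are matched on the prefix preceding "::"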
Set<String> communities = communityMap.keySet();
Optional<List<Context>> inputContext = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Result) value).getContext());
if (!inputContext.isPresent()) {
return null;
}
List<String> toDumpFor = inputContext.get().stream().map(c -> {
if (communities.contains(c.getId())) {
return c.getId();
}
if (c.getId().contains("::") && communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
return c.getId().substring(0, c.getId().indexOf("::"));
}
return null;
}).filter(Objects::nonNull).collect(Collectors.toList());
if (toDumpFor.isEmpty()) {
return null;
}
}
return (O) ResultMapper.map(value, communityMap, Constants.DUMPTYPE.COMMUNITY.getType());
}
}

View File

@ -1,206 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.community.Funder;
import eu.dnetlib.dhp.oa.model.community.Project;
import eu.dnetlib.dhp.oa.model.community.Validated;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
* Preparation of the Project information to be added to the dumped results. For each result associated to at least one
* Project, an instance of the ResultProject class is serialized. ResultProject contains the resultId and the
* list of Projects (as in eu.dnetlib.dhp.schema.dump.oaf.community.Project) it is associated to
*/
public class SparkPrepareResultProject implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkPrepareResultProject.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_prep_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
Boolean substring = Optional
.ofNullable(parser.get("substring"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
prepareResultProjectList(spark, inputPath, outputPath, substring);
});
}
private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath,
Boolean substring) {
Dataset<Relation> relation = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equalsIgnoreCase(ModelConstants.IS_PRODUCED_BY));
Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);
projects
.joinWith(relation, projects.col("id").equalTo(relation.col("target")), "inner")
.groupByKey(
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value
._2()
.getSource(),
Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s,
it) -> {
Set<String> projectSet = new HashSet<>();
Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
ResultProject rp = new ResultProject();
if (substring)
rp.setResultId(getEntityId(s, ENTITY_ID_SEPARATOR));
else
rp.setResultId(s);
eu.dnetlib.dhp.schema.oaf.Project p = first._1();
projectSet.add(p.getId());
Project ps = getProject(p, first._2);
List<Project> projList = new ArrayList<>();
projList.add(ps);
rp.setProjectsList(projList);
it.forEachRemaining(c -> {
eu.dnetlib.dhp.schema.oaf.Project op = c._1();
if (!projectSet.contains(op.getId())) {
projList
.add(getProject(op, c._2));
projectSet.add(op.getId());
}
});
return rp;
}, Encoders.bean(ResultProject.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static Project getProject(eu.dnetlib.dhp.schema.oaf.Project op, Relation relation) {
Project p = Project
.newInstance(
getEntityId(op.getId(), ENTITY_ID_SEPARATOR),
op.getCode().getValue(),
Optional
.ofNullable(op.getAcronym())
.map(Field::getValue)
.orElse(null),
Optional
.ofNullable(op.getTitle())
.map(Field::getValue)
.orElse(null),
Optional
.ofNullable(op.getFundingtree())
.map(value -> {
List<Funder> tmp = value
.stream()
.map(ft -> getFunder(ft.getValue()))
.collect(Collectors.toList());
if (!tmp.isEmpty()) {
return tmp.get(0);
} else {
return null;
}
})
.orElse(null));
Optional<DataInfo> di = Optional.ofNullable(op.getDataInfo());
Provenance provenance = new Provenance();
if (di.isPresent()) {
provenance.setProvenance(di.get().getProvenanceaction().getClassname());
provenance.setTrust(di.get().getTrust());
p.setProvenance(provenance);
}
if (Boolean.TRUE.equals(relation.getValidated())) {
p.setValidated(Validated.newInstance(relation.getValidated(), relation.getValidationDate()));
}
return p;
}
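// getFunder extracts the funder information and the first funding stream from the fundingtree XML.
// Illustrative (not normative) shape of the input, consistent with the XPath expressions used below:
// <fundingtree>
//   <funder><shortname>EC</shortname><name>European Commission</name><jurisdiction>EU</jurisdiction></funder>
//   <funding_level_0><name>H2020</name></funding_level_0>
// </fundingtree>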
private static Funder getFunder(String fundingtree) {
final Funder f = new Funder();
final Document doc;
try {
final SAXReader reader = new SAXReader();
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
doc = reader.read(new StringReader(fundingtree));
f.setShortName(((Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
for (Object o : doc.selectNodes("//funding_level_0")) {
List<Node> node = ((Node) o).selectNodes("./name");
f.setFundingStream((node.get(0)).getText());
}
return f;
} catch (DocumentException | SAXException e) {
throw new IllegalArgumentException(e);
}
}
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* Spark job to trigger the split of results associated to a research community - research initiative/infrastructure. The
* actual split is performed by the class CommunitySplit
*/
public class SparkSplitForCommunity implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSplitForCommunity.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkSplitForCommunity.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String communityMapPath = parser.get("communityMapPath");
CommunitySplit split = new CommunitySplit();
split.run(isSparkSessionManaged, inputPath, outputPath, communityMapPath);
}
}

View File

@ -1,95 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import scala.Tuple2;
public class SparkUpdateProjectInfo implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkUpdateProjectInfo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String preparedInfoPath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", preparedInfoPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
extend(spark, inputPath, outputPath, preparedInfoPath);
});
}
private static void extend(
SparkSession spark,
String inputPath,
String outputPath,
String preparedInfoPath) {
Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
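// left join: results with no associated project are kept unchanged, the others get the prepared projects list attached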
result
.joinWith(
resultProject, result.col("id").equalTo(resultProject.col("resultId")),
"left")
.map((MapFunction<Tuple2<CommunityResult, ResultProject>, CommunityResult>) value -> {
CommunityResult r = value._1();
Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
return r;
}, Encoders.bean(CommunityResult.class))
.map(
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.text(outputPath);
}
}

View File

@ -1,26 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
public class Constants implements Serializable {
public static final String IS_HOSTED_BY = "isHostedBy";
public static final String HOSTS = "hosts";
public static final String IS_FUNDED_BY = "isFundedBy";
public static final String FUNDS = "funds";
public static final String FUNDINGS = "fundings";
public static final String RESULT_ENTITY = "result";
public static final String DATASOURCE_ENTITY = "datasource";
public static final String CONTEXT_ENTITY = "context";
public static final String ORGANIZATION_ENTITY = "organization";
public static final String PROJECT_ENTITY = "project";
public static final String CONTEXT_ID = "00";
public static final String CONTEXT_NS_PREFIX = "context_____";
public static final String UNKNOWN = "UNKNOWN";
}

View File

@ -1,84 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.List;
/**
* Deserialization of the information in the context needed to create Context Entities, and relations between context
* entities and datasources and projects
*/
public class ContextInfo implements Serializable {
private String id;
private String description;
private String type;
private String zenodocommunity;
private String name;
private List<String> projectList;
private List<String> datasourceList;
private List<String> subject;
public List<String> getSubject() {
return subject;
}
public void setSubject(List<String> subject) {
this.subject = subject;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getZenodocommunity() {
return zenodocommunity;
}
public void setZenodocommunity(String zenodocommunity) {
this.zenodocommunity = zenodocommunity;
}
public List<String> getProjectList() {
return projectList;
}
public void setProjectList(List<String> projectList) {
this.projectList = projectList;
}
public List<String> getDatasourceList() {
return datasourceList;
}
public void setDatasourceList(List<String> datasourceList) {
this.datasourceList = datasourceList;
}
}

View File

@ -1,106 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.graph.ResearchInitiative;
/**
* Writes on HDFS Context entities. It queries the community APIs and
* collects the general information for contexts of type community or ri. The general information is the id of the
* context, its label, the subjects associated to the context, its zenodo community, description and type. This
* information is used to create a new Context Entity
*/
public class CreateContextEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CreateContextEntities.class);
private final transient Configuration conf;
private final transient BufferedWriter writer;
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CreateContextEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final CreateContextEntities cce = new CreateContextEntities(hdfsPath, hdfsNameNode);
log.info("Processing contexts...");
cce.execute(Process::getEntity);
cce.close();
}
private void close() throws IOException {
writer.close();
}
public CreateContextEntities(String hdfsPath, String hdfsNameNode) throws IOException {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fsDataOutputStream = fileSystem.append(hdfsWritePath);
} else {
fsDataOutputStream = fileSystem.create(hdfsWritePath);
}
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodecByClassName("org.apache.hadoop.io.compress.GzipCodec");
this.writer = new BufferedWriter(new OutputStreamWriter(codec.createOutputStream(fsDataOutputStream),
StandardCharsets.UTF_8));
}
public <R extends ResearchInitiative> void execute(final Function<ContextInfo, R> producer)
throws IOException {
UtilCommunityAPI queryInformationSystem = new UtilCommunityAPI();
final Consumer<ContextInfo> consumer = ci -> writeEntity(producer.apply(ci));
queryInformationSystem.getContextInformation().forEach(ci -> consumer.accept(ci));
}
protected <R extends ResearchInitiative> void writeEntity(final R r) {
try {
writer.write(Utils.OBJECT_MAPPER.writeValueAsString(r));
writer.newLine();
} catch (final IOException e) {
throw new IllegalArgumentException(e);
}
}
}

View File

@ -1,113 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.MyRuntimeException;
import eu.dnetlib.dhp.oa.graph.dump.subset.MasterDuplicate;
import eu.dnetlib.dhp.oa.model.graph.*;
/**
* Writes the set of new Relations between the context and the datasources. At the moment the relation between the context
* and the project is not created because of a low coverage in the profiles of openaire ids related to projects
*/
public class CreateContextRelation implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CreateContextRelation.class);
private final transient Configuration conf;
private final transient BufferedWriter writer;
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
Objects
.requireNonNull(
CreateContextRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("hdfsNameNode: {}", hdfsNameNode);
final CreateContextRelation cce = new CreateContextRelation(hdfsPath, hdfsNameNode);
log.info("Creating relation for datasources and projects...");
cce
.execute(
Process::getRelation);
cce.close();
}
private void close() throws IOException {
writer.close();
}
public CreateContextRelation(String hdfsPath, String hdfsNameNode)
throws IOException {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fsDataOutputStream = fileSystem.append(hdfsWritePath);
} else {
fsDataOutputStream = fileSystem.create(hdfsWritePath);
}
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
}
public void execute(final Function<ContextInfo, List<Relation>> producer) throws IOException {
final Consumer<ContextInfo> consumer = ci -> producer.apply(ci).forEach(this::writeEntity);
UtilCommunityAPI queryCommunityAPI = new UtilCommunityAPI();
queryCommunityAPI.getContextRelation().forEach(ci -> consumer.accept(ci));
}
protected void writeEntity(final Relation r) {
try {
writer.write(Utils.OBJECT_MAPPER.writeValueAsString(r));
writer.newLine();
} catch (final Exception e) {
throw new MyRuntimeException(e);
}
}
}

View File

@ -1,203 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.graph.RelType;
import eu.dnetlib.dhp.oa.model.graph.Relation;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Creates new Relations (as in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation) from the information in the Entity. The
* new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
* to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
* -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result. Note for
* context: it gets the first provenance in the dataInfo. If more than one is present the others are not dumped
*/
public class Extractor implements Serializable {
public void run(Boolean isSparkSessionManaged,
String inputPath,
String outputPath,
Class<? extends Result> inputClazz,
String communityMapPath) {
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
extractRelationResult(
spark, inputPath, outputPath, inputClazz, Utils.getCommunityMap(spark, communityMapPath));
});
}
private <R extends Result> void extractRelationResult(SparkSession spark,
String inputPath,
String outputPath,
Class<R> inputClazz,
CommunityMap communityMap) {
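// hash codes of the relations already emitted, used to avoid dumping the same relation twice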
Set<Integer> hashCodes = new HashSet<>();
Utils
.readPath(spark, inputPath, inputClazz)
.flatMap((FlatMapFunction<R, Relation>) value -> {
List<Relation> relationList = new ArrayList<>();
extractRelationsFromInstance(hashCodes, value, relationList);
Set<String> communities = communityMap.keySet();
Optional
.ofNullable(value.getContext())
.ifPresent(contexts -> contexts.forEach(context -> {
String id = context.getId();
if (id.contains(":")) {
id = id.substring(0, id.indexOf(":"));
}
if (communities.contains(id)) {
String contextId = Utils.getContextId(id);
Provenance provenance = Optional
.ofNullable(context.getDataInfo())
.map(
dinfo -> Optional
.ofNullable(dinfo.get(0).getProvenanceaction())
.map(
paction -> Provenance
.newInstance(
paction.getClassid(),
dinfo.get(0).getTrust()))
.orElse(null))
.orElse(null);
Relation r = getRelation(
getEntityId(value.getId(), ENTITY_ID_SEPARATOR), contextId,
Constants.RESULT_ENTITY,
Constants.CONTEXT_ENTITY,
ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP, provenance);
if (!hashCodes.contains(r.hashCode())) {
relationList
.add(r);
hashCodes.add(r.hashCode());
}
r = getRelation(
contextId, getEntityId(value.getId(), ENTITY_ID_SEPARATOR),
Constants.CONTEXT_ENTITY,
Constants.RESULT_ENTITY,
ModelConstants.IS_RELATED_TO,
ModelConstants.RELATIONSHIP, provenance);
if (!hashCodes.contains(r.hashCode())) {
relationList
.add(
r);
hashCodes.add(r.hashCode());
}
}
}));
return relationList.iterator();
}, Encoders.bean(Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.json(outputPath);
}
private <R extends Result> void extractRelationsFromInstance(Set<Integer> hashCodes, R value,
List<Relation> relationList) {
Optional
.ofNullable(value.getInstance())
.ifPresent(inst -> inst.forEach(instance -> {
Optional
.ofNullable(instance.getCollectedfrom())
.ifPresent(
cf -> getRelationPair(
value, relationList, cf,
ModelConstants.IS_PROVIDED_BY, ModelConstants.PROVIDES, hashCodes));
Optional
.ofNullable(instance.getHostedby())
.ifPresent(
hb -> getRelationPair(
value, relationList, hb,
Constants.IS_HOSTED_BY, Constants.HOSTS, hashCodes));
}));
}
private static <R extends Result> void getRelationPair(R value, List<Relation> relationList, KeyValue cf,
String resultDatasource, String datasourceResult,
Set<Integer> hashCodes) {
Provenance provenance = Optional
.ofNullable(cf.getDataInfo())
.map(
dinfo -> Optional
.ofNullable(dinfo.getProvenanceaction())
.map(
paction -> Provenance
.newInstance(
paction.getClassname(),
dinfo.getTrust()))
.orElse(
Provenance
.newInstance(
eu.dnetlib.dhp.oa.graph.dump.Constants.HARVESTED,
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)))
.orElse(
Provenance
.newInstance(
eu.dnetlib.dhp.oa.graph.dump.Constants.HARVESTED,
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST));
Relation r = getRelation(
getEntityId(value.getId(), ENTITY_ID_SEPARATOR),
getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), Constants.RESULT_ENTITY, Constants.DATASOURCE_ENTITY,
resultDatasource, ModelConstants.PROVISION,
provenance);
if (!hashCodes.contains(r.hashCode())) {
relationList
.add(r);
hashCodes.add(r.hashCode());
}
r = getRelation(
getEntityId(cf.getKey(), ENTITY_ID_SEPARATOR), getEntityId(value.getId(), ENTITY_ID_SEPARATOR),
Constants.DATASOURCE_ENTITY, Constants.RESULT_ENTITY,
datasourceResult, ModelConstants.PROVISION,
provenance);
if (!hashCodes.contains(r.hashCode())) {
relationList
.add(r);
hashCodes.add(r.hashCode());
}
}
private static Relation getRelation(String source, String target, String sourceType, String targetType,
String relName, String relType, Provenance provenance) {
Relation r = new Relation();
r.setSource(source);
r.setSourceType(sourceType);
r.setTarget(target);
r.setTargetType(targetType);
r.setReltype(RelType.newInstance(relName, relType));
r.setProvenance(provenance);
return r;
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
public class MergedRels implements Serializable {
private String organizationId;
private String representativeId;
public String getOrganizationId() {
return organizationId;
}
public void setOrganizationId(String organizationId) {
this.organizationId = organizationId;
}
public String getRepresentativeId() {
return representativeId;
}
public void setRepresentativeId(String representativeId) {
this.representativeId = representativeId;
}
}

View File

@ -1,21 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
public class OrganizationMap extends HashMap<String, List<String>> {
public OrganizationMap() {
super();
}
public List<String> get(String key) {
if (super.get(key) == null) {
return new ArrayList<>();
}
return super.get(key);
}
}

View File

@ -1,99 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.MyRuntimeException;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.graph.*;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
/**
* It processes the ContextInfo information to produce a new Context Entity or a set of Relations between the generic
* context entity and datasource/projects related to the context.
*/
public class Process implements Serializable {
@SuppressWarnings("unchecked")
public static <R extends ResearchInitiative> R getEntity(ContextInfo ci) {
try {
ResearchInitiative ri;
if (ci.getType().equals("community")) {
ri = new ResearchCommunity();
((ResearchCommunity) ri).setSubject(ci.getSubject());
ri.setType(Constants.RESEARCH_COMMUNITY);
} else {
ri = new ResearchInitiative();
ri.setType(Constants.RESEARCH_INFRASTRUCTURE);
}
ri.setId(Utils.getContextId(ci.getId()));
ri.setAcronym(ci.getId());
ri.setDescription(ci.getDescription());
ri.setName(ci.getName());
if (StringUtils.isNotEmpty(ci.getZenodocommunity())) {
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
}
return (R) ri;
} catch (final Exception e) {
throw new MyRuntimeException(e);
}
}
public static List<Relation> getRelation(ContextInfo ci) {
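// for each datasource and project associated to the context two relations are created (context -> entity and entity -> context), both with isRelatedTo semantics and user claim provenance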
try {
List<Relation> relationList = new ArrayList<>();
ci
.getDatasourceList()
.forEach(ds -> relationList.addAll(addRelations(ci, ds, ModelSupport.idPrefixEntity.get("10"))));
ci
.getProjectList()
.forEach(p -> relationList.addAll(addRelations(ci, p, ModelSupport.idPrefixEntity.get("40"))));
return relationList;
} catch (final Exception e) {
throw new MyRuntimeException(e);
}
}
private static List<Relation> addRelations(ContextInfo ci, String ds, String nodeType) {
List<Relation> relationList = new ArrayList<>();
String contextId = Utils.getContextId(ci.getId());
relationList
.add(
Relation
.newInstance(
contextId, eu.dnetlib.dhp.oa.model.graph.Constants.CONTEXT_ENTITY,
ds, nodeType,
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
Provenance
.newInstance(
Constants.USER_CLAIM,
Constants.DEFAULT_TRUST)));
relationList
.add(
Relation
.newInstance(
ds, nodeType,
contextId, eu.dnetlib.dhp.oa.model.graph.Constants.CONTEXT_ENTITY,
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
Provenance
.newInstance(
Constants.USER_CLAIM,
Constants.DEFAULT_TRUST)));
return relationList;
}
}
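
A minimal sketch of how the two methods above could be driven, assuming ContextInfo exposes standard bean setters for the fields read here (ids and labels are placeholders, not actual community data):

ContextInfo ci = new ContextInfo();
ci.setId("dh-ch");                                  // placeholder context acronym
ci.setType("community");
ci.setName("A sample research community");
ci.setDescription("Placeholder description");
ci.setDatasourceList(Arrays.asList("datasource_id_1"));
ci.setProjectList(Arrays.asList("project_id_1"));

ResearchCommunity community = Process.getEntity(ci);       // typed as community because type == "community"
List<Relation> contextRelations = Process.getRelation(ci);  // context <-> datasource/project, both directions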

View File

@ -1,252 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.oa.model.graph.Relation;
import it.unimi.dsi.fastutil.objects.Object2BooleanMap;
import scala.Tuple2;
/**
* Reads all the entities of the same type (Relation / Results) and saves them in the same folder
*/
public class SparkCollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkCollectAndSave.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkCollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_collect_and_save.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final Boolean aggregateResult = Optional
.ofNullable(parser.get("resultAggregation"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "/result");
run(spark, inputPath, outputPath, aggregateResult);
});
}
private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
if (aggregate) {
Utils
.readPath(spark, inputPath + "/result/publication", GraphResult.class)
.union(Utils.readPath(spark, inputPath + "/result/dataset", GraphResult.class))
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class))
.union(Utils.readPath(spark, inputPath + "/result/software", GraphResult.class))
.map(
(MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(outputPath + "/result");
} else {
write(
Utils
.readPath(spark, inputPath + "/result/publication", GraphResult.class),
outputPath + "/publication");
write(
Utils
.readPath(spark, inputPath + "/result/dataset", GraphResult.class),
outputPath + "/dataset");
write(
Utils
.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class),
outputPath + "/otherresearchproduct");
write(
Utils
.readPath(spark, inputPath + "/result/software", GraphResult.class),
outputPath + "/software");
}
// Dataset<String> dumpedIds = Utils.getEntitiesId(spark, outputPath);
// Dataset<Relation> relations = Utils
// .readPath(spark, inputPath + "/relation/publication", Relation.class)
// .union(Utils.readPath(spark, inputPath + "/relation/dataset", Relation.class))
// .union(Utils.readPath(spark, inputPath + "/relation/orp", Relation.class))
// .union(Utils.readPath(spark, inputPath + "/relation/software", Relation.class))
// .union(Utils.readPath(spark, inputPath + "/relation/contextOrg", Relation.class))
// .union(Utils.readPath(spark, inputPath + "/relation/context", Relation.class))
// .union(Utils.readPath(spark, inputPath + "/relation/relation", Relation.class));
// Utils.getValidRelations(relations, Utils.getEntitiesId(spark, outputPath))
Utils
.readPath(spark, inputPath + "/relation/publication", Relation.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/relation");
Utils
.readPath(spark, inputPath + "/relation/dataset", Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath + "/relation");
Utils
.readPath(spark, inputPath + "/relation/orp", Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath + "/relation");
Utils
.readPath(spark, inputPath + "/relation/software", Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath + "/relation");
Utils
.readPath(spark, inputPath + "/relation/contextOrg", Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath + "/relation");
Utils
.readPath(spark, inputPath + "/relation/context", Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath + "/relation");
Utils
.readPath(spark, inputPath + "/relation/relation", Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath + "/relation");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/publication", Relation.class),
// inputPath + "/relSource/publication");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/dataset", Relation.class),
// inputPath + "/relSource/dataset");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/orp", Relation.class),
// inputPath + "/relSource/orp");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/software", Relation.class),
// inputPath + "/relSource/software");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/contextOrg", Relation.class),
// inputPath + "/relSource/contextOrg");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/context", Relation.class),
// inputPath + "/relSource/context");
// relSource(
// inputPath, dumpedIds, Utils
// .readPath(spark, inputPath + "/relation/relation", Relation.class),
// inputPath + "/relSource/relation");
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/publication", Relation.class),
// SaveMode.Overwrite);
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/dataset", Relation.class),
// SaveMode.Append);
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/orp", Relation.class),
// SaveMode.Append);
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/software", Relation.class),
// SaveMode.Append);
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/contextOrg", Relation.class),
// SaveMode.Append);
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/context", Relation.class),
// SaveMode.Append);
// relTarget(
// outputPath, dumpedIds, Utils.readPath(spark, inputPath + "/relSource/relation", Relation.class),
// SaveMode.Append);
}
private static void relTarget(String outputPath, Dataset<String> dumpedIds, Dataset<Relation> relJoinSource,
SaveMode saveMode) {
relJoinSource
.joinWith(dumpedIds, relJoinSource.col("target").equalTo(dumpedIds.col("value")))
.map(
(MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
Encoders.bean(Relation.class))
.write()
.mode(saveMode)
.option("compression", "gzip")
.json(outputPath + "/relation");
}
private static void relSource(String inputPath, Dataset<String> dumpedIds, Dataset<Relation> relations,
String outputPath) {
relations
.joinWith(dumpedIds, relations.col("source").equalTo(dumpedIds.col("value")))
.map(
(MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static void write(Dataset<GraphResult> dataSet, String outputPath) {
dataSet
.map((MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(outputPath);
}
}
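
One detail worth noting in the relation section above is the SaveMode sequence: the first write uses Overwrite to clear the target folder, every following write uses Append, so all relation types end up accumulated under outputPath + "/relation". A condensed sketch of the same pattern (paths as in the job, not new behaviour):

String[] relFolders = {"publication", "dataset", "orp", "software", "contextOrg", "context", "relation"};
SaveMode mode = SaveMode.Overwrite;                  // first write clears the folder
for (String folder : relFolders) {
    Utils
        .readPath(spark, inputPath + "/relation/" + folder, Relation.class)
        .write()
        .mode(mode)
        .option("compression", "gzip")
        .json(outputPath + "/relation");
    mode = SaveMode.Append;                          // subsequent writes accumulate
}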

View File

@ -1,639 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.Container;
import eu.dnetlib.dhp.oa.model.Result;
import eu.dnetlib.dhp.oa.model.graph.*;
import eu.dnetlib.dhp.oa.model.graph.Datasource;
import eu.dnetlib.dhp.oa.model.graph.Organization;
import eu.dnetlib.dhp.oa.model.graph.Project;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
/**
 * Spark job that fires the dump for the entities
*/
public class SparkDumpEntitiesJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpEntitiesJob.class);
public static final String COMPRESSION = "compression";
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpEntitiesJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
Optional<String> communityMap = Optional.ofNullable(parser.get("communityMapPath"));
String communityMapPath = null;
if (communityMap.isPresent())
communityMapPath = communityMap.get();
Class<? extends OafEntity> inputClazz = (Class<? extends OafEntity>) Class.forName(resultClassName);
run(isSparkSessionManaged, inputPath, outputPath, communityMapPath, inputClazz);
}
private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath,
Class<? extends OafEntity> inputClazz) {
SparkConf conf = new SparkConf();
switch (ModelSupport.idPrefixMap.get(inputClazz)) {
case "50":
String finalCommunityMapPath = communityMapPath;
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
resultDump(
spark, inputPath, outputPath, finalCommunityMapPath, inputClazz);
});
break;
case "40":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
projectMap(spark, inputPath, outputPath, inputClazz);
});
break;
case "20":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath, inputClazz);
});
break;
case "10":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
datasourceMap(spark, inputPath, outputPath, inputClazz);
});
break;
}
}
public static <I extends OafEntity> void resultDump(
SparkSession spark,
String inputPath,
String outputPath,
String communityMapPath,
Class<I> inputClazz) {
CommunityMap communityMap = null;
if (!StringUtils.isEmpty(communityMapPath))
communityMap = Utils.getCommunityMap(spark, communityMapPath);
CommunityMap finalCommunityMap = communityMap;
Utils
.readPath(spark, inputPath, inputClazz)
.map(
(MapFunction<I, GraphResult>) value -> execMap(value, finalCommunityMap),
Encoders.bean(GraphResult.class))
.filter((FilterFunction<GraphResult>) value -> value != null)
.map((MapFunction<GraphResult, String>) r -> new ObjectMapper().writeValueAsString(r), Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.text(outputPath);
}
private static <I extends OafEntity, O extends Result> O execMap(I value,
CommunityMap communityMap) throws NoAvailableEntityTypeException, CardinalityTooHighException {
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
if (Boolean.FALSE.equals(odInfo.isPresent())) {
return null;
}
if (Boolean.TRUE.equals(odInfo.get().getDeletedbyinference())
|| Boolean.TRUE.equals(odInfo.get().getInvisible())) {
return null;
}
return (O) ResultMapper.map(value, communityMap, Constants.DUMPTYPE.COMPLETE.getType());
}
private static <E extends OafEntity> void datasourceMap(SparkSession spark, String inputPath, String outputPath,
Class<E> inputClazz) {
Utils
.readPath(spark, inputPath, inputClazz)
.map(
(MapFunction<E, Datasource>) d -> mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) d),
Encoders.bean(Datasource.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.json(outputPath);
}
private static <E extends OafEntity> void projectMap(SparkSession spark, String inputPath, String outputPath,
Class<E> inputClazz) {
Utils
.readPath(spark, inputPath, inputClazz)
.map(
(MapFunction<E, Project>) p -> mapProject((eu.dnetlib.dhp.schema.oaf.Project) p),
Encoders.bean(Project.class))
.filter((FilterFunction<Project>) p -> p != null)
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.json(outputPath);
}
private static Datasource mapDatasource(eu.dnetlib.dhp.schema.oaf.Datasource d) {
if (Boolean.TRUE.equals(d.getDataInfo().getDeletedbyinference()))
return null;
Datasource datasource = new Datasource();
datasource.setId(getEntityId(d.getId(), ENTITY_ID_SEPARATOR));
Optional
.ofNullable(d.getOriginalId())
.ifPresent(
oId -> datasource.setOriginalId(oId.stream().filter(Objects::nonNull).collect(Collectors.toList())));
Optional
.ofNullable(d.getPid())
.ifPresent(
pids -> datasource
.setPid(
pids
.stream()
.map(p -> DatasourcePid.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
Optional
.ofNullable(d.getDatasourcetype())
.ifPresent(
dsType -> datasource
.setDatasourcetype(DatasourceSchemeValue.newInstance(dsType.getClassid(), dsType.getClassname())));
Optional
.ofNullable(d.getOpenairecompatibility())
.ifPresent(v -> datasource.setOpenairecompatibility(v.getClassname()));
Optional
.ofNullable(d.getOfficialname())
.ifPresent(oname -> datasource.setOfficialname(oname.getValue()));
Optional
.ofNullable(d.getEnglishname())
.ifPresent(ename -> datasource.setEnglishname(ename.getValue()));
Optional
.ofNullable(d.getWebsiteurl())
.ifPresent(wsite -> datasource.setWebsiteurl(wsite.getValue()));
Optional
.ofNullable(d.getLogourl())
.ifPresent(lurl -> datasource.setLogourl(lurl.getValue()));
Optional
.ofNullable(d.getDateofvalidation())
.ifPresent(dval -> datasource.setDateofvalidation(dval.getValue()));
Optional
.ofNullable(d.getDescription())
.ifPresent(dex -> datasource.setDescription(dex.getValue()));
Optional
.ofNullable(d.getSubjects())
.ifPresent(
sbjs -> datasource.setSubjects(sbjs.stream().map(sbj -> sbj.getValue()).collect(Collectors.toList())));
Optional
.ofNullable(d.getOdpolicies())
.ifPresent(odp -> datasource.setPolicies(Arrays.asList(odp.getValue())));
Optional
.ofNullable(d.getOdlanguages())
.ifPresent(
langs -> datasource
.setLanguages(langs.stream().map(lang -> lang.getValue()).collect(Collectors.toList())));
Optional
.ofNullable(d.getOdcontenttypes())
.ifPresent(
ctypes -> datasource
.setContenttypes(ctypes.stream().map(ctype -> ctype.getValue()).collect(Collectors.toList())));
Optional
.ofNullable(d.getReleasestartdate())
.ifPresent(rd -> datasource.setReleasestartdate(rd.getValue()));
Optional
.ofNullable(d.getReleaseenddate())
.ifPresent(ed -> datasource.setReleaseenddate(ed.getValue()));
Optional
.ofNullable(d.getMissionstatementurl())
.ifPresent(ms -> datasource.setMissionstatementurl(ms.getValue()));
Optional
.ofNullable(d.getDatabaseaccesstype())
.ifPresent(ar -> datasource.setAccessrights(ar.getValue()));
Optional
.ofNullable(d.getDatauploadtype())
.ifPresent(dut -> datasource.setUploadrights(dut.getValue()));
Optional
.ofNullable(d.getDatabaseaccessrestriction())
.ifPresent(dar -> datasource.setDatabaseaccessrestriction(dar.getValue()));
Optional
.ofNullable(d.getDatauploadrestriction())
.ifPresent(dur -> datasource.setDatauploadrestriction(dur.getValue()));
Optional
.ofNullable(d.getVersioning())
.ifPresent(v -> datasource.setVersioning(v.getValue()));
Optional
.ofNullable(d.getCitationguidelineurl())
.ifPresent(cu -> datasource.setCitationguidelineurl(cu.getValue()));
Optional
.ofNullable(d.getPidsystems())
.ifPresent(ps -> datasource.setPidsystems(ps.getValue()));
Optional
.ofNullable(d.getCertificates())
.ifPresent(c -> datasource.setCertificates(c.getValue()));
Optional
.ofNullable(d.getPolicies())
.ifPresent(ps -> datasource.setPolicies(ps.stream().map(p -> p.getValue()).collect(Collectors.toList())));
Optional
.ofNullable(d.getJournal())
.ifPresent(j -> datasource.setJournal(getContainer(j)));
// Optional
// .ofNullable(d.getMeasures())
// .ifPresent(m -> datasource.setIndicators(Utils.getIndicator(d.getMeasures())));
return datasource;
}
private static Container getContainer(Journal j) {
Container c = new Container();
Optional
.ofNullable(j.getName())
.ifPresent(n -> c.setName(n));
Optional
.ofNullable(j.getIssnPrinted())
.ifPresent(issnp -> c.setIssnPrinted(issnp));
Optional
.ofNullable(j.getIssnOnline())
.ifPresent(issno -> c.setIssnOnline(issno));
Optional
.ofNullable(j.getIssnLinking())
.ifPresent(isnl -> c.setIssnLinking(isnl));
Optional
.ofNullable(j.getEp())
.ifPresent(ep -> c.setEp(ep));
Optional
.ofNullable(j.getIss())
.ifPresent(iss -> c.setIss(iss));
Optional
.ofNullable(j.getSp())
.ifPresent(sp -> c.setSp(sp));
Optional
.ofNullable(j.getVol())
.ifPresent(vol -> c.setVol(vol));
Optional
.ofNullable(j.getEdition())
.ifPresent(edition -> c.setEdition(edition));
Optional
.ofNullable(j.getConferencedate())
.ifPresent(cdate -> c.setConferencedate(cdate));
Optional
.ofNullable(j.getConferenceplace())
.ifPresent(cplace -> c.setConferenceplace(cplace));
return c;
}
private static Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p) throws DocumentException {
if (Boolean.TRUE.equals(p.getDataInfo().getDeletedbyinference()))
return null;
Project project = new Project();
Optional
.ofNullable(p.getId())
.ifPresent(id -> project.setId(getEntityId(id, ENTITY_ID_SEPARATOR)));
Optional
.ofNullable(p.getWebsiteurl())
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
Optional
.ofNullable(p.getCode())
.ifPresent(code -> project.setCode(code.getValue()));
Optional
.ofNullable(p.getAcronym())
.ifPresent(acronynim -> project.setAcronym(acronynim.getValue()));
Optional
.ofNullable(p.getTitle())
.ifPresent(title -> project.setTitle(title.getValue()));
Optional
.ofNullable(p.getStartdate())
.ifPresent(sdate -> project.setStartdate(sdate.getValue()));
Optional
.ofNullable(p.getEnddate())
.ifPresent(edate -> project.setEnddate(edate.getValue()));
Optional
.ofNullable(p.getCallidentifier())
.ifPresent(cide -> project.setCallidentifier(cide.getValue()));
Optional
.ofNullable(p.getKeywords())
.ifPresent(key -> project.setKeywords(key.getValue()));
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
boolean mandate = false;
if (omandate.isPresent()) {
if (omandate.get().getValue().equals("true")) {
mandate = true;
}
}
if (oecsc39.isPresent()) {
if (oecsc39.get().getValue().equals("true")) {
mandate = true;
}
}
project.setOpenaccessmandateforpublications(mandate);
project.setOpenaccessmandatefordataset(false);
Optional
.ofNullable(p.getEcarticle29_3())
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true")));
project
.setSubject(
Optional
.ofNullable(p.getSubjects())
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
Optional
.ofNullable(p.getSummary())
.ifPresent(summary -> project.setSummary(summary.getValue()));
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
Optional<Float> ototalcost = Optional.ofNullable(p.getTotalcost());
if (ocurrency.isPresent()) {
if (ofundedamount.isPresent()) {
if (ototalcost.isPresent()) {
project
.setGranted(
Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
} else {
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
}
}
}
project
.setH2020programme(
Optional
.ofNullable(p.getH2020classification())
.map(
classification -> classification
.stream()
.map(
c -> Programme
.newInstance(
c.getH2020Programme().getCode(), c.getH2020Programme().getDescription()))
.collect(Collectors.toList()))
.orElse(new ArrayList<>()));
Optional<List<Field<String>>> ofundTree = Optional
.ofNullable(p.getFundingtree());
List<Funder> funList = new ArrayList<>();
if (ofundTree.isPresent()) {
for (Field<String> fundingtree : ofundTree.get()) {
funList.add(getFunder(fundingtree.getValue()));
}
}
project.setFunding(funList);
// if (Optional.ofNullable(p.getMeasures()).isPresent()) {
// project.setIndicators(Utils.getIndicator(p.getMeasures()));
// }
return project;
}
public static Funder getFunder(String fundingtree) throws DocumentException {
Funder f = new Funder();
final Document doc;
doc = new SAXReader().read(new StringReader(fundingtree));
f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
String id = "";
StringBuilder bld = new StringBuilder();
int level = 0;
List<org.dom4j.Node> nodes = doc.selectNodes("//funding_level_" + level);
while (!nodes.isEmpty()) {
for (org.dom4j.Node n : nodes) {
List node = n.selectNodes("./id");
id = ((org.dom4j.Node) node.get(0)).getText();
id = id.substring(id.indexOf("::") + 2);
node = n.selectNodes("./description");
bld.append(((Node) node.get(0)).getText() + " - ");
}
level += 1;
nodes = doc.selectNodes("//funding_level_" + level);
}
String description = bld.toString();
if (!id.equals("")) {
Fundings fundings = new Fundings();
fundings.setId(id);
fundings.setDescription(description.substring(0, description.length() - 3).trim());
f.setFunding_stream(fundings);
}
return f;
}
private static <E extends OafEntity> void organizationMap(SparkSession spark, String inputPath, String outputPath,
Class<E> inputClazz) {
Utils
.readPath(spark, inputPath, inputClazz)
.map(
(MapFunction<E, Organization>) o -> mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) o),
Encoders.bean(Organization.class))
.filter((FilterFunction<Organization>) o -> o != null)
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.json(outputPath);
}
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
eu.dnetlib.dhp.schema.oaf.Organization org) {
if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference()))
return null;
if (!Optional.ofNullable(org.getLegalname()).isPresent()
&& !Optional.ofNullable(org.getLegalshortname()).isPresent())
return null;
Organization organization = new Organization();
Optional
.ofNullable(org.getLegalshortname())
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
Optional
.ofNullable(org.getLegalname())
.ifPresent(value -> organization.setLegalname(value.getValue()));
Optional
.ofNullable(org.getWebsiteurl())
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
Optional
.ofNullable(org.getAlternativeNames())
.ifPresent(
value -> organization
.setAlternativenames(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(org.getCountry())
.ifPresent(
value -> {
if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
organization
.setCountry(
eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
}
});
Optional
.ofNullable(org.getId())
.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
Optional
.ofNullable(org.getPid())
.ifPresent(
value -> organization
.setPid(
value
.stream()
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
return organization;
}
}
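
For readers unfamiliar with the id prefixes the switch above dispatches on, the mapping inferred from the cases is: "50" results, "40" projects, "20" organizations, "10" datasources (the authoritative values live in ModelSupport in the schema module). A tiny illustration:

// Dispatch key for a datasource dump run; the switch then routes to datasourceMap(...)
String prefix = ModelSupport.idPrefixMap.get(eu.dnetlib.dhp.schema.oaf.Datasource.class); // expected "10"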

View File

@ -1,132 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.graph.RelType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Dumps eu.dnetlib.dhp.schema.oaf.Relation into eu.dnetlib.dhp.oa.model.graph.Relation
*/
public class SparkDumpRelationJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpRelationJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpRelationJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
Optional<String> rs = Optional.ofNullable(parser.get("removeSet"));
final Set<String> removeSet = new HashSet<>();
if (rs.isPresent()) {
Collections.addAll(removeSet, rs.get().split(";"));
}
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
dumpRelation(spark, inputPath, outputPath, removeSet);
});
}
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath, Set<String> removeSet) {
Dataset<Relation> relations = Utils.readPath(spark, inputPath, Relation.class);
relations
.filter(
(FilterFunction<Relation>) r -> !removeSet.contains(r.getRelClass())
&& !r.getSubRelType().equalsIgnoreCase("resultService"))
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
relNew
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
relNew
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
relNew
.setReltype(
RelType
.newInstance(
relation.getRelClass(),
relation.getSubRelType()));
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
if (odInfo.isPresent()) {
DataInfo dInfo = odInfo.get();
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
relNew
.setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
if (Boolean.TRUE.equals(relation.getValidated())) {
relNew.setValidated(relation.getValidated());
relNew.setValidationDate(relation.getValidationDate());
}
return relNew;
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.json(outputPath);
}
}
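
A small illustration of the id handling in the mapping above; the internal id below is a placeholder in the usual prefix|namespace::hash form, and getEntityId / ModelSupport are used exactly as in the job:

String source = "50|doi_________::0123456789abcdef";                          // placeholder internal id
String sourceType = ModelSupport.idPrefixEntity.get(source.substring(0, 2));  // entity label for prefix "50" (results)
String dumpedSource = getEntityId(source, ENTITY_ID_SEPARATOR);               // prefix-free id used in the dumped relation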

View File

@ -1,54 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Spark job that fires the extraction of relations from entities
*/
public class SparkExtractRelationFromEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkExtractRelationFromEntities.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkExtractRelationFromEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String communityMapPath = parser.get("communityMapPath");
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
Extractor extractor = new Extractor();
extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath);
}
}
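
This job is a thin wrapper: it only parses the arguments and delegates to Extractor (whose relation-building tail appears at the top of this diff). A hypothetical programmatic call, with placeholder paths, would be:

Extractor extractor = new Extractor();
extractor
    .run(
        Boolean.FALSE, "/tmp/graph/publication", "/tmp/dump/relation/publication",
        eu.dnetlib.dhp.schema.oaf.Publication.class, "/tmp/communityMapPath");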

View File

@ -1,188 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import static eu.dnetlib.dhp.schema.common.ModelSupport.idPrefixMap;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Consumer;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.communityapi.model.CommunityEntityMap;
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.graph.RelType;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Creates new Relations between Context entities and Organizations whose products are associated to the context. It
 * produces relations such as: organization <-> isRelatedTo <-> context
*/
public class SparkOrganizationRelation implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkOrganizationRelation.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkOrganizationRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_organization_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
UtilCommunityAPI queryCommunityAPI = new UtilCommunityAPI();
final CommunityEntityMap organizationMap = queryCommunityAPI.getCommunityOrganization();
final String serializedOrganizationMap = new Gson().toJson(organizationMap);
log.info("organization map : {}", serializedOrganizationMap);
final String communityMapPath = parser.get("communityMapPath");
log.info("communityMapPath: {}", communityMapPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
extractRelation(spark, inputPath, organizationMap, outputPath, communityMapPath);
});
}
private static void extractRelation(SparkSession spark, String inputPath, CommunityEntityMap organizationMap,
String outputPath, String communityMapPath) {
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
relationDataset.createOrReplaceTempView("relation");
List<eu.dnetlib.dhp.oa.model.graph.Relation> relList = new ArrayList<>();
Dataset<MergedRels> mergedRelsDataset = spark
.sql(
"SELECT target organizationId, source representativeId " +
"FROM relation " +
"WHERE datainfo.deletedbyinference = false " +
"AND relclass = 'merges' " +
"AND substr(source, 1, 2) = '20'")
.as(Encoders.bean(MergedRels.class));
mergedRelsDataset.map((MapFunction<MergedRels, MergedRels>) mergedRels -> {
if (organizationMap.containsKey(getEntityId(mergedRels.getOrganizationId(), ENTITY_ID_SEPARATOR))) {
return mergedRels;
}
return null;
}, Encoders.bean(MergedRels.class))
.filter(Objects::nonNull)
.collectAsList()
.forEach(getMergedRelsConsumer(organizationMap, relList, communityMap));
organizationMap
.keySet()
.forEach(
oId -> organizationMap
.get(oId)
.forEach(community -> {
if (communityMap.containsKey(community)) {
addRelations(relList, community, oId);
}
}));
spark
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
@NotNull
private static Consumer<MergedRels> getMergedRelsConsumer(CommunityEntityMap organizationMap,
List<eu.dnetlib.dhp.oa.model.graph.Relation> relList, CommunityMap communityMap) {
return mergedRels -> {
String oId = getEntityId(mergedRels.getOrganizationId(), ENTITY_ID_SEPARATOR);
organizationMap
.get(oId)
.forEach(community -> {
if (communityMap.containsKey(community)) {
addRelations(
relList, community, getEntityId(mergedRels.getRepresentativeId(), ENTITY_ID_SEPARATOR));
}
});
organizationMap.remove(oId);
};
}
private static void addRelations(List<eu.dnetlib.dhp.oa.model.graph.Relation> relList, String community,
String organization) {
String id = Utils.getContextId(community);
log.info("create relation for organization: {}", organization);
relList
.add(
eu.dnetlib.dhp.oa.model.graph.Relation
.newInstance(
id, Constants.CONTEXT_ENTITY,
organization,
ModelSupport.idPrefixEntity.get(idPrefixMap.get(Organization.class)),
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
Provenance
.newInstance(
eu.dnetlib.dhp.oa.graph.dump.Constants.USER_CLAIM,
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)));
relList
.add(
eu.dnetlib.dhp.oa.model.graph.Relation
.newInstance(
organization, ModelSupport.idPrefixEntity.get(idPrefixMap.get(Organization.class)),
id, Constants.CONTEXT_ENTITY,
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
Provenance
.newInstance(
eu.dnetlib.dhp.oa.graph.dump.Constants.USER_CLAIM,
eu.dnetlib.dhp.oa.graph.dump.Constants.DEFAULT_TRUST)));
}
}
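
To make the output of addRelations above concrete, a hypothetical in-class call with placeholder ids (the community "beopen" and an organization id already stripped of its prefix) would yield two mirrored relations:

List<eu.dnetlib.dhp.oa.model.graph.Relation> rels = new ArrayList<>();
addRelations(rels, "beopen", "openorgs____::0123456789abcdef");
// rels.get(0): context -> organization, RelType(isRelatedTo, relationship), Provenance(USER_CLAIM, DEFAULT_TRUST)
// rels.get(1): organization -> context, same RelType and Provenance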

View File

@ -1,101 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.oaf.*;
/**
 * Selects the valid relations among those present in the graph. A relation is valid if it is not deletedbyinference
 * and if both the source and the target node are present in the graph and are neither deleted by inference nor
 * invisible. To check this, a view of the ids of all the entities in the graph is built, and only the relations for
 * which a join with this view exists for both the source and the target are kept.
*/
public class SparkSelectValidRelationsJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSelectValidRelationsJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkSelectValidRelationsJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
selectValidRelation2(spark, inputPath, outputPath);
});
}
private static void selectValidRelation2(SparkSession spark, String inputPath, String outputPath) {
final StructType structureSchema = new StructType()
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
org.apache.spark.sql.Dataset<Row> df = spark.createDataFrame(new ArrayList<Row>(), structureSchema);
List<String> entities = Arrays
.asList(
"publication", "dataset", "otherresearchproduct", "software", "organization", "project", "datasource");
for (String e : entities)
df = df
.union(
spark
.read()
.schema(structureSchema)
.json(inputPath + "/" + e)
.filter("dataInfo.deletedbyinference != true and dataInfo.invisible != true"));
org.apache.spark.sql.Dataset<Row> relations = spark
.read()
.schema(Encoders.bean(Relation.class).schema())
.json(inputPath + "/relation")
.filter("dataInfo.deletedbyinference == false");
relations
.join(df, relations.col("source").equalTo(df.col("id")), "leftsemi")
.join(df, relations.col("target").equalTo(df.col("id")), "leftsemi")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}
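
The double left-semi join is the core of the job: it keeps only the relation rows whose source and target both appear in the id view, without pulling any column from that view. A toy sketch of the same semantics (hypothetical ids, plain Spark API):

Dataset<Row> ids = spark.createDataset(Arrays.asList("A", "B", "C"), Encoders.STRING()).toDF("id");
Dataset<Row> rels = spark
    .createDataset(
        Arrays.asList(new Tuple2<>("A", "B"), new Tuple2<>("A", "X")),
        Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
    .toDF("source", "target");

rels
    .join(ids, rels.col("source").equalTo(ids.col("id")), "leftsemi")
    .join(ids, rels.col("target").equalTo(ids.col("id")), "leftsemi")
    .show(); // keeps only (A, B); (A, X) is dropped because X is not among the dumped ids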

View File

@ -1,136 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.country;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.subset.MasterDuplicate;
import eu.dnetlib.dhp.oa.graph.dump.subset.SparkDumpResult;
import eu.dnetlib.dhp.oa.graph.dump.subset.criteria.VerbResolver;
import eu.dnetlib.dhp.oa.graph.dump.subset.criteria.VerbResolverFactory;
import eu.dnetlib.dhp.oa.graph.dump.subset.selectionconstraints.Param;
import eu.dnetlib.dhp.oa.graph.dump.subset.selectionconstraints.SelectionConstraints;
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 27/04/23
 * Selects the results having the given country among their countries
*/
public class SparkFindResultWithCountry implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkFindResultWithCountry.class);
public static final String COMPRESSION = "compression";
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkFindResultWithCountry.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/result_country_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String preparedInfoPath = parser.get("resultWithCountry");
Class<? extends eu.dnetlib.dhp.schema.oaf.Result> inputClazz = (Class<? extends eu.dnetlib.dhp.schema.oaf.Result>) Class
.forName(resultClassName);
run(
isSparkSessionManaged, inputPath, outputPath, inputClazz,
resultType, preparedInfoPath);
}
private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath,
Class<? extends eu.dnetlib.dhp.schema.oaf.Result> inputClazz, String resultType, String preparedInfoPath) {
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "/original/" + resultType);
resultDump(
spark, inputPath, outputPath, inputClazz, resultType, preparedInfoPath);
});
}
public static <I extends eu.dnetlib.dhp.schema.oaf.Result> void resultDump(
SparkSession spark,
String inputPath,
String outputPath,
Class<I> inputClazz,
String resultType,
String preparedInfoPath) {
Dataset<String> resultsWithCountry = spark.read().textFile(preparedInfoPath).distinct();
Dataset<I> result = Utils
.readPath(spark, inputPath, inputClazz)
.filter(
(FilterFunction<I>) r -> !r.getDataInfo().getInvisible() && !r.getDataInfo().getDeletedbyinference());
resultsWithCountry
.joinWith(result, resultsWithCountry.col("value").equalTo(result.col("id")))
.map((MapFunction<Tuple2<String, I>, I>) t2 -> t2._2(), Encoders.bean(inputClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/original/" + resultType);
}
}
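
In the two-step country subset, this job consumes the id list produced by SparkFindResultsRelatedToCountry (below in this diff) and joins it back to the full records, once per result type. The essence, with placeholder paths:

Dataset<String> idsWithCountry = spark.read().textFile("/tmp/country/resultIds").distinct();
Dataset<Publication> pubs = Utils
    .readPath(spark, "/tmp/graph/publication", Publication.class)
    .filter(
        (FilterFunction<Publication>) p -> !p.getDataInfo().getInvisible()
            && !p.getDataInfo().getDeletedbyinference());
idsWithCountry
    .joinWith(pubs, idsWithCountry.col("value").equalTo(pubs.col("id")))
    .map((MapFunction<Tuple2<String, Publication>, Publication>) t2 -> t2._2(), Encoders.bean(Publication.class));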

View File

@ -1,173 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.country;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 27/04/23
 * Finds the ids of the results that are in relation with another entity having the given country,
 * or that have that country in their country list
*/
public class SparkFindResultsRelatedToCountry implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkFindResultsRelatedToCountry.class);
public static final String COMPRESSION = "compression";
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkFindResultsRelatedToCountry.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/result_related_country_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String country = parser.get("country");
run(
isSparkSessionManaged, inputPath, outputPath, country);
}
private static void run(Boolean isSparkSessionManaged, String inputPath, String outputPath,
String country) {
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
findRelatedEntities(
spark, inputPath, outputPath, country);
});
}
public static <I extends eu.dnetlib.dhp.schema.oaf.Result> void findRelatedEntities(
SparkSession spark,
String inputPath,
String outputPath,
String country) {
Dataset<Project> projectsInCountry = Utils
.readPath(spark, inputPath + "/project", Project.class)
.filter((FilterFunction<Project>) p -> isCountryInFunderJurisdiction(p.getFundingtree(), country));
Dataset<Relation> relsProjectResults = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.PRODUCES));
projectsInCountry
.joinWith(relsProjectResults, projectsInCountry.col("id").equalTo(relsProjectResults.col("source")))
.map((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._2().getTarget(), Encoders.STRING())
.write()
.option(COMPRESSION, GZIP)
.mode(SaveMode.Overwrite)
.text(outputPath);
Dataset<Organization> organizationsInCountry = Utils
.readPath(spark, inputPath + "/organization", Organization.class)
.filter(
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
&& o.getCountry().getClassid().equals(country));
Dataset<Relation> relsOrganizationResults = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.IS_AUTHOR_INSTITUTION_OF));
organizationsInCountry
.joinWith(
relsOrganizationResults,
organizationsInCountry.col("id").equalTo(relsOrganizationResults.col("source")))
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getTarget(), Encoders.STRING())
.write()
.option(COMPRESSION, GZIP)
.mode(SaveMode.Append)
.text(outputPath);
selectResultWithCountry(spark, inputPath, outputPath, country, "publication", Publication.class);
selectResultWithCountry(
spark, inputPath, outputPath, country, "dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
selectResultWithCountry(spark, inputPath, outputPath, country, "software", Software.class);
selectResultWithCountry(
spark, inputPath, outputPath, country, "otherresearchproduct", OtherResearchProduct.class);
}
private static <R extends Result> void selectResultWithCountry(SparkSession spark, String inputPath,
String outputPath, String country, String type, Class<R> inputClazz) {
Utils
.readPath(spark, inputPath + "/" + type, inputClazz)
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible() &&
p.getCountry() != null &&
p.getCountry().stream().anyMatch(c -> c.getClassid().equals(country)))
.map((MapFunction<R, String>) p -> p.getId(), Encoders.STRING())
.write()
.option(COMPRESSION, GZIP)
.mode(SaveMode.Append)
.text(outputPath);
}
private static boolean isCountryInFunderJurisdiction(List<Field<String>> fundingtrees, String country) {
try {
final SAXReader reader = new SAXReader();
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
for (Field<String> fundingtree : fundingtrees) {
final Document doc = reader.read(new StringReader(fundingtree.getValue()));
if (((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText().equals(country)) {
return true;
}
}
return false;
} catch (DocumentException | SAXException e) {
throw new IllegalArgumentException(e);
}
}
}
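
A toy illustration of the jurisdiction check above; the fundingtree fragment is made up but follows the structure implied by the //funder/jurisdiction XPath, and Field is assumed to expose the usual bean setter:

Field<String> fundingtree = new Field<>();
fundingtree
    .setValue(
        "<fundingtree><funder><shortname>FUNDER</shortname><name>A sample funder</name>"
            + "<jurisdiction>CH</jurisdiction></funder></fundingtree>");
boolean match = isCountryInFunderJurisdiction(Arrays.asList(fundingtree), "CH"); // true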

View File

@ -1,102 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import java.io.Serializable;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* @author miriam.baglioni
* @Date 05/05/23
*/
public class AuthorResult implements Serializable {
private String authorId;
private String firstName;
private String lastName;
private String fullName;
private String orcid;
private String resultId;
private String rank;
private Boolean fromOrcid;
public Boolean getFromOrcid() {
return fromOrcid;
}
public void setFromOrcid(Boolean fromOrcid) {
this.fromOrcid = fromOrcid;
}
public String getFullName() {
return fullName;
}
public void setFullName(String fullName) {
this.fullName = fullName;
}
public String getAuthorId() {
return authorId;
}
public void setAuthorId(String authorId) {
this.authorId = authorId;
}
public String getResultId() {
return resultId;
}
public void setResultId(String resultId) {
this.resultId = resultId;
}
public String getRank() {
return rank;
}
public void setRank(String rank) {
this.rank = rank;
}
public String getId() {
return authorId;
}
public void setId(String id) {
this.authorId = id;
}
public String getFirstName() {
return firstName;
}
public void setFirstName(String firstName) {
this.firstName = firstName;
}
public String getLastName() {
return lastName;
}
public void setLastName(String lastName) {
this.lastName = lastName;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public void autosetId() {
if (orcid != null) {
authorId = DHPUtils.md5(orcid);
} else {
authorId = DHPUtils.md5(resultId + rank);
}
}
}
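
A short illustration of autosetId above, with placeholder values: when an ORCID is present it drives the identifier, otherwise the result id plus the author rank does.

AuthorResult ar = new AuthorResult();
ar.setResultId("50|doi_________::0123456789abcdef");
ar.setRank("1");
ar.setOrcid("0000-0002-1825-0097");
ar.autosetId();   // authorId = DHPUtils.md5(orcid)

ar.setOrcid(null);
ar.autosetId();   // authorId = DHPUtils.md5(resultId + rank)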

View File

@ -1,20 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import java.io.Serializable;
import org.apache.commons.lang.StringUtils;
/**
* @author miriam.baglioni
* @Date 10/05/23
*/
public class Constants implements Serializable {
public final static String SEP = "\t";
public static final String addQuotes(String id) {
// if (StringUtils.isNotEmpty(id))
// return "\"" + id + "\"";
return id;
}
}

View File

@ -1,96 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.commons.lang3.StringUtils.split;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.UtilCommunityAPI;
/**
* @author miriam.baglioni
* @Date 09/05/23
*/
//STEP 1
public class DumpCommunities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class);
private final BufferedWriter writer;
private final static String HEADER = "id" + Constants.SEP + "name" + Constants.SEP + "acronym" + Constants.SEP
+ " description \n";
private final transient UtilCommunityAPI queryCommunityAPI;
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpCommunities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String nameNode = parser.get("nameNode");
log.info("nameNode: {}", nameNode);
final List<String> communities = Arrays.asList(split(parser.get("communities"), ";"));
final DumpCommunities dc = new DumpCommunities(outputPath, nameNode);
dc.writeCommunity(communities);
}
private void writeCommunity(List<String> communities)
throws IOException {
writer.write(HEADER);
writer.flush();
for (String community : queryCommunityAPI
.getCommunityCsv(communities)) {
writer
.write(
community);
writer.write("\n");
}
writer.close();
}
public DumpCommunities(String hdfsPath, String hdfsNameNode) throws Exception {
final Configuration conf = new Configuration();
queryCommunityAPI = new UtilCommunityAPI();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
Path hdfsWritePath = new Path(hdfsPath);
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
}
}

View File

@ -1,362 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.commons.lang3.StringUtils.remove;
import static org.apache.commons.lang3.StringUtils.split;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 04/05/23
*/
//STEP 3
public class SparkDumpResults implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpResults.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpResults.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String workingPath = parser.get("workingPath");
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, outputPath);
run(spark, inputPath, inputClazz, resultType, workingPath);
});
}
private static <R extends Result> void run(SparkSession spark, String inputPath,
Class<R> inputClazz, String resultType, String workingPath) {
Dataset<String> resultIds = spark.read().textFile(workingPath + "/resultIds");
// resultIds.foreach((ForeachFunction<String>) r -> System.out.println(r));
Dataset<R> results = Utils
.readPath(spark, inputPath + "/" + resultType, inputClazz)
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible());
resultIds
.joinWith(results, resultIds.col("value").equalTo(results.col("id")))
.map((MapFunction<Tuple2<String, R>, R>) t2 -> t2._2(), Encoders.bean(inputClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/" + resultType + "/temp/result");
// map results
results = Utils.readPath(spark, workingPath + "/" + resultType + "/temp/result", inputClazz);
results
.map(
(MapFunction<R, CSVResult>) r -> mapResultInfo(r),
Encoders.bean(CSVResult.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingPath + "/" + resultType + "/result");
// map relations between pid and result
results
.flatMap((FlatMapFunction<R, CSVPid>) r -> {
List<CSVPid> pids = new ArrayList<>();
if (Optional.ofNullable(r.getPid()).isPresent() && r.getPid().size() > 0) {
pids.addAll(mapPid(r.getPid(), r.getId()));
}
return pids.iterator();
}, Encoders.bean(CSVPid.class))
.filter(Objects::nonNull)
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingPath + "/" + resultType + "/result_pid");
// map authors from the result
// for each author in the result:
// if the author has an ORCID, the author id is derived from the ORCID (i.e. md5(orcid))
// if there is no ORCID, the id is built from result_id + author rank (falling back to the author's
// position in the author list when the rank is missing), again hashed with md5
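// A worked illustration of the rule above (the ORCID and rank values are made-up examples):
//   with ORCID:    authorId = DHPUtils.md5("0000-0002-1825-0097")
//   without ORCID: authorId = DHPUtils.md5(resultId + "3")   (3 being the rank, or the author position)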
results
.flatMap((FlatMapFunction<R, AuthorResult>) r -> {
int count = 0;
List<AuthorResult> arl = new ArrayList<>();
Set<String> authorIds = new HashSet();
if (Optional.ofNullable(r.getAuthor()).isPresent()) {
for (Author a : r.getAuthor()) {
count += 1;
AuthorResult ar = new AuthorResult();
ar.setResultId(r.getId());
if (Optional.ofNullable(a.getRank()).isPresent()) {
if (a.getRank() > 0) {
ar.setRank(String.valueOf(a.getRank()));
} else {
ar.setRank(String.valueOf(count));
}
}
ar.setFirstName(removeBreaks(a.getName()));
ar.setLastName(removeBreaks(a.getSurname()));
ar.setFullName(removeBreaks(a.getFullname()));
Tuple2<String, Boolean> orcid = getOrcid(a.getPid());
if (Optional.ofNullable(orcid).isPresent()) {
ar.setOrcid(orcid._1());
ar.setFromOrcid(orcid._2());
}
ar.autosetId();
if (!authorIds.contains(ar.getAuthorId())) {
arl.add(ar);
authorIds.add(ar.getAuthorId());
}
}
}
return arl.iterator();
}, Encoders.bean(AuthorResult.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/" + resultType + "/temp/authorresult");
Dataset<AuthorResult> authorResult = Utils
.readPath(spark, workingPath + "/" + resultType + "/temp/authorresult", AuthorResult.class);
// map the relation between author and result
authorResult
.map(
(MapFunction<AuthorResult, CSVRelResAut>) ar -> {
CSVRelResAut ret = new CSVRelResAut();
ret.setResult_id(ar.getResultId());
ret.setAuthor_id(ar.getAuthorId());
return ret;
},
Encoders.bean(CSVRelResAut.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingPath + "/" + resultType + "/result_author");
// map the authors in the working dir. They must not be repeated: when the same author id occurs more
// than once, prefer the record whose id was derived from an ORCID, if any
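// Illustration (hypothetical values): if the same author id was emitted with fromOrcid=false by one result
// and with fromOrcid=true by another, the mapGroups below keeps the fromOrcid=true record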
authorResult
.groupByKey((MapFunction<AuthorResult, String>) ar -> ar.getAuthorId(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, AuthorResult, CSVAuthor>) (k, it) -> {
AuthorResult first = it.next();
if (!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid())
return getAuthorDump(first);
while (it.hasNext()) {
AuthorResult ar = it.next();
if (ar.getFromOrcid())
return getAuthorDump(ar);
}
return getAuthorDump(first);
},
Encoders.bean(CSVAuthor.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(workingPath + "/" + resultType + "/author");
}
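// builds one CSVPid per distinct (type, value) pair of the result pids; the row id is the md5 of
// "<type>@<value>@<resultId>", with type and value lowercased first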
private static List<CSVPid> mapPid(List<StructuredProperty> pid, String resultId) {
return pid
.stream()
.map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase())
.distinct()
.map(p -> {
CSVPid ret = new CSVPid();
ret.setId(DHPUtils.md5(p + "@" + resultId));
ret.setResult_id(resultId);
ret.setPid(split(p, "@")[1]);
ret.setType(split(p, "@")[0]);
return ret;
})
.collect(Collectors.toList());
}
private static CSVAuthor getAuthorDump(AuthorResult ar) {
CSVAuthor ret = new CSVAuthor();
ret.setFirstname(ar.getFirstName());
ret.setId(ar.getAuthorId());
ret.setLastname(ar.getLastName());
ret.setFullname(ar.getFullName());
if (ar.getOrcid() != null) {
ret.setOrcid(ar.getOrcid());
ret.setFromOrcid(ar.getFromOrcid());
} else {
ret.setOrcid("");
}
return ret;
}
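// returns the author ORCID together with a flag telling whether it comes from an authoritative orcid pid
// (TRUE) or from an orcid_pending one (FALSE); null when no ORCID-like pid is present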
private static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
if (!Optional.ofNullable(pid).isPresent())
return null;
if (pid.size() == 0)
return null;
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
return new Tuple2<>(p.getValue(), Boolean.TRUE);
}
}
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
return new Tuple2<>(p.getValue(), Boolean.FALSE);
}
}
return null;
}
private static String getFieldValue(Field<String> input) {
if (input != null &&
StringUtils.isNotEmpty(input.getValue())) {
return removeBreaks(input.getValue());
} else {
return "";
}
}
private static <R extends Result> CSVResult mapResultInfo(R r) {
CSVResult ret = new CSVResult();
ret.setId(removeBreaks(r.getId()));
ret.setType(removeBreaks(r.getResulttype().getClassid()));
ret.setTitle(getTitle(r.getTitle()));
ret.setDescription(getAbstract(r.getDescription()));
ret.setAccessright(removeBreaks(r.getBestaccessright().getClassid()));
ret.setPublication_date(removeBreaks(getFieldValue(r.getDateofacceptance())));
ret.setPublisher(removeBreaks(getFieldValue(r.getPublisher())));
if (Optional.ofNullable(r.getSubject()).isPresent())
ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> {
if (StringUtils.isNotEmpty(s.getValue()))
return removeBreaks(s.getValue().toLowerCase());
else
return null;
}).filter(Objects::nonNull).distinct().collect(Collectors.toList())));
else
ret.setKeywords("");
if (Optional.ofNullable(r.getCountry()).isPresent())
ret
.setCountry(
String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList())));
else
ret.setCountry("");
if (Optional.ofNullable(r.getLanguage()).isPresent() && StringUtils.isNotEmpty(r.getLanguage().getClassid())) {
ret.setLanguage(r.getLanguage().getClassid());
} else {
ret.setLanguage("");
}
return ret;
}
private static String getAbstract(List<Field<String>> description) {
if (description == null)
return "";
for (Field<String> abs : description) {
if (StringUtils.isNotEmpty(abs.getValue())) {
return removeBreaks(abs.getValue());
}
}
return "";
}
private static String getTitle(List<StructuredProperty> titles) {
String firstTitle = null;
for (StructuredProperty title : titles) {
if (StringUtils.isEmpty(firstTitle)) {
if (StringUtils.isNotEmpty(title.getValue()))
firstTitle = removeBreaks(title.getValue());
}
if (title.getQualifier().getClassid().equals(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())) {
if (StringUtils.isNotEmpty(title.getValue()))
return removeBreaks(title.getValue());
}
}
if (firstTitle != null) {
return removeBreaks(firstTitle);
}
return "";
}
private static String removeBreaks(String input) {
if (StringUtils.isNotEmpty(input))
return input
.replace("\n", " ")
.replace("\t", " ")
.replace("\r", " ")
// .replace("\\", " ")
.replace("\"", " ");
return input;
}
}

View File

@ -1,133 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
import eu.dnetlib.dhp.schema.oaf.*;
/**
* @author miriam.baglioni
* @Date 10/05/23
*/
//STEP 4
public class SparkMoveOnSigleDir implements Serializable {
// All the products saved in different directories are moved under a single one.
// For the authors a reconciliation step must also be performed, since the same author id can be saved in
// more than one directory
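// Sketch of the reconciliation performed below: the author datasets produced under publication, dataset,
// software and otherresearchproduct are unioned, grouped by author id, and a single record per id is kept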
private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkMoveOnSigleDir.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, outputPath);
run(spark, outputPath, workingPath);
});
}
private static <R extends Result> void run(SparkSession spark, String outputPath,
String workingPath) {
Utils
.readPath(spark, workingPath + "/publication/result", CSVResult.class)
.union(Utils.readPath(spark, workingPath + "/dataset/result", CSVResult.class))
.union(Utils.readPath(spark, workingPath + "/software/result", CSVResult.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result", CSVResult.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/result");
Utils
.readPath(spark, workingPath + "/publication/result_pid", CSVPid.class)
.union(Utils.readPath(spark, workingPath + "/dataset/result_pid", CSVPid.class))
.union(Utils.readPath(spark, workingPath + "/software/result_pid", CSVPid.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_pid", CSVPid.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/result_pid");
Utils
.readPath(spark, workingPath + "/publication/result_author", CSVRelResAut.class)
.union(Utils.readPath(spark, workingPath + "/dataset/result_author", CSVRelResAut.class))
.union(Utils.readPath(spark, workingPath + "/software/result_author", CSVRelResAut.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_author", CSVRelResAut.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/result_author");
Utils
.readPath(spark, workingPath + "/publication/author", CSVAuthor.class)
.union(Utils.readPath(spark, workingPath + "/dataset/author", CSVAuthor.class))
.union(Utils.readPath(spark, workingPath + "/software/author", CSVAuthor.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/author", CSVAuthor.class))
.groupByKey((MapFunction<CSVAuthor, String>) r -> r.getId(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, CSVAuthor, CSVAuthor>) (k, it) -> it.next(), Encoders.bean(CSVAuthor.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/author");
}
}

View File

@ -1,227 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVCitation;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 04/05/23
*/
//STEP 2
public class SparkSelectResultsAndDumpRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class);
private static String RESULT_COMMUNITY_TABLE = "/result_community";
private static String COMMUNITY_RESULT_IDS = "/communityResultIds";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkSelectResultsAndDumpRelations.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String workingPath = parser.get("workingPath");
List<String> communityList = null;
Optional<String> communities = Optional.ofNullable(parser.get("communities"));
if (communities.isPresent()) {
communityList = Arrays.asList(communities.get().split(";"));
}
SparkConf conf = new SparkConf();
List<String> finalCommunityList = communityList;
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, outputPath);
run(spark, inputPath, outputPath, workingPath, finalCommunityList);
});
}
private static void run(SparkSession spark, String inputPath, String outputPath,
String workingPath,
List<String> communityList) {
// select the result ids related to the set of communities considered
writeCommunityRelatedIds(
spark, inputPath + "/publication", Publication.class, communityList, workingPath + COMMUNITY_RESULT_IDS);
writeCommunityRelatedIds(
spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + COMMUNITY_RESULT_IDS);
writeCommunityRelatedIds(
spark, inputPath + "/software", Software.class, communityList, workingPath + COMMUNITY_RESULT_IDS);
writeCommunityRelatedIds(
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList,
workingPath + COMMUNITY_RESULT_IDS);
// write the result-community relations
writeCommunityResultRelations(
spark, inputPath + "/publication", Publication.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
writeCommunityResultRelations(
spark, inputPath + "/dataset", Dataset.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
writeCommunityResultRelations(
spark, inputPath + "/software", Software.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
writeCommunityResultRelations(
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList,
outputPath + RESULT_COMMUNITY_TABLE);
// select the relations with Cites semantics
org.apache.spark.sql.Dataset<Relation> relations = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.CITES));
// select the targets of the selected relations whose source is one of the results related to the
// communities (the source results themselves are kept as well)
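// Illustration (hypothetical ids): if r1 belongs to one of the communities and a relation r1 -Cites-> r2
// exists, both r1 and r2 end up in workingPath/resultIds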
org.apache.spark.sql.Dataset<String> resultIds = spark
.read()
.textFile(workingPath + COMMUNITY_RESULT_IDS)
.distinct();
resultIds
.joinWith(relations, resultIds.col("value").equalTo(relations.col("source")), "left")
.flatMap((FlatMapFunction<Tuple2<String, Relation>, String>) t2 -> {
if (Optional.ofNullable(t2._2()).isPresent()) {
return Arrays.asList(t2._1(), t2._2().getTarget()).iterator();
} else {
return Arrays.asList(t2._1()).iterator();
}
}, Encoders.STRING())
.distinct()
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(workingPath + "/resultIds");
resultIds
.joinWith(relations, resultIds.col("value").equalTo(relations.col("source")))
.map(
(MapFunction<Tuple2<String, Relation>, CSVCitation>) t2 -> mapToCitation(t2._2()),
Encoders.bean(CSVCitation.class))
.write()
.option("compression", "gzip")
.option("header", "true")
.option("delimiter", Constants.SEP)
.mode(SaveMode.Overwrite)
.csv(outputPath + "/relation");
}
private static CSVCitation mapToCitation(Relation relation) {
CSVCitation ret = new CSVCitation();
ret.setId(DHPUtils.md5(relation.getSource() + relation.getRelClass().toLowerCase() + relation.getTarget()));
ret.setResult_id_cites(relation.getSource());
ret.setResult_id_cited(relation.getTarget());
return ret;
}
private static <R extends Result> void writeCommunityResultRelations(SparkSession spark, String inputPath,
Class<R> clazz, List<String> communityList, String outputPath) {
Utils
.readPath(spark, inputPath, clazz)
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible())
.flatMap((FlatMapFunction<R, CSVRELCommunityResult>) p -> {
Set<String> inserted = new HashSet<>();
List<CSVRELCommunityResult> ret = new ArrayList<>();
for (String context : p
.getContext()
.stream()
.map(Context::getId)
.distinct()
.collect(Collectors.toList())) {
String cId = context.contains("::")
? context.substring(0, context.indexOf("::"))
: context;
if (communityList.contains(cId) && !inserted.contains(cId)) {
CSVRELCommunityResult crc = new CSVRELCommunityResult();
crc.setResult_id(p.getId());
crc.setCommunity_id(DHPUtils.md5(cId));
ret.add(crc);
inserted.add(cId);
}
}
return ret.iterator();
}, Encoders.bean(CSVRELCommunityResult.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(outputPath);
}
private static <R extends Result> void writeCommunityRelatedIds(SparkSession spark, String inputPath,
Class<R> clazz, List<String> communityList, String outputPath) {
Utils
.readPath(spark, inputPath, clazz)
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible() &&
isRelatedToCommunities(p, communityList))
.map((MapFunction<R, String>) Result::getId, Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.text(outputPath);
}
private static <R extends Result> boolean isRelatedToCommunities(R p, List<String> communityList) {
return p
.getContext()
.stream()
.anyMatch(
c -> communityList.contains(c.getId()) ||
(c.getId().contains("::")
&& communityList.contains(c.getId().substring(0, c.getId().indexOf("::")))));
}
}

View File

@ -1,68 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
import java.io.Serializable;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class CSVAuthor implements Serializable {
private String id;
private String firstname;
private String lastname;
private String fullname;
private String orcid;
private Boolean fromOrcid;
public Boolean getFromOrcid() {
return fromOrcid;
}
public void setFromOrcid(Boolean fromOrcid) {
this.fromOrcid = fromOrcid;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = Constants.addQuotes(id);
}
public String getFirstname() {
return firstname;
}
public void setFirstname(String firstname) {
this.firstname = Constants.addQuotes(firstname);
}
public String getLastname() {
return lastname;
}
public void setLastname(String lastname) {
this.lastname = Constants.addQuotes(lastname);
}
public String getFullname() {
return fullname;
}
public void setFullname(String fullname) {
this.fullname = Constants.addQuotes(fullname);
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = Constants.addQuotes(orcid);
}
}

View File

@ -1,40 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
import java.io.Serializable;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class CSVCitation implements Serializable {
private String id;
private String result_id_cites;
private String result_id_cited;
public String getId() {
return id;
}
public void setId(String id) {
this.id = Constants.addQuotes(id);
}
public String getResult_id_cites() {
return result_id_cites;
}
public void setResult_id_cites(String result_id_cites) {
this.result_id_cites = Constants.addQuotes(result_id_cites);
}
public String getResult_id_cited() {
return result_id_cited;
}
public void setResult_id_cited(String result_id_cited) {
this.result_id_cited = Constants.addQuotes(result_id_cited);
}
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
import java.io.Serializable;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class CSVPid implements Serializable {
private String id;
private String result_id;
private String pid;
private String type;
public String getResult_id() {
return result_id;
}
public void setResult_id(String result_id) {
this.result_id = Constants.addQuotes(result_id);
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = Constants.addQuotes(pid);
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = Constants.addQuotes(type);
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = Constants.addQuotes(id);
}
}

View File

@ -1,31 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
import java.io.Serializable;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class CSVRELCommunityResult implements Serializable {
private String result_id;
private String community_id;
public String getResult_id() {
return result_id;
}
public void setResult_id(String result_id) {
this.result_id = Constants.addQuotes(result_id);
}
public String getCommunity_id() {
return community_id;
}
public void setCommunity_id(String community_id) {
this.community_id = Constants.addQuotes(community_id);
}
}

View File

@ -1,31 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
import java.io.Serializable;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class CSVRelResAut implements Serializable {
private String result_id;
private String author_id;
public String getResult_id() {
return result_id;
}
public void setResult_id(String result_id) {
this.result_id = Constants.addQuotes(result_id);
}
public String getAuthor_id() {
return author_id;
}
public void setAuthor_id(String author_id) {
this.author_id = Constants.addQuotes(author_id);
}
}

View File

@ -1,113 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.csv.model;
import java.io.Serializable;
import org.apache.commons.lang.StringUtils;
import com.fasterxml.jackson.annotation.JsonGetter;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSetter;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import sun.swing.StringUIClientPropertyKey;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class CSVResult implements Serializable {
private String id;
private String type;
private String title;
private String description;
private String accessright;
private String publication_date;
private String publisher;
private String keywords;
private String country;
private String language;
public String getId() {
return id;
}
public void setId(String id) {
this.id = Constants.addQuotes(id);
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = Constants.addQuotes(type);
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = Constants.addQuotes(title);
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = Constants.addQuotes(description);
}
public String getAccessright() {
return accessright;
}
public void setAccessright(String accessright) {
this.accessright = Constants.addQuotes(accessright);
}
public String getPublication_date() {
return publication_date;
}
public void setPublication_date(String publication_date) {
this.publication_date = Constants.addQuotes(publication_date);
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = Constants.addQuotes(publisher);
}
public String getKeywords() {
return keywords;
}
public void setKeywords(String keywords) {
this.keywords = Constants.addQuotes(keywords);
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = Constants.addQuotes(country);
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = Constants.addQuotes(language);
}
}

View File

@ -1,30 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.exceptions;
public class MyRuntimeException extends RuntimeException {
public MyRuntimeException() {
super();
}
public MyRuntimeException(
final String message,
final Throwable cause,
final boolean enableSuppression,
final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
public MyRuntimeException(final String message, final Throwable cause) {
super(message, cause);
}
public MyRuntimeException(final String message) {
super(message);
}
public MyRuntimeException(final Throwable cause) {
super(cause);
}
}

View File

@ -0,0 +1,55 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
/**
* @author miriam.baglioni
* @Date 26/03/24
*/
/**
* @author miriam.baglioni
* @Date 26/03/24
*/
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
* @author miriam.baglioni
* @Date 25/09/23
*/
public class EoscMasterDuplicate {
private static final Logger log = LoggerFactory.getLogger(EoscMasterDuplicate.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
EoscMasterDuplicate.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/datasourcemaster_parameters.json")));
parser.parseArgument(args);
final String dbUrl = parser.get("postgresUrl");
log.info("postgresUrl: {}", dbUrl);
final String dbUser = parser.get("postgresUser");
log.info("postgresUser: {}", dbUser);
final String dbPassword = parser.get("postgresPassword");
log.info("postgresPassword: {}", dbPassword);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode: {}", hdfsNameNode);
ReadDatasourceMasterDuplicateFromDB.execute(dbUrl, dbUser, dbPassword, hdfsPath, hdfsNameNode);
}
}

View File

@ -0,0 +1,93 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromEntities;
import eu.dnetlib.dhp.oa.graph.dump.skgif.Utils;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 12/03/24
*/
public class FilterEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(EmitFromEntities.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
FilterEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/filter_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String filterPath = parser.get("filterPath");
log.info("filterPath: {}", filterPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
filterEntities(spark, inputPath, filterPath, workingDir);
});
}
private static <R extends Result> void filterEntities(SparkSession spark, String inputPath, String filterPath,
String workingDir) {
ModelSupport.entityTypes.keySet().forEach(e -> {
if (ModelSupport.isResult(e)) {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
Dataset<R> result = Utils
.readPath(spark, inputPath + e.name(), resultClazz);
Dataset<Row> filterIds = spark.read().parquet(filterPath + e.name() + "_ids");
result
.join(filterIds, result.col("id").equalTo(filterIds.col("id")), "leftsemi")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name());
}
});
}
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 25/09/23
*/
public class MasterDuplicate implements Serializable {
private String eoscId;
private String graphId;
private String graphName;
public String getEoscId() {
return eoscId;
}
public void setEoscId(String eoscId) {
this.eoscId = eoscId;
}
public String getGraphId() {
return graphId;
}
public void setGraphId(String graphId) {
this.graphId = graphId;
}
public String getGraphName() {
return graphName;
}
public void setGraphName(String graphName) {
this.graphName = graphName;
}
}

View File

@ -0,0 +1,323 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
/**
* @author miriam.baglioni
* @Date 26/03/24
*/
/**
* @author miriam.baglioni
* @Date 26/03/24
*/
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class ReadDatasourceMasterDuplicateFromDB {
private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
public ReadDatasourceMasterDuplicateFromDB() {
}
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
throws IOException {
int count = 0;
DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword);
Throwable var7 = null;
try {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));
log.info("running query: {}", QUERY);
log.info("storing results in: {}", hdfsPath);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
Throwable var12 = null;
try {
dbClient
.processResults(
"SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);",
(rs) -> {
writeMap(datasourceMasterMap(rs), writer);
});
++count;
} catch (Throwable var35) {
var12 = var35;
throw var35;
} finally {
if (writer != null) {
if (var12 != null) {
try {
writer.close();
} catch (Throwable var34) {
var12.addSuppressed(var34);
}
} else {
writer.close();
}
}
}
} catch (Throwable var37) {
var7 = var37;
throw var37;
} finally {
if (dbClient != null) {
if (var7 != null) {
try {
dbClient.close();
} catch (Throwable var33) {
var7.addSuppressed(var33);
}
} else {
dbClient.close();
}
}
}
return count;
}
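// keeps only the datasource duplicates whose id starts with "eosc": for those the eosc id, the OpenAIRE
// master id and the master official name are recorded; all other rows are discarded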
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
try {
MasterDuplicate md = new MasterDuplicate();
String duplicateId = rs.getString("duplicateId");
String masterId = rs.getString("masterId");
String masterName = rs.getString("masterName");
if (duplicateId.startsWith("eosc")) {
final String eoscDsId = getEoscDsId(duplicateId);
md.setEoscId(eoscDsId);
md.setGraphId(OafMapperUtils.createOpenaireId(10, masterId, true));
md.setGraphName(masterName);
return md;
}
return null;
} catch (SQLException var5) {
throw new RuntimeException(var5);
}
}
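// static mapping from the old hash-based eosc datasource identifiers to the new human-readable ones;
// identifiers not listed in the switch are returned unchanged, just stripped of the prefix before the last "::"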
@NotNull
private static String getEoscDsId(String duplicateId) {
String eoscDsId = duplicateId.substring(duplicateId.lastIndexOf("::") + 2);
switch (eoscDsId) {
case "eosc.blue-cloud.44fa8dba8ad3ed19445227940032f31c":
eoscDsId = "eosc.blue-cloud.grsf";
break;
case "eosc.ror-org.24ef0000cfbf3ce7f3a40ba6b87e76ce":
eoscDsId = "eosc.ror-org.ror";
break;
case "eosc.clarin-eric.2aad8ade139792a49b130b539e1bb144":
eoscDsId = "eosc.clarin-eric.virtual_language_observatory";
break;
case "eosc.embl-ebi.e29a4e098afa05818957179f05d8e21d":
eoscDsId = "eosc.embl-ebi.icr";
break;
case "eosc.cyfronet.b59c2171d05ed9fb9e70a86d544f42a3":
eoscDsId = "eosc.cyfronet.rodbuk";
break;
case "eosc.eudat.9168f179ffab97584bf99a2729837545":
eoscDsId = "eosc.eudat.b2safe";
break;
case "eosc.oxford_e-research_centre.21697de1a5b10b8eb5fad857edecf5c9":
eoscDsId = "eosc.oxford_e-research_centre.fairsharing";
break;
case "eosc.inria.5923d0f31f0acda46cf4b592972284a2":
eoscDsId = "eosc.inria.software_heritage_archive";
break;
case "eosc.rli.661cdfdc74561b8eb69583b8137799d2":
eoscDsId = "eosc.rli.open_energy_platform";
break;
case "eosc.bbmri-eric.314cee7546a7489c2cc3ab79d34e2640":
eoscDsId = "eosc.bbmri-eric.bbmri-eric_directory";
break;
case "eosc.ku_leuven.68bf19ae7ee1bc7e3872255e96550c04":
eoscDsId = "eosc.ku_leuven.lirias";
break;
case "eosc.wenmr.d288225c333b07fc9d001da5c5392741":
eoscDsId = "eosc.wenmr.madomsi3sobm";
break;
case "eosc.zpid.b96341f00ca4c3a314abcc07fc0084b2":
eoscDsId = "eosc.zpid.psycharchives";
break;
case "eosc.vamdc.c967f669aa354e584e6786ee1d0c823e":
eoscDsId = "eosc.vamdc.vamdc_portal";
break;
case "eosc.openaire.2bb8710e1870170a175110615698e677":
eoscDsId = "eosc.openaire.openaire_scholexplorer";
break;
case "eosc.elixir-uk.5126ffcc8e23f65bbbe219d36128f2c8":
eoscDsId = "eosc.elixir-uk.workflowhub";
break;
case "eosc.vliz.61c6dae33d794d477e6a68ed43f52eb3":
eoscDsId = "eosc.vliz.worms";
break;
case "eosc.cern.8025243fa3c887159fc9b3930ae147c2":
eoscDsId = "eosc.cern.cod";
break;
case "eosc.hits.901e9baaa76d72017ebd7dfd93436caf":
eoscDsId = "eosc.hits.fairdomhub";
break;
case "eosc.bbmri-eric.8206c9aa93eb9513383218704570feb2":
eoscDsId = "eosc.bbmri-eric.bbmri-eric_crc-cohort";
break;
case "eosc.hn.02e4d980399d7142506e8aadb2b8e865":
eoscDsId = "eosc.hn.isidore";
break;
case "eosc.obsparis.9e98089baaf6af32fab3154873dfdfeb":
eoscDsId = "eosc.obsparis::eosc.obsparis.padc";
break;
case "eosc.esrf.ecc74ab09791c52aa238ee77ae988874":
eoscDsId = "eosc.esrf::eosc.esrf.tesrfdp";
break;
case "eosc.cessda-eric.7e17e8817404ce7a8013be373723b2be":
eoscDsId = "eosc.cessda-eric.cdc";
break;
case "eosc.psi.f1a79f572f95bc2fbea5cdc40ef4eb22":
eoscDsId = "eosc.psi.psi_public_data_repository";
break;
case "eosc.uniwersytet_opolski.19b44a96f7a776774de3939d9820d00c":
eoscDsId = "eosc.uniwersytet_opolski.bk_uniopole";
break;
case "eosc.lindatclariah-cz.6dc98fcb5294282acf3d92f3ab3376b2":
eoscDsId = "eosc.lindatclariah-cz.lindatclariah-cz_repository";
break;
case "eosc.eudat.17bb7bb8ef1af0f9bdb55a7db30cfa8a":
eoscDsId = "eosc.eudat.b2share";
break;
case "eosc.acdh-ch.3b0149bee976d6db7eef053159e97a87":
eoscDsId = "eosc.acdh-ch.arche";
break;
case "eosc.uit.49e8d4cef23bda3b66dd417e6675727d":
eoscDsId = "eosc.uit.trolling";
break;
case "eosc.csuc.135887d3dea4b6723095d13c28dd52a3":
eoscDsId = "eosc.csuc.corardr";
break;
case "eosc.ccsd.06cdd3ff4700bb4c8e7bf22c14f23f5b":
eoscDsId = "eosc.ccsd.episciences";
break;
case "eosc.gbif.14ac40283813a624bd74ae82605ded23":
eoscDsId = "eosc.gbif.gbif_species_occurrence_data";
break;
case "eosc.gdansk_tech.1434de11c83986b5be5592677f28d171":
eoscDsId = "eosc.gdansk_tech.most";
break;
case "eosc.gwdg.d6521479ffa922bbccc839606b8ec7c5":
eoscDsId = "eosc.gwdg.textgrid_repository";
break;
case "eosc.unipd.12d35bb1f56d4b91bb4644faf76d9486":
eoscDsId = "eosc.unipd.rdu";
break;
case "eosc.unibi-ub.a61d9ea844bdf43e6feabd6b14dfe3c5":
eoscDsId = "eosc.unibi-ub.pub";
break;
case "eosc.scipedia.0063745e5964b19c3e9ceeb2bd6632f5":
eoscDsId = "eosc.scipedia.spaosp";
break;
case "eosc.psnc.6f0470e3bb9203ec3a7553f3a72a7a1f":
eoscDsId = "eosc.psnc.rohub";
break;
case "eosc.ill.d422cba59746f39d10bdfea5c9cf8511":
eoscDsId = "eosc.ill.ill_data_portal";
break;
case "eosc.ceric-eric.e9354332fd75190b935b80c1ba30b837":
eoscDsId = "eosc.ceric-eric.ceric-data-portal";
break;
case "eosc.cnr_-_isti.dbe89d2b83f3e29caab7923a51c1d151":
eoscDsId = "eosc.cnr_-_isti.isti_open_portal";
break;
case "eosc.lapp.ef0bb7d889d0cced364444495f7a1e67":
eoscDsId = "eosc.lapp.ossr";
break;
case "eosc.lida.26c1ee137e7510fd1d7e44eb87cdb4af":
eoscDsId = "eosc.lida.lida_survey_data";
break;
case "eosc.awi_bremerhaven.2882af227241cb956c28fe321a70dfb2":
eoscDsId = "eosc.awi_bremerhaven.pangaea";
break;
case "eosc.riga_stradins_university.4ea61809e753e65a459bbe4a492c773b":
eoscDsId = "eosc.riga_stradins_university.rsu_dataverse";
break;
case "eosc.ku_leuven.1cb0937dc41e70d8126d7b259ad470af":
eoscDsId = "eosc.ku_leuven.ku_leuven_rdr";
break;
case "eosc.dkrz.9ffffb05aaf22e7f9138dca4560a8c8b":
eoscDsId = "eosc.dkrz.wdcc";
break;
case "eosc.openaire.0a02f13310296033694acead588a773b":
eoscDsId = "eosc.openaire.zenodo";
break;
case "eosc.vilnius-university.1ec069c1620d49d460e4cbcec0af57f6":
eoscDsId = "eosc.vilnius-university.tnoarda";
break;
case "eosc.icos_eric.25c5f3f0674fb287e05e697263e211e2":
eoscDsId = "eosc.icos_eric.data_discovery_and_access_portal";
break;
case "eosc.fris.8f42bfccf70de38b01763b704300f882":
eoscDsId = "eosc.fris.fris";
break;
}
return eoscDsId;
}
private static void writeMap(MasterDuplicate dm, BufferedWriter writer) {
if (dm == null)
return;
try {
writer.write(OBJECT_MAPPER.writeValueAsString(dm));
writer.newLine();
} catch (IOException var3) {
throw new RuntimeException(var3);
}
}
}

View File

@ -0,0 +1,345 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromEntities;
import eu.dnetlib.dhp.oa.graph.dump.skgif.Utils;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 12/03/24
*/
public class SelectConnectedEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(EmitFromEntities.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SelectConnectedEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/select_connected_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String filterPath = parser.get("filterPath");
log.info("filterPath: {}", filterPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
selectConnectedEntities2(spark, inputPath, filterPath, workingDir);
});
}
private static void selectConnectedEntities2(SparkSession spark, String inputPath, String filterPath,
String workingDir) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType);
Dataset<String> resultIds = spark.emptyDataset(Encoders.STRING());
for (EntityType entity : ModelSupport.entityTypes.keySet())
if (ModelSupport.isResult(entity))
resultIds = resultIds
.union(
spark
.read()
.parquet(filterPath + entity.name() + "_ids")
.select("id")
.as(Encoders.STRING()));
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(inputPath + "relation")
.filter(("datainfo.deletedbyinference != true"))
.drop("datainfo");
Dataset<Row> matchingRels = relation
.join(
resultIds, relation
.col("source")
.equalTo(resultIds.col("value")),
"leftsemi")
.select("target")
.distinct();
Dataset<Row> organization = spark
.read()
.schema(Encoders.bean(Organization.class).schema())
.json(inputPath + "organization")
.filter("datainfo.deletedbyinference != true ");
Dataset<Project> projects = Utils
.readPath(spark, inputPath + "project", Project.class)
.filter((FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference())
.filter(
(FilterFunction<Project>) p -> Optional.ofNullable(p.getFundingtree()).isPresent() &&
p.getFundingtree().size() > 0 &&
Utils
.getFunderName(p.getFundingtree().get(0).getValue())
.equalsIgnoreCase("European Commission"));
organization
.join(matchingRels, organization.col("id").equalTo(matchingRels.col("target")), "leftsemi")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "organization");
projects
.join(matchingRels, projects.col("id").equalTo(matchingRels.col("target")), "leftsemi")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "/project");
Dataset<Row> datasources = spark
.read()
.schema(Encoders.bean(Datasource.class).schema())
.json(inputPath + "datasource")
.filter("datainfo.deletedbyinference != true");
final Dataset<String> datasourceReferencedIds = getDatasourceReferenceIdDataset(spark, workingDir);
datasources
.join(
datasourceReferencedIds, datasourceReferencedIds.col("value").equalTo(datasources.col("id")),
"left_semi")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "datasource");
}
// private static <R extends Result> void selectConnectedEntities(SparkSession spark, String inputPath,
// String filterPath,
// String workingDir) throws JsonProcessingException {
//
// Dataset<String> resultIds = spark.emptyDataset(Encoders.STRING());
// for (EntityType entity : ModelSupport.entityTypes.keySet())
// if (ModelSupport.isResult(entity))
// resultIds = resultIds
// .union(
// spark
// .read()
// .parquet(filterPath + entity.name() + "_ids")
// .select("id")
// .as(Encoders.STRING()));
//
// Dataset<Relation> relation = Utils
// .readPath(spark, inputPath + "relation", Relation.class)
// .filter((FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference());
// Dataset<Organization> organizations = Utils
// .readPath(spark, inputPath + "organization", Organization.class)
// .filter((FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference());
// Dataset<Project> projects = Utils
// .readPath(spark, inputPath + "project", Project.class)
// .filter((FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference())
// .filter(
// (FilterFunction<Project>) p -> Optional.ofNullable(p.getFundingtree()).isPresent() &&
// p.getFundingtree().size() > 0 &&
// Utils
// .getFunderName(p.getFundingtree().get(0).getValue())
// .equalsIgnoreCase("European Commission"));
//
// Dataset<Datasource> datasources = Utils
// .readPath(spark, inputPath + "datasource", Datasource.class)
// .filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getDeletedbyinference());
//
// // select relations having source in the set of identifiers selected for eosc
// Dataset<Row> relationSource = relation
// .join(resultIds, resultIds.col("value").equalTo(relation.col("source")), "left_semi");
// relationSource
// .join(resultIds, resultIds.col("value").equalTo(relation.col("target")), "left_semi")
// .write()
// .option("compression", "gzip")
// .mode(SaveMode.Overwrite)
// .json(workingDir + "resultrelation");
////
//// // write relations between results and organizations
// relationSource
// .joinWith(organizations, relation.col("target").equalTo(organizations.col("id")), "left_semi")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "organizaitonrelation");
//
// relationSource
// .joinWith(projects, relation.col("target").equalTo(projects.col("id")), "left_semi")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "projectrelation");
//
// // write organizations linked to results in the set
//
// organizations
// .join(relationSource, relationSource.col("target").equalTo(organizations.col("id")), "left_semi")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "organization");
//
// // write projects linked to results in the set
// projects
// .join(relationSource, relationSource.col("target").equalTo(projects.col("id")))
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "project");
//
// // read the results and select all the distinct instance.hostedbykey
// final Dataset<String> datasourceReferencedIds = getDatasourceReferenceIdDataset(spark, workingDir);
// // join with the datasources and write the datasource in the join
// datasources
// .joinWith(
// datasourceReferencedIds, datasourceReferencedIds.col("value").equalTo(datasources.col("id")),
// "left_semi")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "datasource");
//
// // selecting relations between organizations and projects in the selected set
// StructType tp = StructType.fromDDL("`id` STRING");
// Dataset<Row> organizationSbs = spark
// .read()
// .schema(tp)
// .json(workingDir + "organization")
// .select("id");
//
// Dataset<Row> projectSbs = spark
// .read()
// .schema(tp)
// .json(workingDir + "project")
// .select("id");
////
// Dataset<Row> tmpRel;
// tmpRel = relation
// .join(
// organizationSbs, organizationSbs
// .col("id")
// .equalTo(relation.col("source")),
// "left_semi");
// tmpRel
// .join(projectSbs, tmpRel.col("target").equalTo(projectSbs.col("id")), "left_semi")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "orgprojelation");
//
// // selecting relations between datasources and organizations in the selected set
// Dataset<Row> datasourceSbs = spark
// .read()
// .schema(tp)
// .json(workingDir + "datasource")
// .select("id");
//
// tmpRel = relation
// .join(datasourceSbs, datasourceSbs.col("id").equalTo(relation.col("source")), "left_semi");
// tmpRel
// .join(organizationSbs, tmpRel.col("target").equalTo(organizationSbs.col("id")), "left_semi")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "datsorgrelation");
//
// Utils
// .readPath(spark, workingDir + "resultrelation", Relation.class)
// .union(Utils.readPath(spark, workingDir + "organizaitonrelation", Relation.class))
// .union(Utils.readPath(spark, workingDir + "projectrelation", Relation.class))
// .union(Utils.readPath(spark, workingDir + "orgprojelation", Relation.class))
// .union(Utils.readPath(spark, workingDir + "datsorgrelation", Relation.class))
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "relation");
//
// }
private static <R extends Result> Dataset<String> getDatasourceReferenceIdDataset(SparkSession spark,
String workingDir) {
Dataset<String> datasourceReferencedIds = spark.emptyDataset(Encoders.STRING());
for (EntityType entity : ModelSupport.entityTypes.keySet())
if (ModelSupport.isResult(entity)) {
Class<R> resultClazz = ModelSupport.entityTypes.get(entity);
datasourceReferencedIds = datasourceReferencedIds
.union(
Utils
.readPath(spark, workingDir + entity.name(), resultClazz)
.flatMap(
(FlatMapFunction<R, String>) r -> r
.getInstance()
.stream()
.filter(i -> i.getHostedby() != null && i.getHostedby().getKey() != null)
.map(i -> i.getHostedby().getKey())
.collect(Collectors.toList())
.iterator(),
Encoders.STRING()));
datasourceReferencedIds = datasourceReferencedIds
.union(
Utils
.readPath(spark, workingDir + entity.name(), resultClazz)
.flatMap(
(FlatMapFunction<R, String>) r -> r
.getInstance()
.stream()
.filter(i -> i.getCollectedfrom() != null && i.getCollectedfrom().getKey() != null)
.map(i -> i.getCollectedfrom().getKey())
.collect(Collectors.toList())
.iterator(),
Encoders.STRING()));
}
datasourceReferencedIds = datasourceReferencedIds.distinct();
return datasourceReferencedIds;
}
}

View File

@ -0,0 +1,103 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.functions.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.EmitFromEntities;
import eu.dnetlib.dhp.oa.graph.dump.skgif.Utils;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 20/03/24
*/
public class SelectEOSCEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SelectEOSCEntities.class);
private static final String B2FIND_IDENTIFIER = "10|re3data_____::730f562f9efe8a3b3742d2da510d4335";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
FilterEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/eosc_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String filterPath = parser.get("filterPath");
log.info("filterPath: {}", filterPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
selectEntities(spark, inputPath, filterPath);
});
}
private static <R extends Result> void selectEntities(SparkSession spark, String inputPath, String filterPath) {
ModelSupport.entityTypes.keySet().forEach(e -> {
if (ModelSupport.isResult(e)) {
// Utils
// .readPath(spark, inputPath + e.name(), ModelSupport.entityTypes.get(e))
// .filter(
// (FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference()
// && !r.getDataInfo().getInvisible()
// && (r.getContext().stream().anyMatch(c -> c.getId().equals("eosc")) ||
// r
// .getCollectedfrom()
// .stream()
// .anyMatch(cf -> cf.getValue().equalsIgnoreCase("B2FIND"))))
// .map((MapFunction<R, String>) r -> r.getId(), Encoders.STRING())
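// same selection expressed with Spark SQL predicates: keep visible, non deleted results that either
// belong to the 'eosc' context or were collected from B2FIND, and persist only their ids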
spark
.read()
.schema(Encoders.bean(Result.class).schema())
.json(inputPath + e.name())
.where("datainfo.deletedbyinference != true and datainfo.invisible != true")
.select("id", "context", "collectedfrom")
.where("array_contains(context.id,'eosc') or array_contains(collectedfrom.value,'B2FIND')")
.drop("context", "collectedfrom")
.distinct()
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.parquet(filterPath + e.name() + "_ids");
//
}
});
}
}

View File

@ -1,129 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.funderresults;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.amazonaws.transform.SimpleTypeUnmarshallers;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.oa.model.community.Funder;
import eu.dnetlib.dhp.oa.model.community.Project;
import io.netty.util.internal.StringUtil;
/**
 * Splits the dumped results by funder and stores them in a folder named after the funder namespace prefix (nsp).
 * For all funders but the EC this is the only grouping; for the EC the funding stream (FP7 or H2020) is also specified.
*/
public class SparkDumpFunderResults implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpFunderResults.class);
private static final ObjectMapper MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpFunderResults.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
writeResultProjectList(spark, inputPath, outputPath);
});
}
private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath) {
Dataset<CommunityResult> result = Utils
.readPath(spark, inputPath + "/publication", CommunityResult.class)
.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class))
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
log.info("Number of result {}", result.count());
Dataset<String> tmp = result
.flatMap((FlatMapFunction<CommunityResult, String>) cr -> cr.getProjects().stream().map(p -> {
return getFunderName(p);
}).collect(Collectors.toList()).iterator(), Encoders.STRING())
.distinct();
List<String> funderList = tmp.collectAsList();
funderList.stream().parallel().forEach(funder -> {
result
.filter(
(FilterFunction<CommunityResult>) r -> Optional.ofNullable(r.getProjects()).isPresent() &&
r.getProjects().stream().anyMatch(p -> getFunderName(p).equals(funder)))
.map((MapFunction<CommunityResult, String>) r -> MAPPER.writeValueAsString(r), Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(outputPath + "/" + funder);
});
}
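// derives the funder name used as folder name: the funder short name when available (for the EC extended
// with the funding stream), otherwise the prefix of the project id, normalised for a few known funders
// (e.g. conicytf -> CONICYT, dfgf -> DFG, tubitakf -> TUBITAK, euenvagency -> EEA)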
@NotNull
private static String getFunderName(Project p) {
Optional<Funder> ofunder = Optional.ofNullable(p.getFunder());
if (ofunder.isPresent()) {
String fName = ofunder.get().getShortName();
if (StringUtil.isNullOrEmpty(fName))
return ofunder.get().getName();
if (fName.equalsIgnoreCase("ec")) {
fName += "_" + ofunder.get().getFundingStream();
}
return fName;
} else {
String fName = p.getId().substring(0, p.getId().indexOf("_")).toUpperCase();
if (fName.equalsIgnoreCase("ec")) {
if (p.getId().contains("he")) {
fName += "_HE";
} else if (p.getId().contains("h2020")) {
fName += "_H2020";
} else {
fName += "_FP7";
}
} else if (fName.equalsIgnoreCase("conicytf")) {
fName = "CONICYT";
} else if (fName.equalsIgnoreCase("dfgf")) {
fName = "DFG";
} else if (fName.equalsIgnoreCase("tubitakf")) {
fName = "TUBITAK";
} else if (fName.equalsIgnoreCase("euenvagency")) {
fName = "EEA";
}
return fName;
}
}
}

View File

@ -1,120 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.funderresults;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.community.ResultProject;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
/**
* Selects the results linked to projects. Only for these results the dump will be performed.
 * The code that performs the dump and extends the dumped results with the project-related information
 * is the same used for the dump of the community products.
*/
public class SparkResultLinkedToProject implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkResultLinkedToProject.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkResultLinkedToProject.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters_link_prj.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String resultProjectsPath = parser.get("graphPath");
log.info("graphPath: {}", resultProjectsPath);
String communityMapPath = parser.get("communityMapPath");
@SuppressWarnings("unchecked")
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
writeResultsLinkedToProjects(
communityMapPath, spark, inputClazz, inputPath, outputPath, resultProjectsPath);
});
}
private static <R extends Result> void writeResultsLinkedToProjects(String communityMapPath, SparkSession spark,
Class<R> inputClazz,
String inputPath, String outputPath, String resultProjectsPath) {
Dataset<R> results = Utils
.readPath(spark, inputPath, inputClazz)
.filter(
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible());
Dataset<ResultProject> resultProjectDataset = Utils
.readPath(spark, resultProjectsPath, ResultProject.class);
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
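// join the filtered results with the result/project pairs, map them to CommunityResult and attach the
// list of linked projects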
results
.joinWith(resultProjectDataset, results.col("id").equalTo(resultProjectDataset.col("resultId")))
.map((MapFunction<Tuple2<R, ResultProject>, CommunityResult>) t2 -> {
CommunityResult cr = (CommunityResult) ResultMapper
.map(
t2._1(),
communityMap, Constants.DUMPTYPE.FUNDER.getType());
if (cr != null) {
cr.setProjects(t2._2().getProjectsList());
}
return cr;
}, Encoders.bean(CommunityResult.class))
.filter(Objects::nonNull)
.map(
(MapFunction<CommunityResult, String>) cr -> new ObjectMapper().writeValueAsString(cr),
Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(outputPath);
}
}

View File

@ -1,270 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.organizationonly;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.ENTITY_ID_SEPARATOR;
import static eu.dnetlib.dhp.oa.graph.dump.Utils.getEntityId;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.Container;
import eu.dnetlib.dhp.oa.model.Provenance;
import eu.dnetlib.dhp.oa.model.Result;
import eu.dnetlib.dhp.oa.model.graph.*;
import eu.dnetlib.dhp.oa.model.graph.Datasource;
import eu.dnetlib.dhp.oa.model.graph.Organization;
import eu.dnetlib.dhp.oa.model.graph.Project;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
 * Spark Job that fires the dump for the entities
*/
public class SparkDumpOrganizationJob implements Serializable {
private static final Logger log = LoggerFactory
.getLogger(eu.dnetlib.dhp.oa.graph.dump.organizationonly.SparkDumpOrganizationJob.class);
public static final String COMPRESSION = "compression";
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
Boolean isSparkSessionManaged = Boolean.TRUE;
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
log.info("inputPath: {}", inputPath);
final String outputPath = "/tmp/miriam/organizationsOnly/";
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath);
// relationMap2(spark, inputPath, outputPath);
});
}
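// maps the organization-to-organization relations of the graph into the dump Relation model, keeping
// provenance and validation information when available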
private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "relation", Relation.class)
.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
relNew
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
relNew
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
relNew
.setReltype(
RelType
.newInstance(
relation.getRelClass(),
relation.getSubRelType()));
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
if (odInfo.isPresent()) {
DataInfo dInfo = odInfo.get();
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
relNew
.setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
if (Boolean.TRUE.equals(relation.getValidated())) {
relNew.setValidated(relation.getValidated());
relNew.setValidationDate(relation.getValidationDate());
}
return relNew;
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "relation");
}
private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
organization
.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
.map(
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
Encoders.bean(Relation.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json("/tmp/orgSource");
rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
organization
.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
.map(
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
Encoders.bean(Relation.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json("/tmp/orgSourceTarget");
Utils
.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
relNew
.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
relNew
.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
relNew
.setReltype(
RelType
.newInstance(
relation.getRelClass(),
relation.getSubRelType()));
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
if (odInfo.isPresent()) {
DataInfo dInfo = odInfo.get();
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
relNew
.setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
if (Boolean.TRUE.equals(relation.getValidated())) {
relNew.setValidated(relation.getValidated());
relNew.setValidationDate(relation.getValidationDate());
}
return relNew;
}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "relation");
}
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
.map(
(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
Encoders.bean(Organization.class))
.filter((FilterFunction<Organization>) o -> o != null)
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.json(outputPath + "/organization");
}
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
eu.dnetlib.dhp.schema.oaf.Organization org) {
Organization organization = new Organization();
Optional
.ofNullable(org.getLegalshortname())
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
Optional
.ofNullable(org.getLegalname())
.ifPresent(value -> organization.setLegalname(value.getValue()));
Optional
.ofNullable(org.getWebsiteurl())
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
Optional
.ofNullable(org.getAlternativeNames())
.ifPresent(
value -> organization
.setAlternativenames(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(org.getCountry())
.ifPresent(
value -> {
if (!value.getClassid().equals(eu.dnetlib.dhp.oa.graph.dump.complete.Constants.UNKNOWN)) {
organization
.setCountry(
eu.dnetlib.dhp.oa.model.Country.newInstance(value.getClassid(), value.getClassname()));
}
});
Optional
.ofNullable(org.getId())
.ifPresent(value -> organization.setId(getEntityId(value, ENTITY_ID_SEPARATOR)));
Optional
.ofNullable(org.getPid())
.ifPresent(
value -> organization
.setPid(
value
.stream()
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
return organization;
}
}

View File

@ -1,87 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.projectssubset;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.graph.Project;
import scala.Tuple2;
public class ProjectsSubsetSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ProjectsSubsetSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
ProjectsSubsetSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String projectListPath = parser.get("projectListPath");
log.info("projectListPath: {}", projectListPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
getNewProjectList(spark, inputPath, outputPath, projectListPath);
});
}
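// dumps only the projects whose id does not already appear in the project list, then appends the newly
// dumped ids to that same list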
private static void getNewProjectList(SparkSession spark, String inputPath, String outputPath,
String projectListPath) {
Dataset<String> projectList = spark.read().textFile(projectListPath);
Dataset<Project> projects;
projects = Utils
.readPath(spark, inputPath, Project.class)
.map((MapFunction<Project, Project>) p -> {
p.setId("40|" + p.getId());
return p;
}, Encoders.bean(Project.class));
projects
.joinWith(projectList, projects.col("id").equalTo(projectList.col("value")), "left")
.map((MapFunction<Tuple2<Project, String>, Project>) t2 -> {
if (Optional.ofNullable(t2._2()).isPresent())
return null;
return t2._1();
}, Encoders.bean(Project.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
Utils
.readPath(spark, outputPath, Project.class)
.map((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING())
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.text(projectListPath);
}
}

View File

@ -1,241 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.serafeim;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 04/05/23
*/
//STEP 2
public class SparkSelectResultsAndDumpRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class);
private static String RESULT_COMMUNITY_TABLE = "/result_community";
private static String COMMUNITY_RESULT_IDS = "/communityResultIds";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkSelectResultsAndDumpRelations.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String workingPath = parser.get("workingPath");
List<String> communityList = null;
Optional<String> communities = Optional.ofNullable(parser.get("communities"));
if (communities.isPresent()) {
communityList = Arrays.asList(communities.get().split(";"));
}
SparkConf conf = new SparkConf();
List<String> finalCommunityList = communityList;
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
run(spark, inputPath, outputPath, workingPath, finalCommunityList);
});
}
private static void run(SparkSession spark, String inputPath, String outputPath,
String workingPath,
List<String> communityList) {
// select the result ids related to the set of communities considered
writeCommunityRelatedIds(
spark, inputPath, Publication.class, communityList, workingPath, "publication");
writeCommunityRelatedIds(
spark, inputPath, Dataset.class, communityList, workingPath, "dataset");
writeCommunityRelatedIds(
spark, inputPath, Software.class, communityList, workingPath, "software");
writeCommunityRelatedIds(
spark, inputPath, OtherResearchProduct.class, communityList,
workingPath, "otherresearchproduct");
// select the relations with semantics cites
org.apache.spark.sql.Dataset<Relation> relations = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.CITES));
// select the relations having as source one of the results related to the
// communities
org.apache.spark.sql.Dataset<String> communityResultIds = spark
.read()
.textFile(workingPath + COMMUNITY_RESULT_IDS)
.distinct();
Utils
.readPath(spark, inputPath + "/publication", Publication.class)
.filter(
(FilterFunction<Publication>) p -> !p.getDataInfo().getDeletedbyinference()
&& !p.getDataInfo().getInvisible())
.map((MapFunction<Publication, String>) p -> p.getId(), Encoders.STRING())
.union(
Utils
.readPath(spark, inputPath + "/dataset", Dataset.class)
.filter(
(FilterFunction<Dataset>) p -> !p.getDataInfo().getDeletedbyinference()
&& !p.getDataInfo().getInvisible())
.map((MapFunction<Dataset, String>) p -> p.getId(), Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/software", Software.class)
.filter(
(FilterFunction<Software>) p -> !p.getDataInfo().getDeletedbyinference()
&& !p.getDataInfo().getInvisible())
.map((MapFunction<Software, String>) p -> p.getId(), Encoders.STRING()))
.union(
Utils
.readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
.filter(
(FilterFunction<OtherResearchProduct>) p -> !p.getDataInfo().getDeletedbyinference()
&& !p.getDataInfo().getInvisible())
.map((MapFunction<OtherResearchProduct, String>) p -> p.getId(), Encoders.STRING()))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(workingPath + "/resultIds");
org.apache.spark.sql.Dataset<String> resultIds = spark.read().textFile(workingPath + "/resultIds");
org.apache.spark.sql.Dataset<Relation> oksource = communityResultIds
.joinWith(relations, communityResultIds.col("value").equalTo(relations.col("source")))
.map(
(MapFunction<Tuple2<String, Relation>, Relation>) t2 -> t2._2(),
Encoders.bean(Relation.class));
oksource
.joinWith(resultIds, oksource.col("target").equalTo(resultIds.col("value")))
.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath + "/relation");
writeNodes(
spark, inputPath + "/publication", Publication.class, outputPath + "/publication",
outputPath + "/relation", workingPath);
writeNodes(
spark, inputPath + "/dataset", Dataset.class, outputPath + "/dataset", outputPath + "/relation",
workingPath);
writeNodes(
spark, inputPath + "/software", Software.class, outputPath + "/software", outputPath + "/relation",
workingPath);
writeNodes(
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class,
outputPath + "/otherresearchproduct", outputPath + "/relation", workingPath);
}
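// dumps the results of the given type that appear as source or target of the selected citation relations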
private static <R extends Result> void writeNodes(SparkSession spark, String inputPath, Class<R> clazz,
String outputPath, String relationPath, String workingPath) {
org.apache.spark.sql.Dataset<Relation> citingRelations = Utils.readPath(spark, relationPath, Relation.class);
org.apache.spark.sql.Dataset<R> result = Utils
.readPath(spark, inputPath, clazz)
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible());
// take the distinct result id for source and target of the relations
citingRelations
.flatMap(
(FlatMapFunction<Relation, String>) r -> Arrays
.asList(r.getSource(), r.getTarget())
.iterator(),
Encoders.STRING())
.distinct()
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(workingPath + "/relationIds");
org.apache.spark.sql.Dataset<String> relationIds = spark.read().textFile(workingPath + "/relationIds");
relationIds
.joinWith(result, relationIds.col("value").equalTo(result.col("id")))
.map((MapFunction<Tuple2<String, R>, R>) t2 -> t2._2(), Encoders.bean(clazz))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
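// writes (in append mode) the ids of the results of the given type that are related to at least one of the
// selected communities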
private static <R extends Result> void writeCommunityRelatedIds(SparkSession spark, String inputPath,
Class<R> clazz, List<String> communityList, String outputPath, String resultType) {
org.apache.spark.sql.Dataset<R> results = Utils
.readPath(spark, inputPath + "/" + resultType, clazz)
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible() &&
isRelatedToCommunities(p, communityList));
results
.map((MapFunction<R, String>) Result::getId, Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.text(outputPath + COMMUNITY_RESULT_IDS);
// results
// // .repartition(10000)
// .write()
// .option("compression", "gzip")
// .mode(SaveMode.Append)
// .json(outputPath + "/" + resultType);
}
private static <R extends Result> boolean isRelatedToCommunities(R p, List<String> communityList) {
return p
.getContext()
.stream()
.anyMatch(
c -> communityList.contains(c.getId()) ||
(c.getId().contains("::")
&& communityList.contains(c.getId().substring(0, c.getId().indexOf("::")))));
}
}

View File

@ -0,0 +1,196 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.MinOrganization;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.RelationType;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class DumpDatasource implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpDatasource.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpDatasource.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/dump_datasource_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "datasources");
mapDatasource(spark, inputPath, outputPath, workingDir);
});
}
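// left joins the datasources with the organizations providing them and maps each datasource to the
// SKG-IF Datasource model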
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath, String workingDir) {
Dataset<ExtendingOrganization> organizations = Utils
.readPath(spark, workingDir + "/relations/datasource_providing_organization", ExtendingOrganization.class);
Dataset<Datasource> datasourceDataset = Utils
.readPath(spark, inputPath + "datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
&& !d.getDataInfo().getDeletedbyinference());
datasourceDataset
.joinWith(
organizations, datasourceDataset.col("id").equalTo(organizations.col("entityId")), "left")
.map((MapFunction<Tuple2<Datasource, ExtendingOrganization>, eu.dnetlib.dhp.skgif.model.Datasource>) t2 -> {
eu.dnetlib.dhp.skgif.model.Datasource datasource = dumpDatasource(t2._1());
if (Optional.ofNullable(t2._2()).isPresent()) {
datasource.setOrganization(t2._2().getRelevant_organization());
}
return datasource;
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "datasource");
}
private static eu.dnetlib.dhp.skgif.model.Datasource dumpDatasource(Datasource d) {
eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
datasource.setLocal_identifier(d.getId());
datasource
.setIdentifiers(
d
.getPid()
.stream()
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()));
datasource.setName(d.getOfficialname().getValue());
datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
datasource
.setJurisdiction(
Optional
.ofNullable(d.getJurisdiction())
.map(v -> v.getClassid())
.orElse(new String()));
datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
datasource.setVersion_control(d.getVersioncontrol());
datasource
.setData_source_classification(
Optional
.ofNullable(d.getEoscdatasourcetype())
.map(v -> v.getClassname())
.orElse(new String()));
datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
datasource.setThematic(d.getThematic());
datasource
.setResearch_product_access_policy(
Optional
.ofNullable(d.getDatabaseaccesstype())
.map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
.orElse(new ArrayList<>()));
datasource
.setResearch_product_metadata_access_policy(
Optional
.ofNullable(d.getResearchproductmetadataaccesspolicies())
.map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
.orElse(new ArrayList<>()));
return datasource;
}
private static List<String> getResearchProductAccessPolicy(List<String> value) {
return value
.stream()
.map(v -> getResearchProductAccessPolicy(v))
.filter(Objects::nonNull)
.map(v -> v.get(0))
.distinct()
.collect(Collectors.toList());
}
private static List<String> getResearchProductAccessPolicy(String value) {
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
switch (value) {
case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
return Arrays.asList("open access");
case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
return Arrays.asList("restricted access");
case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
return Arrays.asList("metadata only access");
default:
return null;
}
}
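// maps the datasource research entity types to the EOSC product type vocabulary (Research Software,
// Research Literature, Research Data, Other research product)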
private static List<String> getEoscProductType(List<String> researchentitytypes) {
List<String> eoscProductType = new ArrayList<>();
if (researchentitytypes != null) {
if (researchentitytypes.contains("Software"))
eoscProductType.add("Research Software");
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
eoscProductType.add("Research Literature");
if (researchentitytypes.contains("Research Data"))
eoscProductType.add("Research Data");
if (researchentitytypes.contains("Organization") ||
researchentitytypes.contains("Organizations") ||
researchentitytypes.contains("Services") ||
researchentitytypes.contains("Projects"))
eoscProductType.add("Other research product");
}
return eoscProductType;
}
}

View File

@ -0,0 +1,201 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericData;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 22/02/24
*/
public class DumpGrant implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpGrant.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/dump_grant_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "grants");
mapGrants(spark, inputPath, outputPath, workingDir);
});
}
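// deduplicates the projects by id, left joins them with the participating organizations (set as
// beneficiaries) and maps each project to the SKG-IF Grant model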
private static void mapGrants(SparkSession spark, String inputPath, String outputPath, String workingDir) {
Dataset<Project> projects = Utils
.readPath(spark, inputPath + "project", Project.class)
.filter(
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible());
Dataset<ExtendingOrganization> partecipatingOrgs = Utils
.readPath(spark, workingDir + "relations/project_partecipating_organization", ExtendingOrganization.class);
projects = projects
.groupByKey((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Project, Project>) (k, v) -> v.next(), Encoders.bean(Project.class));
projects
.joinWith(partecipatingOrgs, projects.col("id").equalTo(partecipatingOrgs.col("entityId")), "left")
.map((MapFunction<Tuple2<Project, ExtendingOrganization>, Grant>) t2 -> {
Grant g = dumpGrant(t2._1());
if (Optional.ofNullable(t2._2()).isPresent())
g.setBeneficiaries(t2._2().getRelevant_organization());
return g;
}, Encoders.bean(Grant.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "grants");
}
private static Grant dumpGrant(Project project) throws DocumentException {
Grant g = new Grant();
g.setLocal_identifier(project.getId());
g.setGrantCode(project.getCode().getValue());
g.setIdentifiers(getProjectIdentifier(project));
if (Optional.ofNullable(project.getTitle()).isPresent())
g.setTitle(project.getTitle().getValue());
g
.setSummary(
Optional
.ofNullable(project.getSummary())
.map(value -> value.getValue())
.orElse(new String()));
g
.setAcronym(
Optional
.ofNullable(project.getAcronym())
.map(value -> value.getValue())
.orElse(new String()));
if (Optional.ofNullable(project.getFundingtree()).isPresent() &&
project.getFundingtree().size() > 0) {
g.setFunder(Utils.getFunderName(project.getFundingtree().get(0).getValue()));
// funding_stream: taken from the funding tree using the xpath //funding_level_[n]
g.setFunding_stream(getFundingStream(project.getFundingtree().get(0).getValue()));
}
g
.setCurrency(
Optional
.ofNullable(project.getCurrency())
.map(value -> value.getValue())
.orElse(new String()));
g
.setFunded_amount(
Optional
.ofNullable(project.getFundedamount())
.orElse(null));
if (Optional.ofNullable(project.getSubjects()).isPresent())
g
.setKeywords(
project
.getSubjects()
.stream()
.map(s -> s.getValue())
.collect(Collectors.toList()));
g
.setStart_date(
Optional
.ofNullable(project.getStartdate())
.map(value -> value.getValue())
.orElse(new String()));
g
.setEnd_date(
Optional
.ofNullable(project.getEnddate())
.map(value -> value.getValue())
.orElse(new String()));
g
.setWebsite(
Optional
.ofNullable(project.getWebsiteurl())
.map(value -> value.getValue())
.orElse(new String()));
return g;
}
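// extracts the funding stream from the funding tree XML, i.e. the text of the first //funding_level_0/name
// node when present, an empty string otherwise; e.g. (illustrative fragment only) a tree containing
// <funding_level_0><name>H2020</name></funding_level_0> would yield "H2020"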
private static String getFundingStream(String fundingtree) throws DocumentException {
final Document doc;
doc = new SAXReader().read(new StringReader(fundingtree));
if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
doc.selectNodes("//funding_level_0").size() > 0 &&
Optional.ofNullable(doc.selectNodes("//funding_level_0/name")).isPresent() &&
doc.selectNodes("//funding_level_0/name").size() > 0)
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0/name").get(0))).getText();
return new String();
}
private static List<Identifier> getProjectIdentifier(Project project) throws DocumentException {
List<Identifier> identifiers = new ArrayList<>();
if (project.getPid().size() > 0)
project
.getPid()
.stream()
.forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())));
identifiers
.add(
Identifier
.newInstance(
Utils.getFunderName(project.getFundingtree().get(0).getValue()), project.getCode().getValue()));
return identifiers;
}
}

View File

@ -0,0 +1,145 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
import eu.dnetlib.dhp.skgif.model.Prefixes;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class DumpOrganization implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpOrganization.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpOrganization.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/dump_organization_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "organizations");
mapOrganization(spark, inputPath, outputPath);
});
}
private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
organizations = organizations
.filter(
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
&& !o.getDataInfo().getInvisible())
.groupByKey((MapFunction<Organization, String>) p -> p.getId(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Organization, Organization>) (k, v) -> v.next(),
Encoders.bean(Organization.class));
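// organizations without a pid are discarded; the others are mapped to the SKG-IF Organization model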
organizations.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
if (!Optional.ofNullable(o.getPid()).isPresent() || o.getPid().size() == 0)
return null;
eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
// organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
organization.setLocal_identifier(o.getId());
organization
.setCountry(
Optional
.ofNullable(o.getCountry().getClassid())
.orElse(new String()));
organization
.setName(
Optional
.ofNullable(o.getLegalname().getValue())
.orElse(new String()));
organization
.setShort_name(
Optional
.ofNullable(o.getLegalshortname())
.map(v -> v.getValue())
.orElse(new String()));
organization
.setIdentifiers(
o
.getPid()
.stream()
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()));
organization
.setOther_names(
o
.getAlternativeNames()
.stream()
.map(a -> a.getValue())
.collect(Collectors.toList()));
organization.setType(getOrganizationType(o));
return organization;
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
.filter((FilterFunction<eu.dnetlib.dhp.skgif.model.Organization>) o -> o != null)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "organizations");
}
private static String getOrganizationType(Organization o) {
if (Optional.ofNullable(o.getEcenterprise()).isPresent()
&& o.getEcenterprise().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.COMPANY.label;
if (Optional.ofNullable(o.getEchighereducation()).isPresent()
&& o.getEchighereducation().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.EDUCATION.label;
if (Optional.ofNullable(o.getEcresearchorganization()).isPresent()
&& o.getEcresearchorganization().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.EDUCATION.label;
if (Optional.ofNullable(o.getEcnonprofit()).isPresent()
&& o.getEcnonprofit().getValue().equalsIgnoreCase("true"))
return OrganizationTypes.NONPROFIT.label;
return OrganizationTypes.OTHER.label;
}
}

View File

@ -0,0 +1,442 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.filterentities.MasterDuplicate;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.*;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 06/02/24
*/
public class DumpResearchProduct implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpResearchProduct.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpResearchProduct.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/emit_biblio_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String masterDuplicatePath = parser.get("masterDuplicatePath");
log.info("masterDuplicatePath: {}", masterDuplicatePath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "products");
emitFromResult(spark, inputPath, outputPath, workingDir, masterDuplicatePath);
});
}
// for each result, emit the id plus the journal (if any), each instance and the instance's hostedby
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
String workingDir, String masterDuplicatePath) {
dumpResearchProduct(spark, inputPath, workingDir, masterDuplicatePath);
moveDumpedProducts(spark, workingDir, outputPath);
}
private static void moveDumpedProducts(SparkSession spark, String workingDir, String outputPath) {
Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
for (EntityType e : ModelSupport.entityTypes.keySet()) {
if (ModelSupport.isResult(e))
researchProducts = researchProducts
.union(
Utils
.readPath(
spark, workingDir + "products" + e.name() + "/researchproduct", ResearchProduct.class));
}
researchProducts
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "products");
}
private static <R extends Result> void dumpResearchProduct(SparkSession spark, String inputPath, String workingDir,
String masterDuplicatePath) {
List<MasterDuplicate> masterDuplicateList = Utils
.readPath(spark, masterDuplicatePath, MasterDuplicate.class)
.collectAsList();
ModelSupport.entityTypes.keySet().forEach(e -> {
if (ModelSupport.isResult(e)) {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
if (e.name().equalsIgnoreCase("publication")) {
dumpPublication(spark, inputPath, workingDir, e, resultClazz, masterDuplicateList);
} else {
dumpOtherResults(spark, inputPath, workingDir, e, resultClazz, masterDuplicateList);
}
includeRelevantOrganization(spark, workingDir, e);
includeFunding(spark, workingDir, e);
includeRelatedProducts(spark, workingDir, e);
}
});
}
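// the include* methods below enrich the intermediate research products through successive left joins:
// each reads the temporary output of the previous step, adds one piece of information (relevant
// organizations, funding, related products) and removes the temporary input afterwards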
private static void includeRelatedProducts(SparkSession spark, String workingDir, EntityType e) {
Dataset<ResearchProduct> pprWitGrants = spark
.read()
.schema(Encoders.bean(ResearchProduct.class).schema())
.json(workingDir + "products" + e.name() + "/temp_researchproductgrant")
.as(Encoders.bean(ResearchProduct.class));
Dataset<ProductsRelation> relatedResults = Utils
.readPath(spark, workingDir + "/relations/related_products", ProductsRelation.class);
pprWitGrants
.joinWith(
relatedResults, pprWitGrants.col("local_identifier").equalTo(relatedResults.col("resultId")),
"left")
.map(
(MapFunction<Tuple2<ResearchProduct, ProductsRelation>, ResearchProduct>) t2 -> {
if (t2._2() == null)
return t2._1();
t2._1().setRelated_products(t2._2().getRelated_products());
return t2._1();
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "products" + e.name() + "/researchproduct");
Utils.removeOutputDir(spark, workingDir + "products" + e.name() + "/temp_researchproductgrant");
}
private static void includeFunding(SparkSession spark, String workingDir, EntityType e) {
Dataset<ResearchProduct> prrWithAffiliation = spark
.read()
.schema(Encoders.bean(ResearchProduct.class).schema())
.json(workingDir + "products" + e.name() + "/temp_researchproductaff")
.as(Encoders.bean(ResearchProduct.class));
Dataset<GrantRelation> grants = Utils
.readPath(spark, workingDir + "relations/funding", GrantRelation.class);
// Dataset<PartialResearchProduct> pprWitGrants =
prrWithAffiliation
.joinWith(
grants, prrWithAffiliation.col("local_identifier").equalTo(grants.col("resultId")), "left")
.map((MapFunction<Tuple2<ResearchProduct, GrantRelation>, ResearchProduct>) t2 -> {
if (t2._2() == null)
return t2._1();
t2._1().setFunding(t2._2().getFunding());
return t2._1();
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "products" + e.name() + "/temp_researchproductgrant");
Utils.removeOutputDir(spark, workingDir + "products" + e.name() + "/temp_researchproductaff");
}
private static void includeRelevantOrganization(SparkSession spark, String workingDir, EntityType e) {
Dataset<ExtendingOrganization> affiliations = Utils
.readPath(
spark, workingDir + "relations/result_relevant_organizations", ExtendingOrganization.class);
Dataset<ResearchProduct> partialResearchProduct = spark
.read()
.schema(Encoders.bean(ResearchProduct.class).schema())
.json(workingDir + "products" + e.name() + "/temp_researchProduct")
.as(Encoders.bean(ResearchProduct.class));
// Dataset<PartialResearchProduct> prrWithAffiliation =
partialResearchProduct
.joinWith(
affiliations,
partialResearchProduct.col("local_identifier").equalTo(affiliations.col("entityId")),
"left")
.map(
(MapFunction<Tuple2<ResearchProduct, ExtendingOrganization>, ResearchProduct>) t2 -> {
if (t2._2() == null)
return t2._1();
t2._1().setRelevant_organizations(t2._2().getRelevant_organization());
return t2._1();
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "products" + e.name() + "/temp_researchproductaff");
Utils.removeOutputDir(spark, workingDir + "products" + e.name() + "/temp_researchProduct");
}
private static <R extends Result> void dumpOtherResults(SparkSession spark, String inputPath, String workingDir,
EntityType e, Class<R> resultClazz, List<MasterDuplicate> masterDuplicateList) {
Dataset<R> results = Utils.readPath(spark, inputPath + e.name(), resultClazz);
results.map((MapFunction<R, ResearchProduct>) r -> {
ArrayList<String> journalHbIds = new ArrayList<>();
ResearchProduct rp = ResultMapper.map(r);
rp
.setManifestations(
r
.getInstance()
.stream()
.map(i -> getManifestation(i, journalHbIds, r, masterDuplicateList))
.collect(Collectors.toList()));
return rp;
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "products" + e.name() + "/temp_researchProduct");
}
private static <R extends Result> void dumpPublication(SparkSession spark, String inputPath, String workingDir,
EntityType e, Class<R> resultClazz, List<MasterDuplicate> masterDuplicateList) {
Dataset<Tuple2<String, String>> resultHostedBy = Utils
.readPath(spark, inputPath + e.name(), resultClazz)
.flatMap(
(FlatMapFunction<R, Tuple2<String, String>>) p -> p
.getInstance()
.stream()
.map(i -> new Tuple2<>(p.getId(), i.getHostedby().getKey()))
.collect(Collectors.toList())
.iterator(),
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
Dataset<Row> journalIds = spark
.read()
.schema(Encoders.bean(Datasource.class).schema())
.json(inputPath + "/datasource")
.filter(
"datainfo.deletedbyinference != true and " +
"eoscdatasourcetype.classid == 'Journal archive' ")
.select("id");
Dataset<Row> journalHostedByPerResult = resultHostedBy
.join(
journalIds,
resultHostedBy.col("_2").equalTo(journalIds.col("id")), "leftsemi")
.selectExpr("_1 as id", "_2 as journalHostedBy");
Dataset<Publication> results = Utils.readPath(spark, inputPath + e.name(), Publication.class);
results
.joinWith(
journalHostedByPerResult, results
.col("id")
.equalTo(journalHostedByPerResult.col("id")),
"left")
.groupByKey(
(MapFunction<Tuple2<Publication, Row>, String>) t2 -> t2._1().getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<Publication, Row>, ResearchProduct>) (k, v) -> {
ArrayList<String> journalHbIds = new ArrayList<>();
Tuple2<Publication, Row> first = v.next();
if (Optional.ofNullable(first._2()).isPresent())
journalHbIds.add(first._2().getAs("journalHostedBy"));
v.forEachRemaining(value -> journalHbIds.add(value._2().getAs("journalHostedBy")));
Publication p = first._1();
ResearchProduct rp = ResultMapper.map(p);
rp
.setManifestations(
p
.getInstance()
.stream()
.map(i -> getManifestation(i, journalHbIds, p, masterDuplicateList))
.collect(Collectors.toList()));
return rp;
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "products" + e.name() + "/temp_researchProduct");
}
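/**
 * Builds a Manifestation from an Instance: local type, peer review status, access right, licence,
 * first url and pid, publishing date and, for publications hosted by a journal archive, the biblio
 * and venue. Graph datasource ids (hostedby or collectedfrom) are mapped to EOSC ids when a
 * matching MasterDuplicate entry exists.
 */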
@NotNull
private static <R extends Result> Manifestation getManifestation(Instance i, ArrayList<String> journalHbIds, R p,
List<MasterDuplicate> eoscDatasourceIdMap) {
Manifestation m = new Manifestation();
m.setProduct_local_type(i.getInstancetype().getClassname());
m.setProduct_local_type_schema(i.getInstancetype().getSchemename());
m.setPeer_review(getPeerReviewd(i));
m.setAccess_right(getAccessRigth(i));
m
.setLicence(
getLicence(i));
if (Optional.ofNullable(i.getUrl()).isPresent() && i.getUrl().size() > 0)
m.setUrl(i.getUrl().get(0));
else
m.setUrl(null);
if (Optional.ofNullable(i.getPid()).isPresent() && i.getPid().size() > 0) {
m.setPid(i.getPid().get(0).getValue());
}
if (Optional.ofNullable(i.getDateofacceptance()).isPresent())
m
.setDates(
Arrays
.asList(
Dates.newInstance(i.getDateofacceptance().getValue(), "publishing")));
if (p instanceof Publication) {
if (journalHbIds.contains(i.getHostedby().getKey())
&& Optional.ofNullable(((Publication) p).getJournal()).isPresent()) {
Biblio biblio = getBiblio(((Publication) p).getJournal());
if (Optional.ofNullable(p.getPublisher()).isPresent())
biblio.setPublisher(p.getPublisher().getValue());
m.setBiblio(biblio);
if (Optional.ofNullable(((Publication) p).getJournal().getIssnPrinted()).isPresent())
m
.setVenue(
MinVenue
.newInstance(
Utils
.getIdentifier(Prefixes.VENUE, ((Publication) p).getJournal().getIssnPrinted()),
i.getHostedby().getValue()));
else if (Optional.ofNullable(((Publication) p).getJournal().getIssnOnline()).isPresent())
m
.setVenue(
MinVenue
.newInstance(
Utils.getIdentifier(Prefixes.VENUE, ((Publication) p).getJournal().getIssnOnline()),
i.getHostedby().getValue()));
}
}
List<MasterDuplicate> eoscDsIds = eoscDatasourceIdMap
.stream()
.filter(
dm -> dm
.getGraphId()
.equals(i.getHostedby().getKey()) ||
dm
.getGraphId()
.equals(i.getCollectedfrom().getKey()))
.collect(Collectors.toList());
if (eoscDsIds.size() > 0) {
m
.setEoscId(
eoscDsIds
.stream()
.map(dm -> dm.getEoscId())
.collect(Collectors.toList()));
}
m
.setHosting_datasource(
MinVenue
.newInstance(
// Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()),
i.getHostedby().getKey(),
i.getHostedby().getValue()));
return m;
}
private static Biblio getBiblio(Journal epm) {
Biblio biblio = new Biblio();
if (Optional.ofNullable(epm.getEdition()).isPresent())
biblio.setEdition(epm.getEdition());
if (Optional.ofNullable(epm.getIss()).isPresent())
biblio.setIssue(epm.getIss());
if (Optional.ofNullable(epm.getVol()).isPresent())
biblio.setVolume(epm.getVol());
if (Optional.ofNullable(epm.getEp()).isPresent())
biblio.setEnd_page(epm.getEp());
if (Optional.ofNullable(epm.getSp()).isPresent())
biblio.setStart_page(epm.getSp());
return biblio;
}
@Nullable
private static String getLicence(Instance i) {
return Optional
.ofNullable(i.getLicense())
.map(value -> value.getValue())
.orElse(null);
}
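// Maps the OpenAIRE access right classid to the SKG-IF AccessRight vocabulary (default: unavailable).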
private static String getAccessRigth(Instance i) {
if (Optional.ofNullable(i.getAccessright()).isPresent())
switch (i.getAccessright().getClassid()) {
case "OPEN":
case "OPEN DATA":
case "OPEN SOURCE":
return AccessRight.OPEN.label;
case "CLOSED":
return AccessRight.CLOSED.label;
case "RESTRICTED":
return AccessRight.RESTRICTED.label;
case "EMBARGO":
case "12MONTHS":
case "6MONTHS":
return AccessRight.EMBARGO.label;
default:
return AccessRight.UNAVAILABLE.label;
}
return AccessRight.UNAVAILABLE.label;
}
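// Maps the refereed classid (0000/0001/0002) to the SKG-IF PeerReview labels (default: unavailable).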
private static String getPeerReviewd(Instance i) {
if (Optional.ofNullable(i.getRefereed()).isPresent())
switch (i.getRefereed().getClassid()) {
case "0000":
return PeerReview.UNAVAILABLE.label;
case "0001":
return PeerReview.PEER_REVIEWED.label;
case "0002":
return PeerReview.NON_PEER_REVIEWED.label;
}
return PeerReview.UNAVAILABLE.label;
}
}

View File

@ -0,0 +1,684 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.graph.dump.skgif.ResultMapper.map;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import javax.xml.crypto.Data;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.*;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import eu.dnetlib.dhp.skgif.model.Organization;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 06/02/24
*/
public class DumpResult implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpResult.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpResult.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/dump_result_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, workingDir + "aggrelation");
mapResult(spark, inputPath, workingDir, outputPath);
});
}
// for each result emit the id + the journal (if any) + the instance + the hostedby of the instance
public static <R extends Result> void mapResult(SparkSession spark, String inputPath,
String workingDir, String outputPath) {
// merge of relations and manifestation for the same result
getRelationAndManifestation(spark, workingDir, inputPath);
// dump of the result and enrichment with relevant information for relations and manifestations
dumpResult(spark, inputPath, workingDir, outputPath);
}
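/**
 * For each result type, groups the emitted manifestations per result (attaching biblio and venue
 * when the hosting datasource is a journal archive) and then extends the partial research product
 * with relevant organizations, funding and related products through a chain of left joins
 * materialised in temporary folders.
 */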
private static void getRelationAndManifestation(SparkSession spark, String workingDir, String inputPath) {
// Dataset<RelationPerProduct> aggRelations = Utils
// .readPath(spark, workingDir + "aggrelation", RelationPerProduct.class);
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add(
"eoscdatasourcetype", new StructType()
.add("classid", DataTypes.StringType))
.add("id", DataTypes.StringType)
;
ModelSupport.entityTypes
.keySet()
.stream()
.filter(ModelSupport::isResult)
.forEach(e -> {
Utils.removeOutputDir(spark, workingDir + e.name() + "/partialresearchproduct");
log.info("executing on {}", e.name());
Dataset<Row> datasource = spark
.read()
.schema(rp)
.json(inputPath + "/datasource")
.filter(("datainfo.deletedbyinference != true and eoscdatasourcetype.classid == 'Journal archive'"))
.drop("datainfo", "eoscdatasourcetype");
Dataset<Row> man = spark
.read()
.schema(Encoders.bean(EmitPerManifestation.class).schema())
.json(workingDir + e.name() + "/manifestation");
// Dataset<PartialResearchProduct> partialResearchProduct =
man
.joinWith(datasource, man.col("hostedby").equalTo(datasource.col("id")), "left")
.groupByKey(
(MapFunction<Tuple2<Row, Row>, String>) t2 -> t2._1().getAs("resultId"),
Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<Row, Row>, PartialResearchProduct>) (
k, v) -> {
PartialResearchProduct prp = new PartialResearchProduct();
prp.setResultId(k);
List<Manifestation> manifestationList = new ArrayList<>();
Tuple2<Row, Row> first = v.next();
manifestationList.add(getManifestation(first));
v.forEachRemaining(value -> manifestationList.add(getManifestation(value)));
prp.setManifestations(manifestationList);
return prp;
}, Encoders.bean(PartialResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/temp_partitalresearchproduct");
Dataset<PartialResearchProduct> partialResearchProduct = spark
.read()
.schema(Encoders.bean(PartialResearchProduct.class).schema())
.json(workingDir + e.name() + "/temp_partitalresearchproduct")
.as(Encoders.bean(PartialResearchProduct.class));
Dataset<ExtendingOrganization> affiliations = Utils
.readPath(
spark, workingDir + "relations/result_relevant_organizations", ExtendingOrganization.class);
// Dataset<PartialResearchProduct> prrWithAffiliation =
partialResearchProduct
.joinWith(
affiliations, partialResearchProduct.col("resultId").equalTo(affiliations.col("entityId")),
"left")
.map(
(MapFunction<Tuple2<PartialResearchProduct, ExtendingOrganization>, PartialResearchProduct>) t2 -> {
if (t2._2() == null)
return t2._1();
t2._1().setRelevant_organizations(t2._2().getRelevant_organization());
return t2._1();
}, Encoders.bean(PartialResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/temp_partitalresearchproductaff");
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_partitalresearchproduct");
Dataset<PartialResearchProduct> prrWithAffiliation = spark
.read()
.schema(Encoders.bean(PartialResearchProduct.class).schema())
.json(workingDir + e.name() + "/temp_partitalresearchproductaff")
.as(Encoders.bean(PartialResearchProduct.class));
Dataset<GrantRelation> grants = Utils
.readPath(spark, workingDir + "relations/funding", GrantRelation.class);
// Dataset<PartialResearchProduct> pprWitGrants =
prrWithAffiliation
.joinWith(grants, prrWithAffiliation.col("resultId").equalTo(grants.col("resultId")), "left")
.map((MapFunction<Tuple2<PartialResearchProduct, GrantRelation>, PartialResearchProduct>) t2 -> {
if (t2._2() == null)
return t2._1();
t2._1().setFunding(t2._2().getFunding());
return t2._1();
}, Encoders.bean(PartialResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/temp_partitalresearchproductgrant");
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_partitalresearchproductaff");
Dataset<PartialResearchProduct> pprWitGrants = spark
.read()
.schema(Encoders.bean(PartialResearchProduct.class).schema())
.json(workingDir + e.name() + "/temp_partitalresearchproductgrant")
.as(Encoders.bean(PartialResearchProduct.class));
Dataset<ProductsRelation> relatedResults = Utils
.readPath(spark, workingDir + "/relations/related_products", ProductsRelation.class);
pprWitGrants
.joinWith(
relatedResults, pprWitGrants.col("resultId").equalTo(relatedResults.col("resultId")),
"left")
.map(
(MapFunction<Tuple2<PartialResearchProduct, ProductsRelation>, PartialResearchProduct>) t2 -> {
if (t2._2() == null)
return t2._1();
t2._1().setRelated_products(t2._2().getRelated_products());
return t2._1();
}, Encoders.bean(PartialResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/partialresearchproduct");
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_partitalresearchproductgrant");
});
}
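/**
 * Builds a Manifestation from an EmitPerManifestation row; when the datasource side of the join is
 * present the instance is hosted by a journal archive and the biblio and venue are also set.
 */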
private static Manifestation getManifestation(Tuple2<Row, Row> t2) {
// if the datasource side of the join is present, biblio and venue are available
// otherwise only the remaining manifestation fields are set
Row epm = t2._1();
Manifestation manifestation = new Manifestation();
manifestation.setProduct_local_type(epm.getAs("product_local_type"));
manifestation.setProduct_local_type_schema(epm.getAs("product_local_type_schema"));
if (Optional.ofNullable(epm.getAs("publishing_date")).isPresent())
manifestation
.setDates(
Arrays
.asList(
Dates.newInstance(epm.getAs("publishing_date"), "publishing")));
manifestation.setPeer_review(epm.getAs("peer_reviewed"));
manifestation.setMetadata_curation("unavailable");
manifestation.setAccess_right(epm.getAs("access_right"));
manifestation.setLicence(epm.getAs("licence"));
manifestation.setUrl(epm.getAs("url"));
manifestation.setPid(epm.getAs("pid"));
if (Optional.ofNullable(t2._2()).isPresent()) {
Biblio biblio = getBiblio(epm);
// if (biblio == null)
// log.info("null biblio for {} ", epm.getAs("resultId"));
manifestation.setBiblio(biblio);
if (Optional.ofNullable(epm.getAs("journal")).isPresent() &&
Optional.ofNullable(epm.getAs("journal.issnPrinted")).isPresent())
manifestation
.setVenue(
MinVenue
.newInstance(
Utils.getIdentifier(Prefixes.VENUE, epm.getAs("journal.issnPrinted")),
epm.getAs("hostedbyvalue")));
else if (Optional.ofNullable(epm.getAs("journal")).isPresent() &&
Optional.ofNullable(epm.getAs("journal.issnOnline")).isPresent())
manifestation
.setVenue(
MinVenue
.newInstance(
Utils.getIdentifier(Prefixes.VENUE, epm.getAs("journal.issnOnline")),
epm.getAs("hostedbyvalue")));
}
manifestation
.setHosting_datasource(
MinVenue
.newInstance(
// Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()),
epm.getAs("hostedBy"),
epm.getAs("hostedbyvalue")));
return manifestation;
}
private static Biblio getBiblio(Row epm) {
Biblio biblio = new Biblio();
if (!Optional.ofNullable(epm.getAs("journal")).isPresent()) {
return null;
}
if (Optional.ofNullable(epm.getAs("journal.edition")).isPresent())
biblio.setEdition(epm.getAs("journal.edition"));
if (Optional.ofNullable(epm.getAs("journal.iss")).isPresent())
biblio.setIssue(epm.getAs("journal.iss"));
if (Optional.ofNullable(epm.getAs("publisher")).isPresent())
biblio.setPublisher(epm.getAs("publisher"));
if (Optional.ofNullable(epm.getAs("journal.vol")).isPresent())
biblio.setVolume(epm.getAs("journal.vol"));
if (Optional.ofNullable(epm.getAs("journal.ep")).isPresent())
biblio.setEnd_page(epm.getAs("journal.ep"));
if (Optional.ofNullable(epm.getAs("journal.sp")).isPresent())
biblio.setStart_page(epm.getAs("journal.sp"));
return biblio;
}
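/**
 * Joins each result with its PartialResearchProduct, copies relations and manifestations into the
 * mapped ResearchProduct, and finally unions the per-type outputs into the ResearchProduct dump.
 */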
private static <R extends Result> void dumpResult(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
ModelSupport.entityTypes
.keySet()
.parallelStream()
.filter(ModelSupport::isResult)
.forEach(e -> {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
Utils.removeOutputDir(spark, workingDir + e.name() + "/researchproduct");
Dataset<R> results = Utils.readPath(spark, inputPath + e.name(), resultClazz);
Dataset<PartialResearchProduct> prr = Utils
.readPath(spark, workingDir + e.name() + "/partialresearchproduct", PartialResearchProduct.class);
results
.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
.map((MapFunction<Tuple2<R, PartialResearchProduct>, ResearchProduct>) t2 -> {
ResearchProduct rp = map(t2._1());
if (Optional.ofNullable(t2._2()).isPresent()) {
if (Optional.ofNullable(t2._2().getRelated_products()).isPresent())
rp.setRelated_products(t2._2().getRelated_products());
if (Optional.ofNullable(t2._2().getFunding()).isPresent())
rp.setFunding(t2._2().getFunding());
if (Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent())
rp.setRelevant_organizations(t2._2().getRelevant_organizations());
if (Optional.ofNullable(t2._2().getManifestations()).isPresent())
rp.setManifestations(t2._2().getManifestations());
}
return rp;
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/researchproduct");
});
Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
for (EntityType e : ModelSupport.entityTypes.keySet()) {
if (ModelSupport.isResult(e))
researchProducts = researchProducts
.union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class));
}
researchProducts
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "ResearchProduct");
}
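/**
 * Builds the RelationPerProduct aggregation: only the funding relations are materialised here
 * (the other selections are commented out) and the partial aggregations written to
 * aggrelation_temp are merged by result id into the aggrelation dataset.
 */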
private static void selectRelations(SparkSession spark, String inputPath, String workingDir) {
List<String> relationsProducts = Arrays
.asList(
RelationType.CITATION.label,
// RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label,
RelationType.SUPPLEMENT.label,
// RelationType.RESULT_OUTCOME_FUNDING.label,
RelationType.DOCUMENTS.label,
RelationType.PART.label,
RelationType.VERSION.label);
Dataset<Row> relation = spark
.read()
.schema(Encoders.bean(Relation.class).schema())
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == 'hasAuthorInstitution")
.select("source", "target");
Dataset<Row> organization = spark
.read()
.schema(Encoders.bean(Organization.class).schema())
.json(inputPath + "organization")
.filter("datainfo.deletedbyinference != true")
.select("id", "pid", "legalname.value");
// result = spark.read().schema(Encoders.bean(Result.class).schema())
// .json(inputPath + )
// relationsProducts
// .stream()
// .forEach(r -> buildRelationPerProducts(spark, inputPath, workingDir, r));
// buildRelationPerAffiliation(
// spark, inputPath, workingDir, RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label);
buildRelationPerGrant(spark, inputPath, workingDir, RelationType.RESULT_OUTCOME_FUNDING.label);
RDD<RelationPerProduct> temp = spark
.read()
.schema(Encoders.bean(RelationPerProduct.class).schema())
.json(workingDir + "aggrelation_temp")
.as(Encoders.bean(RelationPerProduct.class))
.toJavaRDD()
.mapToPair(v -> new Tuple2<>(v.getResultId(), v))
.reduceByKey((a, b) -> {
mergeRelationPerProduct(a, b);
return a;
})
.map(v -> v._2())
.rdd();
spark
.createDataset(temp, Encoders.bean(RelationPerProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "/aggrelation");
}
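/**
 * Joins the funding relations with the minGrant dataset and groups the resulting MinGrant beans
 * per source result, appending them to the aggrelation_temp dataset.
 */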
private static void buildRelationPerGrant(SparkSession spark, String inputPath, String workingDir,
String relationType) {
log.info("Relation: {}", relationType);
final StructType relationstructureSchema = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
final StructType grantSchema = new StructType()
.add(
"local_identifier", DataTypes.StringType)
.add("funder", DataTypes.StringType)
.add("code", DataTypes.StringType)
.add("title", DataTypes.StringType)
;
Dataset<Row> relation = spark
.read()
.schema(relationstructureSchema)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relclass == '" + relationType + "'")
.drop("dataInfo");
Dataset<Row> minProduct = spark
.read()
.schema(grantSchema)
.json(workingDir + "minGrant");
relation
.joinWith(
minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
.selectExpr("_1.source as sourceResult", "_1.relClass as relClass", "_2.*")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("sourceResult"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, RelationPerProduct>) (k, it) -> {
RelationPerProduct rpp = new RelationPerProduct();
rpp.setResultId(k);
rpp.setRelatedProduct(new HashMap<>());
updateRelevantGrant(rpp, it.next());
it.forEachRemaining(r -> updateRelevantGrant(rpp, r));
return rpp;
}, Encoders.bean(RelationPerProduct.class))
// .show(false);
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(workingDir + "aggrelation_temp");
}
private static void updateRelevantGrant(RelationPerProduct rpp, Row next) {
if (!Optional.ofNullable(rpp.getFunding()).isPresent())
rpp.setFunding(new ArrayList<>());
MinGrant mo = new MinGrant();
mo.setLocal_identifier(next.getAs("local_identifier"));
mo.setTitle(next.getAs("title"));
mo.setFunder(next.getAs("funder"));
mo.setCode(next.getAs("code"));
rpp.getFunding().add(mo);
}
private static void buildRelationPerAffiliation(SparkSession spark, String inputPath, String workingDir,
String relationType) {
log.info("Relation: {}", relationType);
final StructType relationstructureSchema = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
final StructType orgSchema = new StructType()
.add(
"local_identifier", DataTypes.StringType)
.add("name", DataTypes.StringType)
.add("ror", DataTypes.StringType)
.add("isni", DataTypes.StringType)
.add("fundRef", DataTypes.StringType)
.add("rinGold", DataTypes.StringType)
.add("wikidata", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(relationstructureSchema)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relclass == '" + relationType + "'")
.drop("dataInfo");
Dataset<Row> minOrganization = spark
.read()
.schema(orgSchema)
.json(workingDir + "minOrganization");
relation
.joinWith(
minOrganization, relation.col("target").equalTo(minOrganization.col("local_identifier")))
.selectExpr("_1.source as sourceResult", "_1.relClass as relClass", "_2.*")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("sourceResult"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, RelationPerProduct>) (k, it) -> {
RelationPerProduct rpp = new RelationPerProduct();
rpp.setResultId(k);
rpp.setRelatedProduct(new HashMap<>());
updateRelevantOrganization(rpp, it.next());
it.forEachRemaining(r -> updateRelevantOrganization(rpp, r));
return rpp;
}, Encoders.bean(RelationPerProduct.class))
// .show(false);
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(workingDir + "aggrelation_temp");
}
private static void updateRelevantOrganization(RelationPerProduct rpp, Row next) {
if (!Optional.ofNullable(rpp.getOrganizations()).isPresent())
rpp.setOrganizations(new ArrayList<>());
MinOrganization mo = new MinOrganization();
mo.setLocal_identifier(next.getAs("local_identifier"));
mo.setIsni(next.getAs("isni"));
mo.setRor(next.getAs("ror"));
mo.setName(next.getAs("name"));
mo.setWikidata(next.getAs("wikidata"));
mo.setFundRef(next.getAs("fundRef"));
mo.setRinGold(next.getAs("rinGold"));
rpp.getOrganizations().add(mo);
}
private static void buildRelationPerProducts(SparkSession spark, String inputPath, String workingDir,
String relationType) {
log.info("Relation: {}", relationType);
final StructType relationstructureSchema = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
final StructType productSchema = new StructType()
.add(
"local_identifier", DataTypes.StringType)
.add("title", DataTypes.StringType)
.add("doi", DataTypes.StringType)
.add("pmcid", DataTypes.StringType)
.add("arxivid", DataTypes.StringType)
.add("pmid", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(relationstructureSchema)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relclass == '" + relationType + "'")
.drop("dataInfo");
Dataset<Row> minProduct = spark
.read()
.schema(productSchema)
.json(workingDir + "minProduct");
relation
.joinWith(
minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
.selectExpr("_1.source as sourceResult", "_1.relClass as relClass", "_2.*")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("sourceResult"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, RelationPerProduct>) (k, it) -> {
RelationPerProduct rpp = new RelationPerProduct();
rpp.setResultId(k);
rpp.setRelatedProduct(new HashMap<>());
updateRelatedProduct(rpp, it.next());
it.forEachRemaining(r -> updateRelatedProduct(rpp, r));
return rpp;
}, Encoders.bean(RelationPerProduct.class))
// .show(false);
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(workingDir + "aggrelation_temp");
// .map((MapFunction<Tuple2<Row, EncloseMinElement>, RelationPerProduct>) t2 -> {
// RelationPerProduct rpp = new RelationPerProduct();
// t2._2().setResultId(t2._1().getAs("source"));
// t2._2().setSemantics(t2._1().getAs("relClass"));
// insertEnclosedElement(rpp, t2._2());
// rpp.setResultId(t2._1().getAs("source"));
// return rpp;
// }, Encoders.bean(RelationPerProduct.class))
// .filter(Objects::nonNull)
// .toJavaRDD()
// .mapToPair(value -> new Tuple2<>(value.getResultId(), value))
// .reduceByKey((a, b) -> {
// mergeRelationPerProduct(a, b);
// return a;
// })
//
// .map(value -> value._2)
// .rdd(),
// Encoders.bean(RelationPerProduct.class))
//// .saveAsTextFile(workingDir + "/aggrelation", GzipCodec.class);
//// .groupByKey((MapFunction<EncloseMinElement, String>) eme -> eme.getResultId(), Encoders.STRING())
//// .mapGroups((MapGroupsFunction<String, EncloseMinElement, RelationPerProduct>) (k, v) -> {
//// RelationPerProduct rpp = new RelationPerProduct();
//// rpp.setResultId(k);
//// insertEnclosedElement(rpp, v.next());
//// v.forEachRemaining(e -> insertEnclosedElement(rpp, e));
//// return rpp;
//// }, Encoders.bean(RelationPerProduct.class))
// .write()
// .mode(SaveMode.Append)
// .option("compression", "gzip")
// .json(workingDir + "/aggrelation_temp");
}
private static void updateRelatedProduct(RelationPerProduct rpp, Row next) {
String key = next.getAs("relClass");
if (!rpp.getRelatedProduct().containsKey(key))
rpp.getRelatedProduct().put(key, new ArrayList<>());
MinProduct mp = new MinProduct();
mp.setLocal_identifier(next.getAs("local_identifier"));
mp.setTitle(next.getAs("title"));
mp.setPmid(next.getAs("pmid"));
mp.setArxivid(next.getAs("arxivid"));
mp.setPmcid(next.getAs("pmcid"));
mp.setDoi(next.getAs("doi"));
rpp.getRelatedProduct().get(key).add(mp);
}
private static void insertEnclosedElement(RelationPerProduct rpp, EncloseMinElement element) {
if (Optional.ofNullable(element.getMinOrganization()).isPresent())
rpp.getOrganizations().add(element.getMinOrganization());
if (Optional.ofNullable(element.getMinGrant()).isPresent())
rpp.getFunding().add(element.getMinGrant());
if (Optional.ofNullable(element.getMinProduct()).isPresent()) {
String sem = element.getSemantics();
if (!rpp.getRelatedProduct().containsKey(sem))
rpp.getRelatedProduct().put(sem, new ArrayList<>());
rpp.getRelatedProduct().get(sem).add(element.getMinProduct());
}
}
private static void mergeRelationPerProduct(RelationPerProduct rpp1, RelationPerProduct rpp2) {
if (Optional.ofNullable(rpp2.getOrganizations()).isPresent())
rpp1.getOrganizations().addAll(rpp2.getOrganizations());
if (Optional.ofNullable(rpp2.getFunding()).isPresent())
rpp1.getFunding().addAll(rpp2.getFunding());
if (Optional.ofNullable(rpp2.getRelatedProduct()).isPresent()) {
Map<String, List<MinProduct>> temp = rpp2.getRelatedProduct();
for (String key : temp.keySet()) {
if (!rpp1.getRelatedProduct().containsKey(key))
rpp1.getRelatedProduct().put(key, new ArrayList<>());
for (MinProduct mp : rpp2.getRelatedProduct().get(key))
rpp1.getRelatedProduct().get(key).add(mp);
}
}
}
}

View File

@ -0,0 +1,184 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 21/02/24
*/
public class DumpVenue implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpVenue.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpVenue.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/dump_datasource_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "venues");
mapVenue(spark, inputPath, outputPath, workingDir);
});
}
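/**
 * Builds a Venue for every 'Journal archive' datasource: the local identifier is derived from the
 * printed (or online) ISSN, the publisher is taken from the datasourcePublisher dataset and the
 * venues are finally deduplicated by local_identifier.
 */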
private static void mapVenue(SparkSession spark, String inputPath, String outputPath, String workingDir) {
StructType tp = StructType.fromDDL("`hostedby` STRING, `publisher` STRING");
Dataset<Row> journalIdsDataset = spark.read().schema(tp).json(workingDir + "datasourcePublisher");
Dataset<Datasource> datasourceDataset;
datasourceDataset = spark
.read()
.schema(Encoders.bean(Datasource.class).schema())
.json(inputPath + "datasource")
.filter("datainfo.deletedbyinference != true and eoscdatasourcetype.classid == 'Journal archive' ")
.as(Encoders.bean(Datasource.class));
datasourceDataset
.joinWith(
journalIdsDataset, datasourceDataset.col("id").equalTo(journalIdsDataset.col("hostedby")),
"left")
.map((MapFunction<Tuple2<Datasource, Row>, Venue>) t2 -> {
if (!Optional.ofNullable(t2._1().getJournal()).isPresent())
return null;
Venue venue = new Venue();
Datasource d = t2._1();
if (Optional.ofNullable(d.getJournal()).isPresent()
&& Optional.ofNullable(d.getJournal().getIssnPrinted()).isPresent())
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnPrinted()));
else if (Optional.ofNullable(d.getJournal()).isPresent()
&& Optional.ofNullable(d.getJournal().getIssnOnline()).isPresent())
venue.setLocal_identifier(Utils.getIdentifier(Prefixes.VENUE, d.getJournal().getIssnOnline()));
venue.setIdentifiers(getVenueIdentifier(d.getJournal()));
venue.setName(d.getOfficialname().getValue());
venue.setType(VenueType.JOURNAL.label);
if (Optional.ofNullable(t2._2()).isPresent())
venue.setPublisher(t2._2().getAs("publisher"));
venue.setAcronym(null);
venue.setSeries(null);
venue.setIs_currently_full_oa(null);
venue.setCreation_date(null);
venue.setContributions(null);
return venue;
}, Encoders.bean(Venue.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "venues");
Utils
.readPath(spark, workingDir + "venues", Venue.class)
.groupByKey((MapFunction<Venue, String>) v -> v.getLocal_identifier(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Venue, Venue>) (k, v) -> v.next(), Encoders.bean(Venue.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "venues");
}
private static List<Identifier> getVenueIdentifier(Journal journal) {
List<Identifier> identifiers = new ArrayList<>();
if (Optional.ofNullable((journal.getIssnOnline())).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.EISSN.label, journal.getIssnOnline()));
if (Optional.ofNullable(journal.getIssnPrinted()).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.ISSN.label, journal.getIssnPrinted()));
if (Optional.ofNullable(journal.getIssnLinking()).isPresent())
identifiers.add(Identifier.newInstance(VenueIdentifierType.LISSN.label, journal.getIssnLinking()));
return identifiers;
}
private static List<String> getResearchProductAccessPolicy(List<String> value) {
return value
.stream()
.map(v -> getResearchProductAccessPolicy(v))
.filter(Objects::nonNull)
.map(v -> v.get(0))
.distinct()
.collect(Collectors.toList());
}
private static List<String> getResearchProductAccessPolicy(String value) {
// "databaseaccesstype if open => open access (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
// if restricted => restricted access (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
// if closed => metadata only access (https://vocabularies.coar-repositories.org/access_rights/c_14cb/) "
switch (value) {
case "open":// (https://vocabularies.coar-repositories.org/access_rights/c_abf2/)
return Arrays.asList("open access");
case "restricted":// (https://vocabularies.coar-repositories.org/access_rights/c_16ec/)
return Arrays.asList("restricted access");
case "closed":// (https://vocabularies.coar-repositories.org/access_rights/c_14cb/)
return Arrays.asList("metadata only access");
default:
return null;
}
}
private static List<String> getEoscProductType(List<String> researchentitytypes) {
List<String> eoscProductType = new ArrayList<>();
if (researchentitytypes != null) {
if (researchentitytypes.contains("Software"))
eoscProductType.add("Research Software");
if (researchentitytypes.contains("Research Publications") || researchentitytypes.contains("Literature"))
eoscProductType.add("Research Literature");
if (researchentitytypes.contains("Research Data"))
eoscProductType.add("Research Data");
if (researchentitytypes.contains("Organization") ||
researchentitytypes.contains("Organizations") ||
researchentitytypes.contains("Services") ||
researchentitytypes.contains("Projects"))
eoscProductType.add("Other research product");
}
return eoscProductType;
}
}

View File

@ -0,0 +1,249 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 06/02/24
*/
public class EmitFromEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(EmitFromEntities.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
EmitFromEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/emit_biblio_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
emitFromResult(spark, inputPath, outputPath, workingDir);
});
}
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
String workingDir) {
emitPerson(spark, inputPath, outputPath, workingDir);
emitTopic(spark, inputPath, outputPath, workingDir);
emitDatasourcePublisher(spark, inputPath, workingDir);
}
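/**
 * Emits the (hostedby, publisher) pairs derived from publications having both a journal and a
 * publisher, keeping only the datasources typed as 'Journal archive'.
 */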
private static void emitDatasourcePublisher(SparkSession spark, String inputPath, String workingDir) {
Dataset<Row> journalIds = spark
.read()
.schema(Encoders.bean(Datasource.class).schema())
.json((inputPath + "datasource"))
.filter(
"datainfo.deletedbyinference !=true and " +
"eoscdatasourcetype.classid == 'Journal archive' ")
.select("id");
Dataset<Publication> result = spark
.read()
.schema(Encoders.bean(Publication.class).schema())
.json(inputPath + "publication")
.filter("datainfo.deletedbyinference != true ")
.as(Encoders.bean(Publication.class));
Dataset<Row> datasourcePublisher = result.flatMap((FlatMapFunction<Publication, Tuple2<String, String>>) r -> {
ArrayList<Tuple2<String, String>> dsPub = new ArrayList<>();
if (Optional.ofNullable(r.getJournal()).isPresent() &&
Optional.ofNullable(r.getPublisher()).isPresent()) {
for (Instance i : r.getInstance())
dsPub.add(new Tuple2<>(i.getHostedby().getKey(), r.getPublisher().getValue()));
}
return dsPub.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.selectExpr("_1 as hostedby", "_2 as publisher");
datasourcePublisher
.join(journalIds, datasourcePublisher.col("hostedby").equalTo(journalIds.col("id")), "leftsemi")
.distinct()
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "/datasourcePublisher");
}
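/**
 * Emits a Topic record for every FOS or SDG subject of every result and deduplicates the topics
 * by local_identifier before writing them to the topics dump.
 */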
private static <R extends Result> void emitTopic(SparkSession spark, String inputPath, String outputPath,
String workingDir) {
ModelSupport.entityTypes.keySet().forEach(e -> {
if (ModelSupport.isResult(e)) {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
Utils
.readPath(spark, inputPath + e.name(), resultClazz)
.filter((FilterFunction<R>) r -> Optional.ofNullable(r.getSubject()).isPresent())
.flatMap(
(FlatMapFunction<R, Topic>) r -> r
.getSubject()
.stream()
.filter(
s -> s.getQualifier().getClassid().equalsIgnoreCase("fos")
|| s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
.map(s -> {
Topic t = new Topic();
t
.setLocal_identifier(
Utils
.getIdentifier(
Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
t
.setIdentifiers(
Arrays
.asList(
Identifier.newInstance(s.getQualifier().getClassid(), s.getValue())));
t.setName(s.getValue());
return t;
})
.collect(Collectors.toList())
.iterator(),
Encoders.bean(Topic.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/topic");
}
});
Dataset<Topic> topics = spark.emptyDataset(Encoders.bean(Topic.class));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
if (ModelSupport.isResult(entityType))
topics = topics.union(Utils.readPath(spark, workingDir + entityType.name() + "/topic", Topic.class));
}
topics
.groupByKey((MapFunction<Topic, String>) p -> p.getLocal_identifier(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Topic, Topic>) (k, v) -> v.next(), Encoders.bean(Topic.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/topics");
}
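/**
 * Emits a Persons record for every author of every result: the identifier is built from the ORCID
 * when available, otherwise from the result id and the author rank (or position), and the records
 * are deduplicated by local_identifier.
 */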
private static <R extends Result> void emitPerson(SparkSession spark, String inputPath, String outputPath,
String workingDir) {
ModelSupport.entityTypes.keySet().forEach(e -> {
if (ModelSupport.isResult(e)) {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
Utils
.readPath(spark, inputPath + e.name(), resultClazz)
.flatMap((FlatMapFunction<R, Persons>) r -> {
List<Persons> authors = new ArrayList<>();
if (Optional.ofNullable(r.getAuthor()).isPresent() && r.getAuthor().size() > 0) {
int count = 0;
for (Author a : r.getAuthor()) {
count += 1;
Persons p = new Persons();
p.setFamily_name(a.getSurname());
p.setGiven_name(a.getName());
p.setFullname(a.getFullname());
String identifier = "";
if (Optional.ofNullable(a.getPid()).isPresent()) {
Tuple2<String, Boolean> orcid = eu.dnetlib.dhp.oa.graph.dump.skgif.Utils
.getOrcid(a.getPid());
if (orcid != null) {
identifier = Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2());
if (orcid._2())
p
.setIdentifiers(
Arrays.asList(Identifier.newInstance("orcid", orcid._1())));
else
p
.setIdentifiers(
Arrays
.asList(Identifier.newInstance("inferred_orcid", orcid._1())));
} else {
if (Optional.ofNullable(a.getRank()).isPresent()) {
identifier = Utils
.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + a.getRank());
} else {
identifier = Utils
.getIdentifier(Prefixes.TEMPORARY_PERSON, r.getId() + count);
}
}
}
p.setLocal_identifier(identifier);
authors.add(p);
}
}
return authors.iterator();
}, Encoders.bean(Persons.class))
.filter((FilterFunction<Persons>) p -> p != null)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/person");
}
});
Dataset<Persons> persons = spark.emptyDataset(Encoders.bean(Persons.class));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
if (ModelSupport.isResult(entityType))
persons = persons
.union(Utils.readPath(spark, workingDir + entityType.name() + "/person", Persons.class));
}
persons
.groupByKey((MapFunction<Persons, String>) p -> p.getLocal_identifier(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Persons, Persons>) (k, v) -> v.next(), Encoders.bean(Persons.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/persons");
}
}

View File

@ -0,0 +1,194 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoAllowedTypeException;
import eu.dnetlib.dhp.oa.graph.dump.skgif.exception.NoTitleFoundException;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 01/09/23
*/
public class ResultMapper implements Serializable {
public static <E extends Result> ResearchProduct map(
E input)
throws Exception {
ResearchProduct out = new ResearchProduct();
Optional<Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
try {
// out.setLocal_identifier(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, input.getId()));
out.setLocal_identifier(input.getId());
mapPid(out, input);
mapTitle(out, input);
mapAbstract(out, input);
mapType(out, input);
mapTopic(out, input);
mapContribution(out, input);
//The manifestation will be included extending the result as well as the relations to funder, organization and other results
return out;
} catch (ClassCastException cce) {
return null;
}
}
return null;
}
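/**
 * Builds one Contribution per author, pointing to a MinPerson identified through the ORCID when
 * available or through a temporary person identifier otherwise; the author rank is preserved.
 */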
private static <E extends Result> void mapContribution(ResearchProduct out, E input) {
if (Optional.ofNullable(input.getAuthor()).isPresent()) {
int count = 0;
List<Contribution> contributionList = new ArrayList<>();
for (Author a : input.getAuthor()) {
count += 1;
Contribution contribution = new Contribution();
Tuple2<String, Boolean> orcid = Utils.getOrcid(a.getPid());
MinPerson minPerson = new MinPerson();
minPerson.setFull_name(a.getFullname());
if (orcid != null) {
minPerson.setLocal_identifier(Utils.getIdentifier(Prefixes.PERSON, orcid._1() + orcid._2()));
minPerson.setOrcid(orcid._1());
contribution.setPerson(minPerson);
} else {
if (Optional.ofNullable(a.getRank()).isPresent()) {
minPerson
.setLocal_identifier(
Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + a.getRank()));
contribution
.setPerson(minPerson);
} else {
minPerson
.setLocal_identifier(Utils.getIdentifier(Prefixes.TEMPORARY_PERSON, input.getId() + count));
contribution.setPerson(minPerson);
}
}
if (Optional.ofNullable(a.getRank()).isPresent()) {
contribution.setRank(a.getRank());
}
contributionList.add(contribution);
}
out.setContributions(contributionList);
}
}
private static <E extends Result> void mapTopic(ResearchProduct out, E input) {
if (Optional.ofNullable(input.getSubject()).isPresent()) {
out
.setTopics(
input
.getSubject()
.stream()
.filter(
s -> s.getQualifier().getClassid().equalsIgnoreCase("fos"))
// ||
// s.getQualifier().getClassid().equalsIgnoreCase("sdg"))
.map(s -> {
ResultTopic topic = new ResultTopic();
MinTopic minTopic = new MinTopic();
minTopic
.setLocal_identifier(
Utils.getIdentifier(Prefixes.TOPIC, s.getQualifier().getClassid() + s.getValue()));
minTopic.setValue(s.getValue());
topic
.setTopic(minTopic);
if (Optional.ofNullable(s.getDataInfo()).isPresent()) {
Provenance provenance = new Provenance();
try {
provenance.setTrust(Double.valueOf(s.getDataInfo().getTrust()));
} catch (NumberFormatException nfe) {
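// the trust value is not a parsable number: leave the provenance trust unset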
}
provenance.setType(s.getDataInfo().getInferenceprovenance());
topic.setProvenance(provenance);
}
return topic;
})
.collect(Collectors.toList()));
}
}
private static <E extends Result> void mapType(ResearchProduct out, E input) throws NoAllowedTypeException {
switch (input.getResulttype().getClassid()) {
case "publication":
out.setProduct_type(ResearchTypes.LITERATURE.label);
break;
case "dataset":
out.setProduct_type(ResearchTypes.RESEARCH_DATA.label);
break;
case "software":
out.setProduct_type(ResearchTypes.RESEARCH_SOFTWARE.label);
break;
case "other":
out.setProduct_type(ResearchTypes.OTHER.label);
break;
default:
throw new ClassCastException("Result type not present or not allowed");
}
}
private static void mapPid(ResearchProduct out, Result input) {
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> out
.setIdentifiers(
value
.stream()
.map(
p -> {
Identifier identifier = new Identifier();
identifier.setValue(p.getValue());
identifier.setScheme(p.getQualifier().getClassid());
return identifier;
})
.collect(Collectors.toList())));
}
private static void mapTitle(ResearchProduct out, Result input) throws NoTitleFoundException {
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
return;
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (!iTitle.isEmpty()) {
out.setTitles(Collections.singletonMap("none", Arrays.asList(iTitle.get(0).getValue())));
}
}
}
private static void mapAbstract(ResearchProduct out, Result input) {
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setAbstracts(Collections.singletonMap("none", descriptionList));
}
}

View File

@ -0,0 +1,516 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ProductsRelation;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple5;
/**
* @author miriam.baglioni
* @Date 16/03/24
*/
public class SelectRelation implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SelectRelation.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpResult.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/select_relation_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relationPath = parser.get("relationPath");
log.info("relationPath: {}", relationPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, workingDir + "aggrelation");
// selectAffiliationRelations(spark, inputPath, workingDir, outputPath);
createOrganizationExtention(
spark, inputPath, RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label,
workingDir + "relations/result_relevant_organizations", relationPath);
selectFundingRelations(spark, inputPath, workingDir, relationPath);
selectProductRelation(spark, inputPath, workingDir, relationPath);
// selectDatasourceOrganizationRelation(spark, inputPath, workingDir, outputPath);
createOrganizationExtention(
spark, inputPath, RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label,
workingDir + "relations/datasource_providing_organization", relationPath);
createOrganizationExtention(
spark, inputPath, RelationType.PROJECT_HAS_PARTICIPANT_ORGANIZATION.label,
workingDir + "relations/project_partecipating_organization", relationPath);
});
}
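/**
 * Selects the relations having the given semantics, joins them with the MinOrganization dataset
 * and groups the organizations per source entity into ExtendingOrganization records written at outputPath.
 */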
private static void createOrganizationExtention(SparkSession spark, String inputPath, String relationSem,
String outputPath, String relationPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(relationPath)
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + relationSem + "'")
.drop("datainfo", "relClass");
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
relation
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
ExtendingOrganization ar = new ExtendingOrganization();
ar.setEntityId(k);
addRelevantOrganization(ar, v);
return ar;
}, Encoders.bean(ExtendingOrganization.class))
// .show(false);
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static void selectDatasourceOrganizationRelation(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION + "'")
.drop("datainfo", "relClass");
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
relation
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
ExtendingOrganization ar = new ExtendingOrganization();
ar.setEntityId(k);
addRelevantOrganization(ar, v);
return ar;
}, Encoders.bean(ExtendingOrganization.class))
// .show(false);
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json("/tmp/miriam/prova/providingOrganization");
}
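/**
 * Selects the result-to-result relations (cites, documents, is part of, supplement, version),
 * builds a MinProduct (title plus doi/pmid/pmcid/arXiv pids) for each target read from the
 * publication dump and groups the related products per source result and relation semantics.
 */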
private static void selectProductRelation(SparkSession spark, String inputPath, String workingDir,
String relationPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(relationPath)
.filter("datainfo.deletedbyinference != true")
.filter(
"relClass == '" + RelationType.DOCUMENTS.label + "' or " +
"relClass == '" + RelationType.CITATION.label + "' or " +
"relClass == '" + RelationType.PART.label + "' or " +
"relClass == '" + RelationType.SUPPLEMENT.label + "' or " +
"relClass == '" + RelationType.VERSION.label + "'")
.drop("datainfo");
Dataset<Row> result = spark
.read()
.schema(Encoders.bean(Result.class).schema())
.json(inputPath + "publication")
.filter(
"datainfo.deletedbyinference != true and " +
"datainfo.invisible != true")
.selectExpr("id", "title[0].value as title", "pid");
result.createOrReplaceTempView("res");
String query = "select id, pide.qualifier.classid as schema, pide.value as pid, title " +
"from res " +
"lateral view explode (pid) p as pide ";
Dataset<MinProduct> minProduct = spark
.sql(query)
// .show(false);
.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, MinProduct>) (k, v) -> {
MinProduct mp = new MinProduct();
mp.setLocal_identifier(k);
Row r = v.next();
mp.setTitle(r.getAs("title"));
addProductPid(mp, r);
v.forEachRemaining(row -> addProductPid(mp, row));
return mp;
}, Encoders.bean(MinProduct.class));
relation
.join(minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
.selectExpr("source", "local_identifier", "title", "doi", "pmcid", "pmid", "arxivid", "relClass as sem")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ProductsRelation>) (k, v) -> {
ProductsRelation pr = new ProductsRelation();
pr.setResultId(k);
addResulRelations(pr, v);
return pr;
}, Encoders.bean(ProductsRelation.class))
// .show(false);
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "relations/related_products");
}
private static void addResulRelations(ProductsRelation pr, Iterator<Row> v) {
pr.setRelated_products(new ArrayList<>());
Map<String, ArrayList<MinProduct>> hashMap = new HashMap<>();
while (v.hasNext()) {
Row next = v.next();
String sem = next.getAs("sem");
if (!hashMap.containsKey(sem))
hashMap.put(sem, new ArrayList<>());
hashMap.get(sem).add(getMinProduct(next));
}
hashMap
.keySet()
.stream()
.forEach(key -> pr.getRelated_products().add(Relations.newInstance(key, hashMap.get(key))));
}
private static MinProduct getMinProduct(Row next) {
MinProduct mp = new MinProduct();
mp.setLocal_identifier(next.getAs("local_identifier"));
if (Optional.ofNullable(next.getAs("doi")).isPresent())
mp.setDoi(next.getAs("doi"));
if (Optional.ofNullable(next.getAs("pmid")).isPresent())
mp.setPmid(next.getAs("pmid"));
if (Optional.ofNullable(next.getAs("pmcid")).isPresent())
mp.setPmcid(next.getAs("pmcid"));
if (Optional.ofNullable(next.getAs("arxivid")).isPresent())
mp.setArxivid(next.getAs("arxivid"));
return mp;
}
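// Copies the pid value into the MinProduct field matching its schema (doi, pmcid, pmid, arXiv); other schemas are ignored.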
private static void addProductPid(MinProduct mp, Row next) {
String schema = next.getAs("schema");
if (Optional.ofNullable(schema).isPresent()) {
switch (schema) {
case "doi":
mp.setDoi(next.getAs("pid"));
break;
case "pmcid":
mp.setPmcid(next.getAs("pid"));
break;
case "pmid":
mp.setPmid(next.getAs("pid"));
break;
case "arXiv":
mp.setArxivid(next.getAs("pid"));
break;
}
}
}
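/**
 * Selects the result-outcome-project relations restricted to projects funded by the European Commission and
 * materializes, for each result, the list of MinGrants (code, funder, acronym or title). The output is written,
 * gzip-compressed, under workingDir + "relations/funding"; as a sketch (assuming the same workingDir), it could be
 * read back as beans with Utils.readPath(spark, workingDir + "relations/funding", GrantRelation.class).
 */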
private static void selectFundingRelations(SparkSession spark, String inputPath, String workingDir,
String relationPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(relationPath)
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + RelationType.RESULT_OUTCOME_FUNDING.label + "'")
.drop("datainfo", "relClass");
Dataset<Row> projects = Utils
.readPath(spark, inputPath + "project", Project.class)
.filter(
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
p.getFundingtree().size() > 0
&&
Utils
.getFunderName(p.getFundingtree().get(0).getValue())
.equalsIgnoreCase("European Commission"))
.map((MapFunction<Project, Tuple5<String, String, String, String, String>>) p -> {
String id = p.getId();
String acronym = "";
if (Optional.ofNullable(p.getAcronym()).isPresent())
acronym = p.getAcronym().getValue();
String title = "";
if (Optional.ofNullable(p.getTitle()).isPresent())
title = p.getTitle().getValue();
String funder = Utils.getFunderName(p.getFundingtree().get(0).getValue());
String code = p.getCode().getValue();
return new Tuple5<>(id, acronym, title, funder, code);
}, Encoders
.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING()))
.selectExpr("_1 as id", "_2 as acronym", "_3 as title", "_4 as funder", "_5 as code");
relation
.join(projects, relation.col("target").equalTo(projects.col("id")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, GrantRelation>) (k, v) -> {
GrantRelation gr = new GrantRelation();
gr.setResultId(k);
addFunding(gr, v);
return gr;
}, Encoders.bean(GrantRelation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "relations/funding");
}
private static void addFunding(GrantRelation gr, Iterator<Row> v) {
gr.setFunding(new ArrayList<>());
while (v.hasNext()) {
gr.getFunding().add(getMinGrant(v.next()));
}
}
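// Builds a MinGrant from the joined project row, using the acronym as title when available, the title otherwise.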
private static MinGrant getMinGrant(Row next) {
MinGrant mn = new MinGrant();
mn.setCode(next.getAs("code"));
mn.setLocal_identifier(next.getAs("id"));
mn.setFunder(next.getAs("funder"));
if (Optional.ofNullable(next.getAs("acronym")).isPresent())
mn.setTitle(next.getAs("acronym"));
else
mn.setTitle(next.getAs("title"));
return mn;
}
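/**
 * Selects the result-organization affiliation relations and collects, for each result, the affiliated organizations
 * as MinOrganization entries. Note that in this version the output is appended to a fixed temporary location
 * ("/tmp/miriam/prova/relevantOrganization") rather than to the workingDir/outputPath parameters.
 */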
private static void selectAffiliationRelations(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label + "'")
.drop("datainfo", "relClass");
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
relation
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
ExtendingOrganization ar = new ExtendingOrganization();
ar.setEntityId(k);
addRelevantOrganization(ar, v);
return ar;
}, Encoders.bean(ExtendingOrganization.class))
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json("/tmp/miriam/prova/relevantOrganization");
}
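// Builds the MinOrganization dataset (local identifier, legal name and recognized pids) by exploding the organization pids via Spark SQL.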
private static Dataset<MinOrganization> getMinOrganizationDataset(SparkSession spark, String inputPath) {
Dataset<Row> organization = spark
.read()
.schema(Encoders.bean(Organization.class).schema())
.json(inputPath + "organization")
.filter("datainfo.deletedbyinference != true")
.selectExpr("id", "legalname.value as name", "pid");
organization.createOrReplaceTempView("org");
String query = "select id, pide.qualifier.classid as schema, pide.value as pid, name " +
"from org " +
"lateral view explode (pid) p as pide ";
return spark
.sql(query)
.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, MinOrganization>) (k, v) -> {
MinOrganization mn = new MinOrganization();
mn.setLocal_identifier(k);
Row r = v.next();
mn.setName(r.getAs("name"));
addOrganizationPid(mn, r);
v.forEachRemaining(row -> addOrganizationPid(mn, row));
return mn;
}, Encoders.bean(MinOrganization.class));
}
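// Copies the pid value into the MinOrganization field matching its schema (ROR, ISNI, FundRef, RingGold, Wikidata); other schemas are ignored.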
private static void addOrganizationPid(MinOrganization mo, Row next) {
String schema = next.getAs("schema");
if (Optional.ofNullable(schema).isPresent()) {
switch (schema) {
case "ROR":
mo.setRor(next.getAs("pid"));
break;
case "ISNI":
mo.setIsni(next.getAs("pid"));
break;
case "FundRef":
mo.setFundRef(next.getAs("pid"));
break;
case "RingGold":
mo.setRinGold(next.getAs("pid"));
break;
case "Wikidata":
mo.setWikidata(next.getAs("pid"));
break;
}
}
}
private static void addRelevantOrganization(ExtendingOrganization ar, Iterator<Row> v) {
ar.setRelevant_organization(new ArrayList<>());
while (v.hasNext())
ar.getRelevant_organization().add(getMinOrg(v.next()));
}
private static MinOrganization getMinOrg(Row next) {
MinOrganization mo = new MinOrganization();
mo.setLocal_identifier(next.getAs("local_identifier"));
mo.setName(next.getAs("name"));
if (Optional.ofNullable(next.getAs("ror")).isPresent())
mo.setRor(next.getAs("ror"));
if (Optional.ofNullable(next.getAs("isni")).isPresent())
mo.setIsni(next.getAs("isni"));
if (Optional.ofNullable(next.getAs("fundRef")).isPresent())
mo.setFundRef(next.getAs("fundRef"));
if (Optional.ofNullable(next.getAs("rinGold")).isPresent())
mo.setRinGold(next.getAs("rinGold"));
if (Optional.ofNullable(next.getAs("wikidata")).isPresent())
mo.setWikidata(next.getAs("wikidata"));
return mo;
}
}


@ -0,0 +1,166 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif;
import java.io.Serializable;
import java.io.StringReader;
import java.util.List;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.skgif.model.MinGrant;
import eu.dnetlib.dhp.skgif.model.MinOrganization;
import eu.dnetlib.dhp.skgif.model.MinProduct;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
 * Support methods for the SKG-IF dump: reading typed datasets from HDFS, extracting ORCIDs from pid lists,
 * building identifiers, resolving the funder name from the funding tree and constructing the Min* beans
 * (MinOrganization, MinGrant, MinProduct).
 *
 * @author miriam.baglioni
 * @Date 16/02/24
 */
public class Utils implements Serializable {
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Utils() {
}
public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
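/**
 * Returns the ORCID found in the given pid list, if any: a confirmed ORCID is preferred and returned with TRUE,
 * otherwise a pending ORCID is returned with FALSE; null when no ORCID is present.
 */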
public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
if (!Optional.ofNullable(pid).isPresent())
return null;
if (pid.size() == 0)
return null;
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
return new Tuple2<>(p.getValue(), Boolean.TRUE);
}
}
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
return new Tuple2<>(p.getValue(), Boolean.FALSE);
}
}
return null;
}
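// Builds an identifier by prefixing the md5 of the given id with the entity prefix.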
public static String getIdentifier(Prefixes entity, String id) {
return entity.label + DHPUtils.md5(id);
}
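// Extracts the funder name (//funder/name) from the fundingtree XML of a project.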
public static String getFunderName(String fundingtree) throws DocumentException {
final Document doc = new SAXReader().read(new StringReader(fundingtree));
return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText();
}
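// Builds a MinOrganization from an OpenAIRE Organization, mapping the legal name and the pids with a recognized schema (ror, isni, fundref, ringgold, wikidata).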
public static MinOrganization getMinOrganization(Organization o) {
MinOrganization mo = new MinOrganization();
// mo.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
mo.setLocal_identifier(o.getId());
if (Optional.ofNullable(o.getLegalname()).isPresent())
mo.setName(o.getLegalname().getValue());
if (Optional.ofNullable(o.getPid()).isPresent())
for (StructuredProperty pid : o.getPid()) {
if (Optional.ofNullable(pid.getQualifier()).isPresent() &&
Optional.ofNullable(pid.getQualifier().getClassid()).isPresent())
switch (pid.getQualifier().getClassid().toLowerCase()) {
case "ror":
mo.setRor(pid.getValue());
break;
case "isni":
mo.setIsni(pid.getValue());
break;
case "fundref":
mo.setFundRef(pid.getValue());
break;
case "ringgold":
mo.setRinGold(pid.getValue());
break;
case "wikidata":
mo.setWikidata(pid.getValue());
break;
}
}
return mo;
}
public static MinGrant getMinGrant(Project p) throws DocumentException {
MinGrant mg = new MinGrant();
// mg.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, p.getId()));
mg.setLocal_identifier(p.getId());
if (Optional.ofNullable(p.getCode()).isPresent())
mg.setCode(p.getCode().getValue());
if (Optional.ofNullable(p.getFundingtree()).isPresent() && p.getFundingtree().size() > 0)
mg.setFunder(getFunderName(p.getFundingtree().get(0).getValue()));
if (Optional.ofNullable(p.getAcronym()).isPresent())
mg.setTitle(p.getAcronym().getValue());
else if (Optional.ofNullable(p.getTitle()).isPresent()) {
mg.setTitle(p.getTitle().getValue());
}
return mg;
}
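// Builds a MinProduct from an OpenAIRE Result, using its main title and mapping the pids with a recognized schema (doi, pmcid, arxiv, pmid).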
public static <R extends Result> MinProduct getMinProduct(R r) throws JsonProcessingException {
MinProduct mp = new MinProduct();
// mp.setLocal_identifier(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, r.getId()));
mp.setLocal_identifier(r.getId());
for (StructuredProperty title : r.getTitle()) {
if (title.getQualifier().getClassid().equalsIgnoreCase("main title")) {
mp.setTitle(title.getValue());
}
}
if (r.getPid() != null)
for (StructuredProperty pid : r.getPid()) {
switch (pid.getQualifier().getClassid().toLowerCase()) {
case "doi":
mp.setDoi(pid.getValue());
break;
case "pmcid":
mp.setPmcid(pid.getValue());
break;
case "arxiv":
mp.setArxivid(pid.getValue());
break;
case "pmid":
mp.setPmid(pid.getValue());
break;
}
}
return mp;
}
}


@ -0,0 +1,36 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
import java.io.Serializable;
/**
 * Support bean coupling the original identifier of an entity with its local identifier in the dump.
 *
 * @author miriam.baglioni
 * @Date 04/03/24
 */
public class Couple implements Serializable {
private String originalIdentifier;
private String localIdentifier;
public String getOriginalIdentifier() {
return originalIdentifier;
}
public void setOriginalIdentifier(String originalIdentifier) {
this.originalIdentifier = originalIdentifier;
}
public String getLocalIdentifier() {
return localIdentifier;
}
public void setLocalIdentifier(String localIdentifier) {
this.localIdentifier = localIdentifier;
}
public static Couple newInstance(String originalIdentifier, String localIdentifier) {
Couple couple = new Couple();
couple.originalIdentifier = originalIdentifier;
couple.localIdentifier = localIdentifier;
return couple;
}
}


@ -0,0 +1,136 @@
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.oaf.Journal;
/**
 * Flat support bean carrying, for each manifestation of a result, the hosting datasource, journal and publisher
 * together with the instance-level information (type, publishing date, peer review status, access right, licence,
 * url, pid).
 *
 * @author miriam.baglioni
 * @Date 15/02/24
 */
public class EmitPerManifestation implements Serializable {
private String resultId;
private String hostedBy;
private String hostedbyvalue;
private Journal journal;
// private Instance instance;
private String publisher;
private String product_local_type; // instance.getinstancetype.getclassname
private String product_local_type_schema; // getInstance().getInstancetype().getSchemename()
private String publishing_date;
private String peer_reviewed;
private String access_right;
private String licence;
private String url;
private String pid;
public String getProduct_local_type() {
return product_local_type;
}
public void setProduct_local_type(String product_local_type) {
this.product_local_type = product_local_type;
}
public String getProduct_local_type_schema() {
return product_local_type_schema;
}
public void setProduct_local_type_schema(String product_local_type_schema) {
this.product_local_type_schema = product_local_type_schema;
}
public String getPublishing_date() {
return publishing_date;
}
public void setPublishing_date(String publishing_date) {
this.publishing_date = publishing_date;
}
public String getPeer_reviewed() {
return peer_reviewed;
}
public void setPeer_reviewed(String peer_reviewed) {
this.peer_reviewed = peer_reviewed;
}
public String getAccess_right() {
return access_right;
}
public void setAccess_right(String access_right) {
this.access_right = access_right;
}
public String getLicence() {
return licence;
}
public void setLicence(String licence) {
this.licence = licence;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getHostedbyvalue() {
return hostedbyvalue;
}
public void setHostedbyvalue(String hostedbyvalue) {
this.hostedbyvalue = hostedbyvalue;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getResultId() {
return resultId;
}
public void setResultId(String resultId) {
this.resultId = resultId;
}
public String getHostedBy() {
return hostedBy;
}
public void setHostedBy(String hostedBy) {
this.hostedBy = hostedBy;
}
public Journal getJournal() {
return journal;
}
public void setJournal(Journal journal) {
this.journal = journal;
}
}

Some files were not shown because too many files have changed in this diff.