Added code to dump the relations between organizations and projects in the subset of entities relevant for EOSC

This commit is contained in:
Miriam Baglioni 2023-10-25 11:46:10 +02:00
parent da19f117d8
commit a821371af2
13 changed files with 459 additions and 442 deletions

View File

@ -1,28 +1,27 @@
package eu.dnetlib.dhp.eosc.model; package eu.dnetlib.dhp.eosc.model;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 25/10/23 * @Date 25/10/23
*/ */
import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
/** /**
* To store information about the funder funding the project related to the result. It extends * To store information about the funder funding the project related to the result. It extends
* eu.dnetlib.dhp.schema.dump.oaf.Funder with the following parameter: - - private * eu.dnetlib.dhp.schema.dump.oaf.Funder with the following parameter: - - private
* eu.dnetdlib.dhp.schema.dump.oaf.graph.Fundings funding_stream to store the fundingstream * eu.dnetdlib.dhp.schema.dump.oaf.graph.Fundings funding_stream to store the fundingstream
*/ */
public class Funder extends FunderShort { public class Funder extends FunderShort {
@JsonSchema(description = "Description of the funding stream") @JsonSchema(description = "Description of the funding stream")
private Fundings funding_stream; private Fundings funding_stream;
public Fundings getFunding_stream() { public Fundings getFunding_stream() {
return funding_stream; return funding_stream;
} }
public void setFunding_stream(Fundings funding_stream) { public void setFunding_stream(Fundings funding_stream) {
this.funding_stream = funding_stream; this.funding_stream = funding_stream;
} }
} }

View File

@ -1,12 +1,10 @@
package eu.dnetlib.dhp.eosc.model; package eu.dnetlib.dhp.eosc.model;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 25/10/23 * @Date 25/10/23
*/ */
import java.io.Serializable; import java.io.Serializable;
import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
@ -22,23 +20,23 @@ import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
*/ */
public class Fundings implements Serializable { public class Fundings implements Serializable {
@JsonSchema(description = "Id of the funding stream") @JsonSchema(description = "Id of the funding stream")
private String id; private String id;
private String description; private String description;
public String getId() { public String getId() {
return id; return id;
} }
public void setId(String id) { public void setId(String id) {
this.id = id; this.id = id;
} }
public String getDescription() { public String getDescription() {
return description; return description;
} }
public void setDescription(String description) { public void setDescription(String description) {
this.description = description; this.description = description;
} }
} }

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.eosc.model; package eu.dnetlib.dhp.eosc.model;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 25/10/23 * @Date 25/10/23
*/ */
import java.io.Serializable; import java.io.Serializable;
import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
@ -16,51 +15,51 @@ import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
* funded amount by the funder * funded amount by the funder
*/ */
public class Granted implements Serializable { public class Granted implements Serializable {
@JsonSchema(description = "The currency of the granted amount (e.g. EUR)") @JsonSchema(description = "The currency of the granted amount (e.g. EUR)")
private String currency; private String currency;
@JsonSchema(description = "The total cost of the project") @JsonSchema(description = "The total cost of the project")
private float totalcost; private float totalcost;
@JsonSchema(description = "The funded amount") @JsonSchema(description = "The funded amount")
private float fundedamount; private float fundedamount;
public String getCurrency() { public String getCurrency() {
return currency; return currency;
} }
public void setCurrency(String currency) { public void setCurrency(String currency) {
this.currency = currency; this.currency = currency;
} }
public float getTotalcost() { public float getTotalcost() {
return totalcost; return totalcost;
} }
public void setTotalcost(float totalcost) { public void setTotalcost(float totalcost) {
this.totalcost = totalcost; this.totalcost = totalcost;
} }
public float getFundedamount() { public float getFundedamount() {
return fundedamount; return fundedamount;
} }
public void setFundedamount(float fundedamount) { public void setFundedamount(float fundedamount) {
this.fundedamount = fundedamount; this.fundedamount = fundedamount;
} }
public static Granted newInstance(String currency, float totalcost, float fundedamount) { public static Granted newInstance(String currency, float totalcost, float fundedamount) {
Granted granted = new Granted(); Granted granted = new Granted();
granted.currency = currency; granted.currency = currency;
granted.totalcost = totalcost; granted.totalcost = totalcost;
granted.fundedamount = fundedamount; granted.fundedamount = fundedamount;
return granted; return granted;
} }
public static Granted newInstance(String currency, float fundedamount) { public static Granted newInstance(String currency, float fundedamount) {
Granted granted = new Granted(); Granted granted = new Granted();
granted.currency = currency; granted.currency = currency;
granted.fundedamount = fundedamount; granted.fundedamount = fundedamount;
return granted; return granted;
} }
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.eosc.model; package eu.dnetlib.dhp.eosc.model;
import java.io.Serializable; import java.io.Serializable;
@ -16,76 +17,76 @@ import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
* - private List<OrganizationPid> pid to store the list of pids for the organization * - private List<OrganizationPid> pid to store the list of pids for the organization
*/ */
public class Organization implements Serializable { public class Organization implements Serializable {
private String legalshortname; private String legalshortname;
private String legalname; private String legalname;
private String websiteurl; private String websiteurl;
@JsonSchema(description = "Alternative names that identify the organisation") @JsonSchema(description = "Alternative names that identify the organisation")
private List<String> alternativenames; private List<String> alternativenames;
@JsonSchema(description = "The organisation country") @JsonSchema(description = "The organisation country")
private Country country; private Country country;
@JsonSchema(description = "The OpenAIRE id for the organisation") @JsonSchema(description = "The OpenAIRE id for the organisation")
private String id; private String id;
@JsonSchema(description = "Persistent identifiers for the organisation i.e. isni 0000000090326370") @JsonSchema(description = "Persistent identifiers for the organisation i.e. isni 0000000090326370")
private List<OrganizationPid> pid; private List<OrganizationPid> pid;
public String getLegalshortname() { public String getLegalshortname() {
return legalshortname; return legalshortname;
} }
public void setLegalshortname(String legalshortname) { public void setLegalshortname(String legalshortname) {
this.legalshortname = legalshortname; this.legalshortname = legalshortname;
} }
public String getLegalname() { public String getLegalname() {
return legalname; return legalname;
} }
public void setLegalname(String legalname) { public void setLegalname(String legalname) {
this.legalname = legalname; this.legalname = legalname;
} }
public String getWebsiteurl() { public String getWebsiteurl() {
return websiteurl; return websiteurl;
} }
public void setWebsiteurl(String websiteurl) { public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl; this.websiteurl = websiteurl;
} }
public List<String> getAlternativenames() { public List<String> getAlternativenames() {
return alternativenames; return alternativenames;
} }
public void setAlternativenames(List<String> alternativenames) { public void setAlternativenames(List<String> alternativenames) {
this.alternativenames = alternativenames; this.alternativenames = alternativenames;
} }
public Country getCountry() { public Country getCountry() {
return country; return country;
} }
public void setCountry(Country country) { public void setCountry(Country country) {
this.country = country; this.country = country;
} }
public String getId() { public String getId() {
return id; return id;
} }
public void setId(String id) { public void setId(String id) {
this.id = id; this.id = id;
} }
public List<OrganizationPid> getPid() { public List<OrganizationPid> getPid() {
return pid; return pid;
} }
public void setPid(List<OrganizationPid> pid) { public void setPid(List<OrganizationPid> pid) {
this.pid = pid; this.pid = pid;
} }
} }

View File

@ -33,7 +33,7 @@ public class OrganizationPid implements Serializable {
this.value = value; this.value = value;
} }
public static OrganizationPid newInstance(String type, String value){ public static OrganizationPid newInstance(String type, String value) {
OrganizationPid op = new OrganizationPid(); OrganizationPid op = new OrganizationPid();
op.type = type; op.type = type;
op.value = value; op.value = value;

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.eosc.model; package eu.dnetlib.dhp.eosc.model;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 25/10/23 * @Date 25/10/23
*/ */
import java.io.Serializable; import java.io.Serializable;
import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
@ -15,32 +14,32 @@ import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
* to store the code of the programme - private String description to store the description of the programme * to store the code of the programme - private String description to store the description of the programme
*/ */
public class Programme implements Serializable { public class Programme implements Serializable {
@JsonSchema(description = "The code of the programme") @JsonSchema(description = "The code of the programme")
private String code; private String code;
@JsonSchema(description = "The description of the programme") @JsonSchema(description = "The description of the programme")
private String description; private String description;
public String getCode() { public String getCode() {
return code; return code;
} }
public void setCode(String code) { public void setCode(String code) {
this.code = code; this.code = code;
} }
public String getDescription() { public String getDescription() {
return description; return description;
} }
public void setDescription(String description) { public void setDescription(String description) {
this.description = description; this.description = description;
} }
public static Programme newInstance(String code, String description) { public static Programme newInstance(String code, String description) {
Programme p = new Programme(); Programme p = new Programme();
p.code = code; p.code = code;
p.description = description; p.description = description;
return p; return p;
} }
} }

View File

@ -1,17 +1,15 @@
package eu.dnetlib.dhp.eosc.model; package eu.dnetlib.dhp.eosc.model;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 25/10/23 * @Date 25/10/23
*/ */
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
/** /**
* This is the class representing the Project in the model used for the dumps of the whole graph. At the moment the dump * This is the class representing the Project in the model used for the dumps of the whole graph. At the moment the dump
* of the Projects differs from the other dumps because we do not create relations between Funders (Organization) and * of the Projects differs from the other dumps because we do not create relations between Funders (Organization) and
@ -43,164 +41,162 @@ import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema;
*/ */
public class Project implements Serializable { public class Project implements Serializable {
private String id; private String id;
private String websiteurl; private String websiteurl;
private String code; private String code;
private String acronym; private String acronym;
private String title; private String title;
private String startdate; private String startdate;
private String enddate; private String enddate;
private String callidentifier; private String callidentifier;
private String keywords; private String keywords;
private boolean openaccessmandateforpublications; private boolean openaccessmandateforpublications;
private boolean openaccessmandatefordataset; private boolean openaccessmandatefordataset;
private List<String> subject; private List<String> subject;
@JsonSchema(description = "Funding information for the project") @JsonSchema(description = "Funding information for the project")
private List<Funder> funding; private List<Funder> funding;
private String summary; private String summary;
@JsonSchema(description = "The money granted to the project") @JsonSchema(description = "The money granted to the project")
private Granted granted; private Granted granted;
@JsonSchema(description = "The h2020 programme funding the project") @JsonSchema(description = "The h2020 programme funding the project")
private List<Programme> h2020programme; private List<Programme> h2020programme;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getId() { public String getWebsiteurl() {
return id; return websiteurl;
} }
public void setId(String id) { public void setWebsiteurl(String websiteurl) {
this.id = id; this.websiteurl = websiteurl;
} }
public String getWebsiteurl() { public String getCode() {
return websiteurl; return code;
} }
public void setWebsiteurl(String websiteurl) { public void setCode(String code) {
this.websiteurl = websiteurl; this.code = code;
} }
public String getCode() { public String getAcronym() {
return code; return acronym;
} }
public void setCode(String code) { public void setAcronym(String acronym) {
this.code = code; this.acronym = acronym;
} }
public String getAcronym() { public String getTitle() {
return acronym; return title;
} }
public void setAcronym(String acronym) { public void setTitle(String title) {
this.acronym = acronym; this.title = title;
} }
public String getTitle() { public String getStartdate() {
return title; return startdate;
} }
public void setTitle(String title) { public void setStartdate(String startdate) {
this.title = title; this.startdate = startdate;
} }
public String getStartdate() { public String getEnddate() {
return startdate; return enddate;
} }
public void setStartdate(String startdate) { public void setEnddate(String enddate) {
this.startdate = startdate; this.enddate = enddate;
} }
public String getEnddate() { public String getCallidentifier() {
return enddate; return callidentifier;
} }
public void setEnddate(String enddate) { public void setCallidentifier(String callidentifier) {
this.enddate = enddate; this.callidentifier = callidentifier;
} }
public String getCallidentifier() { public String getKeywords() {
return callidentifier; return keywords;
} }
public void setCallidentifier(String callidentifier) { public void setKeywords(String keywords) {
this.callidentifier = callidentifier; this.keywords = keywords;
} }
public String getKeywords() { public boolean isOpenaccessmandateforpublications() {
return keywords; return openaccessmandateforpublications;
} }
public void setKeywords(String keywords) { public void setOpenaccessmandateforpublications(boolean openaccessmandateforpublications) {
this.keywords = keywords; this.openaccessmandateforpublications = openaccessmandateforpublications;
} }
public boolean isOpenaccessmandateforpublications() { public boolean isOpenaccessmandatefordataset() {
return openaccessmandateforpublications; return openaccessmandatefordataset;
} }
public void setOpenaccessmandateforpublications(boolean openaccessmandateforpublications) { public void setOpenaccessmandatefordataset(boolean openaccessmandatefordataset) {
this.openaccessmandateforpublications = openaccessmandateforpublications; this.openaccessmandatefordataset = openaccessmandatefordataset;
} }
public boolean isOpenaccessmandatefordataset() { public List<String> getSubject() {
return openaccessmandatefordataset; return subject;
} }
public void setOpenaccessmandatefordataset(boolean openaccessmandatefordataset) { public void setSubject(List<String> subject) {
this.openaccessmandatefordataset = openaccessmandatefordataset; this.subject = subject;
} }
public List<String> getSubject() { public List<Funder> getFunding() {
return subject; return funding;
} }
public void setSubject(List<String> subject) { public void setFunding(List<Funder> funding) {
this.subject = subject; this.funding = funding;
} }
public List<Funder> getFunding() { public String getSummary() {
return funding; return summary;
} }
public void setFunding(List<Funder> funding) { public void setSummary(String summary) {
this.funding = funding; this.summary = summary;
} }
public String getSummary() { public Granted getGranted() {
return summary; return granted;
} }
public void setSummary(String summary) { public void setGranted(Granted granted) {
this.summary = summary; this.granted = granted;
} }
public Granted getGranted() { public List<Programme> getH2020programme() {
return granted; return h2020programme;
} }
public void setGranted(Granted granted) { public void setH2020programme(List<Programme> h2020programme) {
this.granted = granted; this.h2020programme = h2020programme;
} }
public List<Programme> getH2020programme() {
return h2020programme;
}
public void setH2020programme(List<Programme> h2020programme) {
this.h2020programme = h2020programme;
}
} }

View File

@ -6,7 +6,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import eu.dnetlib.dhp.eosc.model.Affiliation;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
@ -20,6 +19,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.eosc.model.Affiliation;
import eu.dnetlib.dhp.eosc.model.OrganizationPid; import eu.dnetlib.dhp.eosc.model.OrganizationPid;
import eu.dnetlib.dhp.eosc.model.Result; import eu.dnetlib.dhp.eosc.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;

View File

@ -7,8 +7,6 @@ import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.eosc.model.Affiliation;
import eu.dnetlib.dhp.eosc.model.Country;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
@ -22,6 +20,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.eosc.model.Affiliation;
import eu.dnetlib.dhp.eosc.model.Country;
import eu.dnetlib.dhp.eosc.model.OrganizationPid; import eu.dnetlib.dhp.eosc.model.OrganizationPid;
import eu.dnetlib.dhp.eosc.model.Result; import eu.dnetlib.dhp.eosc.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -70,7 +70,7 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Utils.removeOutputDir(spark, workingPath + "publicationextendedaffiliation"); Utils.removeOutputDir(spark, workingPath + "publicationextendedaffiliation");
addOrganizations(spark, inputPath, workingPath , outputPath); addOrganizations(spark, inputPath, workingPath, outputPath);
}); });
} }
@ -156,85 +156,89 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
.json(workingPath + "publicationextendedaffiliation"); .json(workingPath + "publicationextendedaffiliation");
relations relations
.joinWith(organizations, relations.col("source").equalTo(organizations.col("id"))) .joinWith(organizations, relations.col("source").equalTo(organizations.col("id")))
.map((MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Organization>) t2 -> mapOrganization(t2._2()),Encoders.bean(eu.dnetlib.dhp.eosc.model.Organization.class)) .map(
.filter(Objects::nonNull) (MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Organization>) t2 -> mapOrganization(
.write() t2._2()),
.mode(SaveMode.Overwrite) Encoders.bean(eu.dnetlib.dhp.eosc.model.Organization.class))
.option("compression","gzip") .filter(Objects::nonNull)
.json(outputPath + "organization"); .write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "organization");
relations relations
.joinWith(organizations, relations.col("source").equalTo(organizations.col("id"))) .joinWith(organizations, relations.col("source").equalTo(organizations.col("id")))
.map((MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Relation>) t2 -> eu.dnetlib.dhp.eosc.model.Relation.newInstance(t2._1().getSource(), t2._1().getTarget()), Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class) ) .map(
.write() (MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Relation>) t2 -> eu.dnetlib.dhp.eosc.model.Relation
.mode(SaveMode.Overwrite) .newInstance(t2._1().getSource(), t2._1().getTarget()),
.option("compression","gzip") Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class))
.json(outputPath + "resultOrganization"); .write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "resultOrganization");
} }
private static eu.dnetlib.dhp.eosc.model.Organization mapOrganization(Organization org){ private static eu.dnetlib.dhp.eosc.model.Organization mapOrganization(Organization org) {
if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference())) if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference()))
return null; return null;
if (!Optional.ofNullable(org.getLegalname()).isPresent() if (!Optional.ofNullable(org.getLegalname()).isPresent()
&& !Optional.ofNullable(org.getLegalshortname()).isPresent()) && !Optional.ofNullable(org.getLegalshortname()).isPresent())
return null; return null;
eu.dnetlib.dhp.eosc.model.Organization organization = new eu.dnetlib.dhp.eosc.model.Organization(); eu.dnetlib.dhp.eosc.model.Organization organization = new eu.dnetlib.dhp.eosc.model.Organization();
Optional Optional
.ofNullable(org.getLegalshortname()) .ofNullable(org.getLegalshortname())
.ifPresent(value -> organization.setLegalshortname(value.getValue())); .ifPresent(value -> organization.setLegalshortname(value.getValue()));
Optional Optional
.ofNullable(org.getLegalname()) .ofNullable(org.getLegalname())
.ifPresent(value -> organization.setLegalname(value.getValue())); .ifPresent(value -> organization.setLegalname(value.getValue()));
Optional Optional
.ofNullable(org.getWebsiteurl()) .ofNullable(org.getWebsiteurl())
.ifPresent(value -> organization.setWebsiteurl(value.getValue())); .ifPresent(value -> organization.setWebsiteurl(value.getValue()));
Optional Optional
.ofNullable(org.getAlternativeNames()) .ofNullable(org.getAlternativeNames())
.ifPresent( .ifPresent(
value -> organization value -> organization
.setAlternativenames( .setAlternativenames(
value value
.stream() .stream()
.map(v -> v.getValue()) .map(v -> v.getValue())
.collect(Collectors.toList()))); .collect(Collectors.toList())));
Optional Optional
.ofNullable(org.getCountry()) .ofNullable(org.getCountry())
.ifPresent( .ifPresent(
value -> { value -> {
if (!value.getClassid().equals(UNKNOWN)) { if (!value.getClassid().equals(UNKNOWN)) {
organization organization
.setCountry( .setCountry(
Country.newInstance(value.getClassid(), value.getClassname())); Country.newInstance(value.getClassid(), value.getClassname()));
} }
}); });
Optional Optional
.ofNullable(org.getId()) .ofNullable(org.getId())
.ifPresent(value -> organization.setId(value)); .ifPresent(value -> organization.setId(value));
Optional Optional
.ofNullable(org.getPid()) .ofNullable(org.getPid())
.ifPresent( .ifPresent(
value -> organization value -> organization
.setPid( .setPid(
value value
.stream() .stream()
.map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue())) .map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList()))); .collect(Collectors.toList())));
return organization;
}
return organization;
} }
}

View File

@ -6,8 +6,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.Optional; import java.util.Optional;
import eu.dnetlib.dhp.eosc.model.Organization;
import eu.dnetlib.dhp.eosc.model.Project;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
@ -21,19 +25,20 @@ import eu.dnetlib.dhp.eosc.model.Provenance;
import eu.dnetlib.dhp.eosc.model.RelType; import eu.dnetlib.dhp.eosc.model.RelType;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 12/01/23 * @Date 12/01/23
*/ */
public class SparkDumpRelation implements Serializable { public class SparkDumpOrganizationProject implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpRelation.class); private static final Logger log = LoggerFactory.getLogger(SparkDumpOrganizationProject.class);
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
SparkDumpRelation.class SparkDumpOrganizationProject.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json")); "/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json"));
@ -66,49 +71,21 @@ public class SparkDumpRelation implements Serializable {
} }
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) { private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) {
Dataset<Relation> relations = Utils.readPath(spark, inputPath, Relation.class); Dataset<Organization> organization = Utils.readPath(spark, outputPath + "organization", Organization.class);
relations Dataset<Project> project = Utils.readPath(spark, outputPath + "project", Project.class);
.map((MapFunction<Relation, eu.dnetlib.dhp.eosc.model.Relation>) relation -> { Dataset<Relation> relation = Utils.readPath(spark, inputPath + "/relation", Relation.class)
eu.dnetlib.dhp.eosc.model.Relation relNew = new eu.dnetlib.dhp.eosc.model.Relation(); .filter((FilterFunction<Relation>) r-> !r.getDataInfo().getDeletedbyinference() && r.getRelClass().equalsIgnoreCase(ModelConstants.IS_PARTICIPANT));
relNew
.setSource(
relation.getSource()); Dataset<Relation> eoscOrgs = relation.joinWith(organization, relation.col("source").equalTo(organization.col("id")))
.map((MapFunction<Tuple2<Relation, Organization>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));
relNew eoscOrgs.joinWith(project, eoscOrgs.col("target").equalTo(project.col("id")))
.setTarget( .map((MapFunction<Tuple2<Relation, Project>, eu.dnetlib.dhp.eosc.model.Relation>) t2-> eu.dnetlib.dhp.eosc.model.Relation.newInstance(t2._1().getSource(), t2._1().getTarget()), Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class))
.write()
relation.getTarget()); .mode(SaveMode.Overwrite)
.option("compression","gzip")
relNew .json(outputPath + "organizationProject");
.setReltype(
RelType
.newInstance(
relation.getRelClass(),
relation.getSubRelType()));
Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
if (odInfo.isPresent()) {
DataInfo dInfo = odInfo.get();
if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
relNew
.setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
return relNew;
}, Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.json(outputPath);
} }

View File

@ -10,9 +10,6 @@ import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.eosc.model.*;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Project;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
@ -31,7 +28,10 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.eosc.model.*;
import eu.dnetlib.dhp.oa.graph.dump.Constants; import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Project;
import scala.Array; import scala.Array;
import scala.Tuple2; import scala.Tuple2;
@ -110,68 +110,86 @@ public class SparkUpdateProjectInfo implements Serializable {
Dataset<Project> project = Utils.readPath(spark, inputPath + "/project", Project.class); Dataset<Project> project = Utils.readPath(spark, inputPath + "/project", Project.class);
Dataset<String> projectIds = result.joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId"))) Dataset<String> projectIds = result
.flatMap((FlatMapFunction<Tuple2<Result, ResultProject>, String>) t2 -> t2._2().getProjectsList() .joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")))
.stream().map(p -> p.getId()).collect(Collectors.toList()).iterator(), Encoders.STRING()) .flatMap(
.distinct(); (FlatMapFunction<Tuple2<Result, ResultProject>, String>) t2 -> t2
._2()
.getProjectsList()
.stream()
.map(p -> p.getId())
.collect(Collectors.toList())
.iterator(),
Encoders.STRING())
.distinct();
projectIds.joinWith(project, projectIds.col("value").equalTo(project.col("id"))) projectIds
.map((MapFunction<Tuple2<String, Project>, eu.dnetlib.dhp.eosc.model.Project>)t2->mapProject(t2._2()), Encoders.bean(eu.dnetlib.dhp.eosc.model.Project.class) ) .joinWith(project, projectIds.col("value").equalTo(project.col("id")))
.write() .map(
.mode(SaveMode.Overwrite) (MapFunction<Tuple2<String, Project>, eu.dnetlib.dhp.eosc.model.Project>) t2 -> mapProject(t2._2()),
.option("compression","gzip") Encoders.bean(eu.dnetlib.dhp.eosc.model.Project.class))
.json(outputPath + "project"); .write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "project");
resultProject.flatMap((FlatMapFunction<ResultProject, Relation>) rp -> resultProject
rp.getProjectsList().stream().map(p -> Relation.newInstance(rp.getResultId(), p.getId())) .flatMap(
.collect(Collectors.toList()).iterator(), Encoders.bean(Relation.class)) (FlatMapFunction<ResultProject, Relation>) rp -> rp
.write() .getProjectsList()
.mode(SaveMode.Overwrite) .stream()
.option("compression","gzip") .map(p -> Relation.newInstance(rp.getResultId(), p.getId()))
.json(outputPath + "resultProject"); .collect(Collectors.toList())
.iterator(),
Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "resultProject");
} }
private static eu.dnetlib.dhp.eosc.model.Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p) throws DocumentException { private static eu.dnetlib.dhp.eosc.model.Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p)
throws DocumentException {
if (Boolean.TRUE.equals(p.getDataInfo().getDeletedbyinference())) if (Boolean.TRUE.equals(p.getDataInfo().getDeletedbyinference()))
return null; return null;
eu.dnetlib.dhp.eosc.model.Project project = new eu.dnetlib.dhp.eosc.model.Project(); eu.dnetlib.dhp.eosc.model.Project project = new eu.dnetlib.dhp.eosc.model.Project();
Optional Optional
.ofNullable(p.getId()) .ofNullable(p.getId())
.ifPresent(id -> project.setId(id)); .ifPresent(id -> project.setId(id));
Optional Optional
.ofNullable(p.getWebsiteurl()) .ofNullable(p.getWebsiteurl())
.ifPresent(w -> project.setWebsiteurl(w.getValue())); .ifPresent(w -> project.setWebsiteurl(w.getValue()));
Optional Optional
.ofNullable(p.getCode()) .ofNullable(p.getCode())
.ifPresent(code -> project.setCode(code.getValue())); .ifPresent(code -> project.setCode(code.getValue()));
Optional Optional
.ofNullable(p.getAcronym()) .ofNullable(p.getAcronym())
.ifPresent(acronynim -> project.setAcronym(acronynim.getValue())); .ifPresent(acronynim -> project.setAcronym(acronynim.getValue()));
Optional Optional
.ofNullable(p.getTitle()) .ofNullable(p.getTitle())
.ifPresent(title -> project.setTitle(title.getValue())); .ifPresent(title -> project.setTitle(title.getValue()));
Optional Optional
.ofNullable(p.getStartdate()) .ofNullable(p.getStartdate())
.ifPresent(sdate -> project.setStartdate(sdate.getValue())); .ifPresent(sdate -> project.setStartdate(sdate.getValue()));
Optional Optional
.ofNullable(p.getEnddate()) .ofNullable(p.getEnddate())
.ifPresent(edate -> project.setEnddate(edate.getValue())); .ifPresent(edate -> project.setEnddate(edate.getValue()));
Optional Optional
.ofNullable(p.getCallidentifier()) .ofNullable(p.getCallidentifier())
.ifPresent(cide -> project.setCallidentifier(cide.getValue())); .ifPresent(cide -> project.setCallidentifier(cide.getValue()));
Optional Optional
.ofNullable(p.getKeywords()) .ofNullable(p.getKeywords())
.ifPresent(key -> project.setKeywords(key.getValue())); .ifPresent(key -> project.setKeywords(key.getValue()));
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications()); Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39()); Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
@ -191,19 +209,19 @@ public class SparkUpdateProjectInfo implements Serializable {
project.setOpenaccessmandatefordataset(false); project.setOpenaccessmandatefordataset(false);
Optional Optional
.ofNullable(p.getEcarticle29_3()) .ofNullable(p.getEcarticle29_3())
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true"))); .ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true")));
project project
.setSubject( .setSubject(
Optional Optional
.ofNullable(p.getSubjects()) .ofNullable(p.getSubjects())
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList())) .map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
.orElse(new ArrayList<>())); .orElse(new ArrayList<>()));
Optional Optional
.ofNullable(p.getSummary()) .ofNullable(p.getSummary())
.ifPresent(summary -> project.setSummary(summary.getValue())); .ifPresent(summary -> project.setSummary(summary.getValue()));
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount()); Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency()); Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
@ -213,8 +231,8 @@ public class SparkUpdateProjectInfo implements Serializable {
if (ofundedamount.isPresent()) { if (ofundedamount.isPresent()) {
if (ototalcost.isPresent()) { if (ototalcost.isPresent()) {
project project
.setGranted( .setGranted(
Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get())); Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
} else { } else {
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get())); project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
} }
@ -222,21 +240,21 @@ public class SparkUpdateProjectInfo implements Serializable {
} }
project project
.setH2020programme( .setH2020programme(
Optional Optional
.ofNullable(p.getH2020classification()) .ofNullable(p.getH2020classification())
.map( .map(
classification -> classification classification -> classification
.stream() .stream()
.map( .map(
c -> Programme c -> Programme
.newInstance( .newInstance(
c.getH2020Programme().getCode(), c.getH2020Programme().getDescription())) c.getH2020Programme().getCode(), c.getH2020Programme().getDescription()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(new ArrayList<>())); .orElse(new ArrayList<>()));
Optional<List<Field<String>>> ofundTree = Optional Optional<List<Field<String>>> ofundTree = Optional
.ofNullable(p.getFundingtree()); .ofNullable(p.getFundingtree());
List<Funder> funList = new ArrayList<>(); List<Funder> funList = new ArrayList<>();
if (ofundTree.isPresent()) { if (ofundTree.isPresent()) {
for (Field<String> fundingtree : ofundTree.get()) { for (Field<String> fundingtree : ofundTree.get()) {

View File

@ -0,0 +1,26 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the source data to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]

View File

@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.eosc.model.Indicator;
import eu.dnetlib.dhp.eosc.model.Affiliation; import eu.dnetlib.dhp.eosc.model.Affiliation;
import eu.dnetlib.dhp.eosc.model.Indicator;
import eu.dnetlib.dhp.eosc.model.Result; import eu.dnetlib.dhp.eosc.model.Result;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import scala.Tuple2; import scala.Tuple2;