diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Affiliation.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Affiliation.java new file mode 100644 index 0000000..0c0d3c4 --- /dev/null +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Affiliation.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.eosc.model; + +import java.io.Serializable; +import java.util.List; + +import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; + +/** + * @author miriam.baglioni + * @Date 13/09/22 + */ +public class Affiliation implements Serializable { + @JsonSchema(description = "the OpenAIRE id of the organizaiton") + private String id; + + @JsonSchema(description = "the name of the organization") + private String name; + + @JsonSchema(description = "the list of pids we have in OpenAIRE for the organization") + private List pid; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getPid() { + return pid; + } + + public void setPid(List pid) { + this.pid = pid; + } +} diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java index 639a3c1..395ac24 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Organization.java @@ -1,4 +1,3 @@ - package eu.dnetlib.dhp.eosc.model; import java.io.Serializable; @@ -7,40 +6,86 @@ import java.util.List; import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; /** - * @author miriam.baglioni - * @Date 13/09/22 + * To represent the generic organizaiton. It has the following parameters: + * - private String legalshortname to store the legalshortname of the organizaiton + * - private String legalname to store the legal name of the organization + * - private String websiteurl to store the websiteurl of the organization + * - private List alternativenames to store the alternative names of the organization + * - private Country country to store the country of the organization + * - private String id to store the openaire id of the organization + * - private List pid to store the list of pids for the organization */ public class Organization implements Serializable { - @JsonSchema(description = "the OpenAIRE id of the organizaiton") - private String id; + private String legalshortname; + private String legalname; + private String websiteurl; - @JsonSchema(description = "the name of the organization") - private String name; + @JsonSchema(description = "Alternative names that identify the organisation") + private List alternativenames; - @JsonSchema(description = "the list of pids we have in OpenAIRE for the organization") - private List pid; + @JsonSchema(description = "The organisation country") + private Country country; - public String getId() { - return id; - } + @JsonSchema(description = "The OpenAIRE id for the organisation") + private String id; - public void setId(String id) { - this.id = id; - } + @JsonSchema(description = "Persistent identifiers for the organisation i.e. isni 0000000090326370") + private List pid; - public String getName() { - return name; - } + public String getLegalshortname() { + return legalshortname; + } - public void setName(String name) { - this.name = name; - } + public void setLegalshortname(String legalshortname) { + this.legalshortname = legalshortname; + } - public List getPid() { - return pid; - } + public String getLegalname() { + return legalname; + } + + public void setLegalname(String legalname) { + this.legalname = legalname; + } + + public String getWebsiteurl() { + return websiteurl; + } + + public void setWebsiteurl(String websiteurl) { + this.websiteurl = websiteurl; + } + + public List getAlternativenames() { + return alternativenames; + } + + public void setAlternativenames(List alternativenames) { + this.alternativenames = alternativenames; + } + + public Country getCountry() { + return country; + } + + public void setCountry(Country country) { + this.country = country; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getPid() { + return pid; + } + + public void setPid(List pid) { + this.pid = pid; + } - public void setPid(List pid) { - this.pid = pid; - } } diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java index 79ffdcf..4613d4d 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/OrganizationPid.java @@ -32,4 +32,12 @@ public class OrganizationPid implements Serializable { public void setValue(String value) { this.value = value; } + + public static OrganizationPid newInstance(String type, String value){ + OrganizationPid op = new OrganizationPid(); + op.type = type; + op.value = value; + + return op; + } } diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Relation.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Relation.java index 91ce474..93f4deb 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Relation.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Relation.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.eosc.model; import java.io.Serializable; import java.util.Objects; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; /** @@ -20,12 +21,15 @@ public class Relation implements Serializable { private String target; @JsonSchema(description = "To represent the semantics of a relation between two entities") + @JsonIgnoreProperties(ignoreUnknown = true) private RelType reltype; @JsonSchema(description = "The reason why OpenAIRE holds the relation ") + @JsonIgnoreProperties(ignoreUnknown = true) private Provenance provenance; @JsonSchema(description = "The result type of the target for this relation") + @JsonIgnoreProperties(ignoreUnknown = true) private String targetType; public String getTargetType() { @@ -82,4 +86,12 @@ public class Relation implements Serializable { relation.provenance = provenance; return relation; } + + public static Relation newInstance(String source, String target) { + Relation relation = new Relation(); + relation.source = source; + relation.target = target; + + return relation; + } } diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Result.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Result.java index 61b8584..e3ef5b5 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Result.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/Result.java @@ -24,7 +24,7 @@ public class Result implements Serializable { private List keywords; @JsonSchema(description = "The list of organizations the result is affiliated to") - private List affiliation; + private List affiliation; @JsonSchema(description = "The indicators for this result") private Indicator indicator; @@ -465,11 +465,11 @@ public class Result implements Serializable { this.subject = subject; } - public List getAffiliation() { + public List getAffiliation() { return affiliation; } - public void setAffiliation(List affiliation) { + public void setAffiliation(List affiliation) { this.affiliation = affiliation; } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganization.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganization.java index 09cc1c7..ff40538 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganization.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganization.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.*; +import eu.dnetlib.dhp.eosc.model.Affiliation; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -91,7 +92,7 @@ public class ExtendEoscResultWithOrganization implements Serializable { if (t2._2() != null) { ResultOrganizations rOrg = new ResultOrganizations(); rOrg.setResultId(t2._1().getTarget()); - eu.dnetlib.dhp.eosc.model.Organization org = new eu.dnetlib.dhp.eosc.model.Organization(); + Affiliation org = new Affiliation(); org.setId(t2._2().getId()); if (Optional.ofNullable(t2._2().getLegalname()).isPresent()) { org.setName(t2._2().getLegalname().getValue()); @@ -135,7 +136,7 @@ public class ExtendEoscResultWithOrganization implements Serializable { return first._1(); } Result ret = first._1(); - List affiliation = new ArrayList<>(); + List affiliation = new ArrayList<>(); Set alreadyInsertedAffiliations = new HashSet<>(); affiliation.add(first._2().getAffiliation()); alreadyInsertedAffiliations.add(first._2().getAffiliation().getId()); diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java index 55bed6d..9c0f785 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ExtendEoscResultWithOrganizationStep2.java @@ -5,7 +5,10 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.*; +import java.util.stream.Collectors; +import eu.dnetlib.dhp.eosc.model.Affiliation; +import eu.dnetlib.dhp.eosc.model.Country; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -33,6 +36,8 @@ import scala.Tuple2; public class ExtendEoscResultWithOrganizationStep2 implements Serializable { private static final Logger log = LoggerFactory.getLogger(ExtendEoscResultWithOrganizationStep2.class); + private final static String UNKNOWN = "UNKNOWN"; + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -52,11 +57,11 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable { final String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); - final String resultPath = parser.get("resultPath"); - log.info("resultPath: {}", resultPath); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); +// final String outputPath = parser.get("outputPath"); +// log.info("outputPath: {}", outputPath); SparkConf conf = new SparkConf(); @@ -64,15 +69,15 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable { conf, isSparkSessionManaged, spark -> { - Utils.removeOutputDir(spark, outputPath); - addOrganizations(spark, inputPath, outputPath, resultPath); + Utils.removeOutputDir(spark, workingPath + "publicationextendedaffiliation"); + addOrganizations(spark, inputPath, workingPath ); }); } - private static void addOrganizations(SparkSession spark, String inputPath, String outputPath, - String resultPath) { + private static void addOrganizations(SparkSession spark, String inputPath, String workingPath) { + Dataset results = Utils - .readPath(spark, resultPath, Result.class); + .readPath(spark, workingPath + "publication", Result.class); Dataset relations = Utils .readPath(spark, inputPath + "/relation", Relation.class) @@ -88,7 +93,7 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable { if (t2._2() != null) { ResultOrganizations rOrg = new ResultOrganizations(); rOrg.setResultId(t2._1().getTarget()); - eu.dnetlib.dhp.eosc.model.Organization org = new eu.dnetlib.dhp.eosc.model.Organization(); + Affiliation org = new Affiliation(); org.setId(t2._2().getId()); if (Optional.ofNullable(t2._2().getLegalname()).isPresent()) { org.setName(t2._2().getLegalname().getValue()); @@ -131,7 +136,7 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable { return first._1(); } Result ret = first._1(); - List affiliation = new ArrayList<>(); + List affiliation = new ArrayList<>(); Set alreadyInsertedAffiliations = new HashSet<>(); affiliation.add(first._2().getAffiliation()); alreadyInsertedAffiliations.add(first._2().getAffiliation().getId()); @@ -148,8 +153,88 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable { .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(outputPath); + .json(workingPath + "publicationextendedaffiliation"); + + relations + .joinWith(organizations, relations.col("source").equalTo(organizations.col("id"))) + .map((MapFunction, eu.dnetlib.dhp.eosc.model.Organization>) t2 -> mapOrganization(t2._2()),Encoders.bean(eu.dnetlib.dhp.eosc.model.Organization.class)) + .filter(Objects::nonNull) + .write() + .mode(SaveMode.Overwrite) + .option("compression","gzip") + .json(workingPath + "organization"); + + relations + .joinWith(organizations, relations.col("source").equalTo(organizations.col("id"))) + .map((MapFunction, eu.dnetlib.dhp.eosc.model.Relation>) t2 -> eu.dnetlib.dhp.eosc.model.Relation.newInstance(t2._1().getSource(), t2._1().getTarget()), Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class) ) + .write() + .mode(SaveMode.Overwrite) + .option("compression","gzip") + .json(workingPath + "resultOrganization"); } -} + private static eu.dnetlib.dhp.eosc.model.Organization mapOrganization(Organization org){ + + if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference())) + return null; + if (!Optional.ofNullable(org.getLegalname()).isPresent() + && !Optional.ofNullable(org.getLegalshortname()).isPresent()) + return null; + + eu.dnetlib.dhp.eosc.model.Organization organization = new eu.dnetlib.dhp.eosc.model.Organization(); + + Optional + .ofNullable(org.getLegalshortname()) + .ifPresent(value -> organization.setLegalshortname(value.getValue())); + + Optional + .ofNullable(org.getLegalname()) + .ifPresent(value -> organization.setLegalname(value.getValue())); + + Optional + .ofNullable(org.getWebsiteurl()) + .ifPresent(value -> organization.setWebsiteurl(value.getValue())); + + Optional + .ofNullable(org.getAlternativeNames()) + .ifPresent( + value -> organization + .setAlternativenames( + value + .stream() + .map(v -> v.getValue()) + .collect(Collectors.toList()))); + + Optional + .ofNullable(org.getCountry()) + .ifPresent( + value -> { + if (!value.getClassid().equals(UNKNOWN)) { + organization + .setCountry( + Country.newInstance(value.getClassid(), value.getClassname())); + } + + }); + + Optional + .ofNullable(org.getId()) + .ifPresent(value -> organization.setId(value)); + + Optional + .ofNullable(org.getPid()) + .ifPresent( + value -> organization + .setPid( + value + .stream() + .map(p -> OrganizationPid.newInstance(p.getQualifier().getClassid(), p.getValue())) + .collect(Collectors.toList()))); + + return organization; + } + + } + + diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ResultOrganizations.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ResultOrganizations.java index c8a7e03..8bd6514 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ResultOrganizations.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/ResultOrganizations.java @@ -2,9 +2,8 @@ package eu.dnetlib.dhp.oa.graph.dump.eosc; import java.io.Serializable; -import java.util.List; -import eu.dnetlib.dhp.eosc.model.Organization; +import eu.dnetlib.dhp.eosc.model.Affiliation; /** * @author miriam.baglioni @@ -12,7 +11,7 @@ import eu.dnetlib.dhp.eosc.model.Organization; */ public class ResultOrganizations implements Serializable { private String resultId; - private Organization affiliation; + private Affiliation affiliation; public String getResultId() { return resultId; @@ -22,11 +21,11 @@ public class ResultOrganizations implements Serializable { this.resultId = resultId; } - public Organization getAffiliation() { + public Affiliation getAffiliation() { return affiliation; } - public void setAffiliation(Organization affiliation) { + public void setAffiliation(Affiliation affiliation) { this.affiliation = affiliation; } } diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml index 796aacb..631986f 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/oozie_app/workflow.xml @@ -164,8 +164,9 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --sourcePath${sourcePath} - --resultPath${workingDir}/dump/publication - --outputPath${workingDir}/dump/publicationextendedaffiliation + --workingPath${workingDir}/dump/ + + diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java index 0b7fd5b..691cdab 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultTest.java @@ -25,7 +25,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.eosc.model.Indicator; -import eu.dnetlib.dhp.eosc.model.Organization; +import eu.dnetlib.dhp.eosc.model.Affiliation; import eu.dnetlib.dhp.eosc.model.Result; import eu.dnetlib.dhp.schema.action.AtomicAction; import scala.Tuple2; @@ -174,7 +174,7 @@ public class SelectEoscResultTest { .getAffiliation() .size()); - List affiliations = tmp + List affiliations = tmp .filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) .first() .getAffiliation(); @@ -184,7 +184,7 @@ public class SelectEoscResultTest { affiliations.stream().anyMatch(a -> a.getName().equalsIgnoreCase("Doris Engineering (France)"))); Assertions.assertTrue(affiliations.stream().anyMatch(a -> a.getName().equalsIgnoreCase("RENNES METROPOLE"))); - Organization organization = affiliations + Affiliation organization = affiliations .stream() .filter(a -> a.getId().equalsIgnoreCase("20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff")) .findFirst()