[SKG-IF] changed the implementation to reduce the number of needed joins. Reduced the memory usage by defining specific flat beans
This commit is contained in:
parent
3c4c4e8ce0
commit
9f9ade077b
|
@ -10,9 +10,10 @@ import java.io.Serializable;
|
||||||
public enum RelationType implements Serializable {
|
public enum RelationType implements Serializable {
|
||||||
RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
|
RESULT_OUTCOME_FUNDING("isProducedBy"), RESULT_AFFILIATIED_TO_ORGANIZATION(
|
||||||
"hasAuthorInstitution"), DATASOURCE_PROVIDED_BY_ORGANIZATION(
|
"hasAuthorInstitution"), DATASOURCE_PROVIDED_BY_ORGANIZATION(
|
||||||
"isProvidedBy"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant"), SUPPLEMENT(
|
"isProvidedBy"), PROJECT_HAS_PARTICIPANT_ORGANIZATION("hasParticipant"), SUPPLEMENT(
|
||||||
"IsSupplementedBy"), DOCUMENTS(
|
"IsSupplementedBy"), DOCUMENTS(
|
||||||
"IsDocumentedBy"), PART("IsPartOf"), VERSION("IsNewVersionOf"), CITATION("Cites");
|
"IsDocumentedBy"), PART("IsPartOf"), VERSION(
|
||||||
|
"IsNewVersionOf"), CITATION("Cites"), ORGANIZATION_PARTICIPANT_IN_PROJECT("isParticipant");
|
||||||
|
|
||||||
public final String label;
|
public final String label;
|
||||||
|
|
||||||
|
|
|
@ -1,30 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.dump.filterentities;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author miriam.baglioni
|
|
||||||
* @Date 13/03/24
|
|
||||||
*/
|
|
||||||
public class Identifiers implements Serializable {
|
|
||||||
|
|
||||||
private String id;
|
|
||||||
private String CCL;
|
|
||||||
|
|
||||||
public String getId() {
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setId(String id) {
|
|
||||||
this.id = id;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getCCL() {
|
|
||||||
return CCL;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setCCL(String CCL) {
|
|
||||||
this.CCL = CCL;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -18,6 +18,7 @@ import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.types.DataType;
|
import org.apache.spark.sql.types.DataType;
|
||||||
|
import org.apache.spark.sql.types.DataTypes;
|
||||||
import org.apache.spark.sql.types.StructType;
|
import org.apache.spark.sql.types.StructType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -70,14 +71,19 @@ public class SelectConnectedEntities implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
|
||||||
selectConnectedEntities(spark, inputPath, filterPath, workingDir);
|
selectConnectedEntities2(spark, inputPath, filterPath, workingDir);
|
||||||
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> void selectConnectedEntities(SparkSession spark, String inputPath,
|
private static void selectConnectedEntities2(SparkSession spark, String inputPath, String filterPath,
|
||||||
String filterPath,
|
String workingDir) {
|
||||||
String workingDir) throws JsonProcessingException {
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("source", DataTypes.StringType)
|
||||||
|
.add("target", DataTypes.StringType);
|
||||||
|
|
||||||
Dataset<String> resultIds = spark.emptyDataset(Encoders.STRING());
|
Dataset<String> resultIds = spark.emptyDataset(Encoders.STRING());
|
||||||
for (EntityType entity : ModelSupport.entityTypes.keySet())
|
for (EntityType entity : ModelSupport.entityTypes.keySet())
|
||||||
|
@ -90,12 +96,29 @@ public class SelectConnectedEntities implements Serializable {
|
||||||
.select("id")
|
.select("id")
|
||||||
.as(Encoders.STRING()));
|
.as(Encoders.STRING()));
|
||||||
|
|
||||||
Dataset<Relation> relation = Utils
|
Dataset<Row> relation = spark
|
||||||
.readPath(spark, inputPath + "relation", Relation.class)
|
.read()
|
||||||
.filter((FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference());
|
.schema(rp)
|
||||||
Dataset<Organization> organizations = Utils
|
.json(inputPath + "relation")
|
||||||
.readPath(spark, inputPath + "organization", Organization.class)
|
.filter(("datainfo.deletedbyinference != true"))
|
||||||
.filter((FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference());
|
.drop("datainfo");
|
||||||
|
|
||||||
|
Dataset<Row> matchingRels = relation
|
||||||
|
.join(
|
||||||
|
resultIds, relation
|
||||||
|
.col("source")
|
||||||
|
.equalTo(resultIds.col("value")),
|
||||||
|
"leftsemi")
|
||||||
|
.select("target")
|
||||||
|
.distinct()
|
||||||
|
;
|
||||||
|
|
||||||
|
Dataset<Row> organization = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Organization.class).schema())
|
||||||
|
.json(inputPath + "organization")
|
||||||
|
.filter("datainfo.deletedbyinference != true ");
|
||||||
|
|
||||||
Dataset<Project> projects = Utils
|
Dataset<Project> projects = Utils
|
||||||
.readPath(spark, inputPath + "project", Project.class)
|
.readPath(spark, inputPath + "project", Project.class)
|
||||||
.filter((FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference())
|
.filter((FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference())
|
||||||
|
@ -106,53 +129,187 @@ public class SelectConnectedEntities implements Serializable {
|
||||||
.getFunderName(p.getFundingtree().get(0).getValue())
|
.getFunderName(p.getFundingtree().get(0).getValue())
|
||||||
.equalsIgnoreCase("European Commission"));
|
.equalsIgnoreCase("European Commission"));
|
||||||
|
|
||||||
Dataset<Datasource> datasources = Utils
|
|
||||||
.readPath(spark, inputPath + "datasource", Datasource.class)
|
|
||||||
.filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getDeletedbyinference());
|
|
||||||
|
|
||||||
// select relations having source in the set of identifiers selected for eosc
|
organization
|
||||||
Dataset<Row> relationSource = relation
|
.join(matchingRels, organization.col("id").equalTo(matchingRels.col("target")), "leftsemi")
|
||||||
.join(resultIds, resultIds.col("value").equalTo(relation.col("source")), "left_semi");
|
|
||||||
relationSource
|
|
||||||
.join(resultIds, resultIds.col("value").equalTo(relation.col("target")), "left_semi")
|
|
||||||
.write()
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.json(workingDir + "resultrelation");
|
|
||||||
//
|
|
||||||
// // write relations between results and organizations
|
|
||||||
relationSource
|
|
||||||
.joinWith(organizations, relation.col("target").equalTo(organizations.col("id")), "left_semi")
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir + "organizaitonrelation");
|
|
||||||
|
|
||||||
relationSource
|
|
||||||
.joinWith(projects, relation.col("target").equalTo(projects.col("id")), "left_semi")
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir + "projectrelation");
|
|
||||||
|
|
||||||
// write organizations linked to results in the set
|
|
||||||
|
|
||||||
organizations
|
|
||||||
.join(relationSource, relationSource.col("target").equalTo(organizations.col("id")), "left_semi")
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(workingDir + "organization");
|
.json(workingDir + "organization");
|
||||||
|
|
||||||
// write projects linked to results in the set
|
|
||||||
projects
|
projects
|
||||||
.join(relationSource, relationSource.col("target").equalTo(projects.col("id")))
|
.join(matchingRels, projects.col("id").equalTo(matchingRels.col("target")), "leftsemi")
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(workingDir + "project");
|
.json(workingDir + "/project");
|
||||||
|
|
||||||
// read the results and select all the distinct instance.hostedbykey
|
Dataset<Row> datasources = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Datasource.class).schema())
|
||||||
|
.json(inputPath + "datasource")
|
||||||
|
.filter("datainfo.deletedbyinference != true");
|
||||||
|
final Dataset<String> datasourceReferencedIds = getDatasourceReferenceIdDataset(spark, workingDir);
|
||||||
|
datasources
|
||||||
|
.join(
|
||||||
|
datasourceReferencedIds, datasourceReferencedIds.col("value").equalTo(datasources.col("id")),
|
||||||
|
"left_semi")
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + "datasource");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// private static <R extends Result> void selectConnectedEntities(SparkSession spark, String inputPath,
|
||||||
|
// String filterPath,
|
||||||
|
// String workingDir) throws JsonProcessingException {
|
||||||
|
//
|
||||||
|
// Dataset<String> resultIds = spark.emptyDataset(Encoders.STRING());
|
||||||
|
// for (EntityType entity : ModelSupport.entityTypes.keySet())
|
||||||
|
// if (ModelSupport.isResult(entity))
|
||||||
|
// resultIds = resultIds
|
||||||
|
// .union(
|
||||||
|
// spark
|
||||||
|
// .read()
|
||||||
|
// .parquet(filterPath + entity.name() + "_ids")
|
||||||
|
// .select("id")
|
||||||
|
// .as(Encoders.STRING()));
|
||||||
|
//
|
||||||
|
// Dataset<Relation> relation = Utils
|
||||||
|
// .readPath(spark, inputPath + "relation", Relation.class)
|
||||||
|
// .filter((FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference());
|
||||||
|
// Dataset<Organization> organizations = Utils
|
||||||
|
// .readPath(spark, inputPath + "organization", Organization.class)
|
||||||
|
// .filter((FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference());
|
||||||
|
// Dataset<Project> projects = Utils
|
||||||
|
// .readPath(spark, inputPath + "project", Project.class)
|
||||||
|
// .filter((FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference())
|
||||||
|
// .filter(
|
||||||
|
// (FilterFunction<Project>) p -> Optional.ofNullable(p.getFundingtree()).isPresent() &&
|
||||||
|
// p.getFundingtree().size() > 0 &&
|
||||||
|
// Utils
|
||||||
|
// .getFunderName(p.getFundingtree().get(0).getValue())
|
||||||
|
// .equalsIgnoreCase("European Commission"));
|
||||||
|
//
|
||||||
|
// Dataset<Datasource> datasources = Utils
|
||||||
|
// .readPath(spark, inputPath + "datasource", Datasource.class)
|
||||||
|
// .filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getDeletedbyinference());
|
||||||
|
//
|
||||||
|
// // select relations having source in the set of identifiers selected for eosc
|
||||||
|
// Dataset<Row> relationSource = relation
|
||||||
|
// .join(resultIds, resultIds.col("value").equalTo(relation.col("source")), "left_semi");
|
||||||
|
// relationSource
|
||||||
|
// .join(resultIds, resultIds.col("value").equalTo(relation.col("target")), "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .json(workingDir + "resultrelation");
|
||||||
|
////
|
||||||
|
//// // write relations between results and organizations
|
||||||
|
// relationSource
|
||||||
|
// .joinWith(organizations, relation.col("target").equalTo(organizations.col("id")), "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "organizaitonrelation");
|
||||||
|
//
|
||||||
|
// relationSource
|
||||||
|
// .joinWith(projects, relation.col("target").equalTo(projects.col("id")), "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "projectrelation");
|
||||||
|
//
|
||||||
|
// // write organizations linked to results in the set
|
||||||
|
//
|
||||||
|
// organizations
|
||||||
|
// .join(relationSource, relationSource.col("target").equalTo(organizations.col("id")), "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "organization");
|
||||||
|
//
|
||||||
|
// // write projects linked to results in the set
|
||||||
|
// projects
|
||||||
|
// .join(relationSource, relationSource.col("target").equalTo(projects.col("id")))
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "project");
|
||||||
|
//
|
||||||
|
// // read the results and select all the distinct instance.hostedbykey
|
||||||
|
// final Dataset<String> datasourceReferencedIds = getDatasourceReferenceIdDataset(spark, workingDir);
|
||||||
|
// // join with the datasources and write the datasource in the join
|
||||||
|
// datasources
|
||||||
|
// .joinWith(
|
||||||
|
// datasourceReferencedIds, datasourceReferencedIds.col("value").equalTo(datasources.col("id")),
|
||||||
|
// "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "datasource");
|
||||||
|
//
|
||||||
|
// // selecting relations between organizations and projects in the selected set
|
||||||
|
// StructType tp = StructType.fromDDL("`id` STRING");
|
||||||
|
// Dataset<Row> organizationSbs = spark
|
||||||
|
// .read()
|
||||||
|
// .schema(tp)
|
||||||
|
// .json(workingDir + "organization")
|
||||||
|
// .select("id");
|
||||||
|
//
|
||||||
|
// Dataset<Row> projectSbs = spark
|
||||||
|
// .read()
|
||||||
|
// .schema(tp)
|
||||||
|
// .json(workingDir + "project")
|
||||||
|
// .select("id");
|
||||||
|
////
|
||||||
|
// Dataset<Row> tmpRel;
|
||||||
|
// tmpRel = relation
|
||||||
|
// .join(
|
||||||
|
// organizationSbs, organizationSbs
|
||||||
|
// .col("id")
|
||||||
|
// .equalTo(relation.col("source")),
|
||||||
|
// "left_semi");
|
||||||
|
// tmpRel
|
||||||
|
// .join(projectSbs, tmpRel.col("target").equalTo(projectSbs.col("id")), "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "orgprojelation");
|
||||||
|
//
|
||||||
|
// // selecting relations between datasources and organizations in the selected set
|
||||||
|
// Dataset<Row> datasourceSbs = spark
|
||||||
|
// .read()
|
||||||
|
// .schema(tp)
|
||||||
|
// .json(workingDir + "datasource")
|
||||||
|
// .select("id");
|
||||||
|
//
|
||||||
|
// tmpRel = relation
|
||||||
|
// .join(datasourceSbs, datasourceSbs.col("id").equalTo(relation.col("source")), "left_semi");
|
||||||
|
// tmpRel
|
||||||
|
// .join(organizationSbs, tmpRel.col("target").equalTo(organizationSbs.col("id")), "left_semi")
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "datsorgrelation");
|
||||||
|
//
|
||||||
|
// Utils
|
||||||
|
// .readPath(spark, workingDir + "resultrelation", Relation.class)
|
||||||
|
// .union(Utils.readPath(spark, workingDir + "organizaitonrelation", Relation.class))
|
||||||
|
// .union(Utils.readPath(spark, workingDir + "projectrelation", Relation.class))
|
||||||
|
// .union(Utils.readPath(spark, workingDir + "orgprojelation", Relation.class))
|
||||||
|
// .union(Utils.readPath(spark, workingDir + "datsorgrelation", Relation.class))
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(workingDir + "relation");
|
||||||
|
//
|
||||||
|
// }
|
||||||
|
|
||||||
|
private static <R extends Result> Dataset<String> getDatasourceReferenceIdDataset(SparkSession spark,
|
||||||
|
String workingDir) {
|
||||||
Dataset<String> datasourceReferencedIds = spark.emptyDataset(Encoders.STRING());
|
Dataset<String> datasourceReferencedIds = spark.emptyDataset(Encoders.STRING());
|
||||||
|
|
||||||
for (EntityType entity : ModelSupport.entityTypes.keySet())
|
for (EntityType entity : ModelSupport.entityTypes.keySet())
|
||||||
|
@ -186,70 +343,6 @@ public class SelectConnectedEntities implements Serializable {
|
||||||
Encoders.STRING()));
|
Encoders.STRING()));
|
||||||
}
|
}
|
||||||
datasourceReferencedIds = datasourceReferencedIds.distinct();
|
datasourceReferencedIds = datasourceReferencedIds.distinct();
|
||||||
// join with the datasources and write the datasource in the join
|
return datasourceReferencedIds;
|
||||||
datasources
|
|
||||||
.joinWith(
|
|
||||||
datasourceReferencedIds, datasourceReferencedIds.col("value").equalTo(datasources.col("id")),
|
|
||||||
"left_semi")
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir + "datasource");
|
|
||||||
|
|
||||||
// selecting relations between organizations and projects in the selected set
|
|
||||||
StructType tp = StructType.fromDDL("`id` STRING");
|
|
||||||
Dataset<Row> organizationSbs = spark
|
|
||||||
.read()
|
|
||||||
.schema(tp)
|
|
||||||
.json(workingDir + "organization")
|
|
||||||
.select("id");
|
|
||||||
|
|
||||||
Dataset<Row> projectSbs = spark
|
|
||||||
.read()
|
|
||||||
.schema(tp)
|
|
||||||
.json(workingDir + "project")
|
|
||||||
.select("id");
|
|
||||||
//
|
|
||||||
Dataset<Row> tmpRel;
|
|
||||||
tmpRel = relation
|
|
||||||
.join(
|
|
||||||
organizationSbs, organizationSbs
|
|
||||||
.col("id")
|
|
||||||
.equalTo(relation.col("source")),
|
|
||||||
"left_semi");
|
|
||||||
tmpRel
|
|
||||||
.join(projectSbs, tmpRel.col("target").equalTo(projectSbs.col("id")), "left_semi")
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir + "orgprojelation");
|
|
||||||
|
|
||||||
// selecting relations between datasources and organizations in the selected set
|
|
||||||
Dataset<Row> datasourceSbs = spark
|
|
||||||
.read()
|
|
||||||
.schema(tp)
|
|
||||||
.json(workingDir + "datasource")
|
|
||||||
.select("id");
|
|
||||||
|
|
||||||
tmpRel = relation
|
|
||||||
.join(datasourceSbs, datasourceSbs.col("id").equalTo(relation.col("source")), "left_semi");
|
|
||||||
tmpRel
|
|
||||||
.join(organizationSbs, tmpRel.col("target").equalTo(organizationSbs.col("id")), "left_semi")
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir + "datsorgrelation");
|
|
||||||
|
|
||||||
Utils
|
|
||||||
.readPath(spark, workingDir + "resultrelation", Relation.class)
|
|
||||||
.union(Utils.readPath(spark, workingDir + "organizaitonrelation", Relation.class))
|
|
||||||
.union(Utils.readPath(spark, workingDir + "projectrelation", Relation.class))
|
|
||||||
.union(Utils.readPath(spark, workingDir + "orgprojelation", Relation.class))
|
|
||||||
.union(Utils.readPath(spark, workingDir + "datsorgrelation", Relation.class))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(workingDir + "relation");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
@ -76,92 +77,24 @@ public class DumpDatasource implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath, String workingDir) {
|
private static void mapDatasource(SparkSession spark, String inputPath, String outputPath, String workingDir) {
|
||||||
Dataset<Relation> relation = Utils
|
Dataset<ExtendingOrganization> organizations = Utils
|
||||||
.readPath(spark, inputPath + "relation", Relation.class)
|
.readPath(spark, workingDir + "/relations/datasource_providing_organization", ExtendingOrganization.class);
|
||||||
.filter((FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference())
|
|
||||||
.filter(
|
|
||||||
(FilterFunction<Relation>) r -> r
|
|
||||||
.getRelClass()
|
|
||||||
.equalsIgnoreCase(RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label));
|
|
||||||
|
|
||||||
Dataset<EncloseMinElement> eme = Utils
|
|
||||||
.readPath(spark, workingDir + "minEntity", EncloseMinElement.class)
|
|
||||||
.filter((FilterFunction<EncloseMinElement>) e -> Optional.ofNullable(e.getMinOrganization()).isPresent());
|
|
||||||
|
|
||||||
Dataset<Datasource> datasourceDataset = Utils
|
Dataset<Datasource> datasourceDataset = Utils
|
||||||
.readPath(spark, inputPath + "datasource", Datasource.class)
|
.readPath(spark, inputPath + "datasource", Datasource.class)
|
||||||
.filter(
|
.filter(
|
||||||
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
|
(FilterFunction<Datasource>) d -> !d.getDataInfo().getInvisible()
|
||||||
&& !d.getDataInfo().getDeletedbyinference());
|
&& !d.getDataInfo().getDeletedbyinference());
|
||||||
Dataset<Tuple2<String, EncloseMinElement>> datasourceOrganization = relation
|
|
||||||
.joinWith(eme, relation.col("target").equalTo(eme.col("enclosedEntityId")))
|
|
||||||
.map(
|
|
||||||
(MapFunction<Tuple2<Relation, EncloseMinElement>, Tuple2<String, EncloseMinElement>>) t2 -> new Tuple2<>(
|
|
||||||
t2._1().getSource(), t2._2()),
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(EncloseMinElement.class)));
|
|
||||||
|
|
||||||
datasourceDataset
|
datasourceDataset
|
||||||
.joinWith(
|
.joinWith(
|
||||||
datasourceOrganization, datasourceDataset.col("id").equalTo(datasourceOrganization.col("_1")), "left")
|
organizations, datasourceDataset.col("id").equalTo(organizations.col("entityId")), "left")
|
||||||
.groupByKey(
|
.map((MapFunction<Tuple2<Datasource, ExtendingOrganization>, eu.dnetlib.dhp.skgif.model.Datasource>) t2 -> {
|
||||||
(MapFunction<Tuple2<Datasource, Tuple2<String, EncloseMinElement>>, String>) t2 -> t2._1().getId(),
|
eu.dnetlib.dhp.skgif.model.Datasource datasource = dumpDatasource(t2._1());
|
||||||
Encoders.STRING())
|
if (Optional.ofNullable(t2._2()).isPresent()) {
|
||||||
.mapGroups(
|
datasource.setOrganization(t2._2().getRelevant_organization());
|
||||||
(MapGroupsFunction<String, Tuple2<Datasource, Tuple2<String, EncloseMinElement>>, eu.dnetlib.dhp.skgif.model.Datasource>) (
|
}
|
||||||
k, vs) -> {
|
return datasource;
|
||||||
eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
|
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
|
||||||
Tuple2<Datasource, Tuple2<String, EncloseMinElement>> first = vs.next();
|
|
||||||
Datasource d = first._1();
|
|
||||||
// datasource.setLocal_identifier(Utils.getIdentifier(Prefixes.DATASOURCE, d.getId()));
|
|
||||||
datasource.setLocal_identifier(d.getId());
|
|
||||||
datasource
|
|
||||||
.setIdentifiers(
|
|
||||||
d
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
|
|
||||||
datasource.setName(d.getOfficialname().getValue());
|
|
||||||
datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
|
|
||||||
datasource
|
|
||||||
.setJurisdiction(
|
|
||||||
Optional
|
|
||||||
.ofNullable(d.getJurisdiction())
|
|
||||||
.map(v -> v.getClassid())
|
|
||||||
.orElse(new String()));
|
|
||||||
datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
|
|
||||||
datasource.setVersion_control(d.getVersioncontrol());
|
|
||||||
|
|
||||||
datasource
|
|
||||||
.setData_source_classification(
|
|
||||||
Optional
|
|
||||||
.ofNullable(d.getEoscdatasourcetype())
|
|
||||||
.map(v -> v.getClassname())
|
|
||||||
.orElse(new String()));
|
|
||||||
datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
|
|
||||||
datasource.setThematic(d.getThematic());
|
|
||||||
datasource
|
|
||||||
.setResearch_product_access_policy(
|
|
||||||
Optional
|
|
||||||
.ofNullable(d.getDatabaseaccesstype())
|
|
||||||
.map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
|
|
||||||
.orElse(new ArrayList<>()));
|
|
||||||
datasource
|
|
||||||
.setResearch_product_metadata_access_policy(
|
|
||||||
Optional
|
|
||||||
.ofNullable(d.getResearchproductmetadataaccesspolicies())
|
|
||||||
.map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
|
|
||||||
.orElse(new ArrayList<>()));
|
|
||||||
if (Optional.ofNullable(first._2()).isPresent()) {
|
|
||||||
List<MinOrganization> organizations = new ArrayList<>();
|
|
||||||
organizations.add(first._2()._2().getMinOrganization());
|
|
||||||
vs.forEachRemaining(org -> organizations.add(org._2()._2().getMinOrganization()));
|
|
||||||
datasource.setOrganization(organizations);
|
|
||||||
}
|
|
||||||
return datasource;
|
|
||||||
|
|
||||||
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Datasource.class))
|
|
||||||
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
@ -169,6 +102,51 @@ public class DumpDatasource implements Serializable {
|
||||||
.json(outputPath + "Datasource");
|
.json(outputPath + "Datasource");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static eu.dnetlib.dhp.skgif.model.Datasource dumpDatasource(Datasource d) {
|
||||||
|
eu.dnetlib.dhp.skgif.model.Datasource datasource = new eu.dnetlib.dhp.skgif.model.Datasource();
|
||||||
|
datasource.setLocal_identifier(d.getId());
|
||||||
|
datasource
|
||||||
|
.setIdentifiers(
|
||||||
|
d
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
datasource.setName(d.getOfficialname().getValue());
|
||||||
|
datasource.setSubmission_policy_url(d.getSubmissionpolicyurl());
|
||||||
|
datasource
|
||||||
|
.setJurisdiction(
|
||||||
|
Optional
|
||||||
|
.ofNullable(d.getJurisdiction())
|
||||||
|
.map(v -> v.getClassid())
|
||||||
|
.orElse(new String()));
|
||||||
|
datasource.setPreservation_policy_url(d.getPreservationpolicyurl());
|
||||||
|
datasource.setVersion_control(d.getVersioncontrol());
|
||||||
|
|
||||||
|
datasource
|
||||||
|
.setData_source_classification(
|
||||||
|
Optional
|
||||||
|
.ofNullable(d.getEoscdatasourcetype())
|
||||||
|
.map(v -> v.getClassname())
|
||||||
|
.orElse(new String()));
|
||||||
|
datasource.setResearch_product_type(getEoscProductType(d.getResearchentitytypes()));
|
||||||
|
datasource.setThematic(d.getThematic());
|
||||||
|
datasource
|
||||||
|
.setResearch_product_access_policy(
|
||||||
|
Optional
|
||||||
|
.ofNullable(d.getDatabaseaccesstype())
|
||||||
|
.map(v -> getResearchProductAccessPolicy(d.getDatabaseaccesstype().getValue()))
|
||||||
|
.orElse(new ArrayList<>()));
|
||||||
|
datasource
|
||||||
|
.setResearch_product_metadata_access_policy(
|
||||||
|
Optional
|
||||||
|
.ofNullable(d.getResearchproductmetadataaccesspolicies())
|
||||||
|
.map(v -> getResearchProductAccessPolicy(d.getResearchproductmetadataaccesspolicies()))
|
||||||
|
.orElse(new ArrayList<>()));
|
||||||
|
return datasource;
|
||||||
|
}
|
||||||
|
|
||||||
private static List<String> getResearchProductAccessPolicy(List<String> value) {
|
private static List<String> getResearchProductAccessPolicy(List<String> value) {
|
||||||
|
|
||||||
return value
|
return value
|
||||||
|
|
|
@ -16,10 +16,7 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
import org.dom4j.io.SAXReader;
|
import org.dom4j.io.SAXReader;
|
||||||
|
@ -28,6 +25,8 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.skgif.model.*;
|
import eu.dnetlib.dhp.skgif.model.*;
|
||||||
|
@ -84,112 +83,102 @@ public class DumpGrant implements Serializable {
|
||||||
.filter(
|
.filter(
|
||||||
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||||
!p.getDataInfo().getInvisible());
|
!p.getDataInfo().getInvisible());
|
||||||
Dataset<Relation> relations = Utils
|
Dataset<ExtendingOrganization> partecipatingOrgs = Utils
|
||||||
.readPath(spark, inputPath + "relation", Relation.class)
|
.readPath(spark, workingDir + "relations/project_partecipating_organization", ExtendingOrganization.class);
|
||||||
.filter(
|
|
||||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
|
||||||
!r.getDataInfo().getInvisible() &&
|
|
||||||
r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label));
|
|
||||||
Dataset<EncloseMinElement> eme = Utils
|
|
||||||
.readPath(spark, workingDir + "minEntity", EncloseMinElement.class)
|
|
||||||
.filter((FilterFunction<EncloseMinElement>) e -> e.getMinOrganization() != null);
|
|
||||||
|
|
||||||
Dataset<Tuple2<String, EncloseMinElement>> partecipantOrganization = relations
|
|
||||||
.joinWith(eme, relations.col("source").equalTo(eme.col("enclosedEntityId")))
|
|
||||||
.map(
|
|
||||||
(MapFunction<Tuple2<Relation, EncloseMinElement>, Tuple2<String, EncloseMinElement>>) t2 -> new Tuple2<>(
|
|
||||||
t2._1().getTarget(), t2._2()),
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(EncloseMinElement.class)));
|
|
||||||
|
|
||||||
|
projects = projects
|
||||||
|
.groupByKey((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Project, Project>) (k, v) -> v.next(), Encoders.bean(Project.class));
|
||||||
projects
|
projects
|
||||||
.joinWith(partecipantOrganization, projects.col("id").equalTo(partecipantOrganization.col("_1")), "left")
|
.joinWith(partecipatingOrgs, projects.col("id").equalTo(partecipatingOrgs.col("entityId")), "left")
|
||||||
.groupByKey(
|
.map((MapFunction<Tuple2<Project, ExtendingOrganization>, Grant>) t2 -> {
|
||||||
(MapFunction<Tuple2<Project, Tuple2<String, EncloseMinElement>>, String>) t2 -> t2._1().getId(),
|
Grant g = dumpGrant(t2._1());
|
||||||
Encoders.STRING())
|
if (Optional.ofNullable(t2._2()).isPresent())
|
||||||
.mapGroups(
|
g.setBeneficiaries(t2._2().getRelevant_organization());
|
||||||
(MapGroupsFunction<String, Tuple2<Project, Tuple2<String, EncloseMinElement>>, Grant>) (k, v) -> {
|
return g;
|
||||||
Grant g = new Grant();
|
}, Encoders.bean(Grant.class))
|
||||||
Tuple2<Project, Tuple2<String, EncloseMinElement>> first = v.next();
|
|
||||||
g.setLocal_identifier(k);
|
|
||||||
g.setGrantCode(first._1().getCode().getValue());
|
|
||||||
g.setIdentifiers(getProjectIdentifier(first._1()));
|
|
||||||
g.setTitle(first._1().getTitle().getValue());
|
|
||||||
g
|
|
||||||
.setSummary(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getSummary())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(new String()));
|
|
||||||
g
|
|
||||||
.setAcronym(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getAcronym())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(new String()));
|
|
||||||
g.setFunder(Utils.getFunderName(first._1().getFundingtree().get(0).getValue()));
|
|
||||||
// * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
|
|
||||||
g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue()));
|
|
||||||
g
|
|
||||||
.setCurrency(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getCurrency())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(new String()));
|
|
||||||
g
|
|
||||||
.setFunded_amount(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getFundedamount())
|
|
||||||
.orElse(null));
|
|
||||||
g
|
|
||||||
.setKeywords(
|
|
||||||
first
|
|
||||||
._1()
|
|
||||||
.getSubjects()
|
|
||||||
.stream()
|
|
||||||
.map(s -> s.getValue())
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
g
|
|
||||||
.setStart_date(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getStartdate())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(new String()));
|
|
||||||
g
|
|
||||||
.setEnd_date(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getEnddate())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(new String()));
|
|
||||||
g
|
|
||||||
.setWebsite(
|
|
||||||
Optional
|
|
||||||
.ofNullable(first._1().getWebsiteurl())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(new String()));
|
|
||||||
if (Optional.ofNullable(first._2()).isPresent()) {
|
|
||||||
List<MinOrganization> relevantOrganizatios = new ArrayList<>();
|
|
||||||
relevantOrganizatios.add(first._2()._2().getMinOrganization());
|
|
||||||
v
|
|
||||||
.forEachRemaining(
|
|
||||||
t2 -> relevantOrganizatios
|
|
||||||
.add(t2._2()._2().getMinOrganization()));
|
|
||||||
g.setBeneficiaries(relevantOrganizatios);
|
|
||||||
}
|
|
||||||
return g;
|
|
||||||
}, Encoders.bean(Grant.class))
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath + "Grant");
|
.json(outputPath + "Grant");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static Grant dumpGrant(Project project) throws DocumentException {
|
||||||
|
Grant g = new Grant();
|
||||||
|
|
||||||
|
g.setLocal_identifier(project.getId());
|
||||||
|
g.setGrantCode(project.getCode().getValue());
|
||||||
|
g.setIdentifiers(getProjectIdentifier(project));
|
||||||
|
if (Optional.ofNullable(project.getTitle()).isPresent())
|
||||||
|
g.setTitle(project.getTitle().getValue());
|
||||||
|
g
|
||||||
|
.setSummary(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getSummary())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(new String()));
|
||||||
|
g
|
||||||
|
.setAcronym(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getAcronym())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(new String()));
|
||||||
|
if (Optional.ofNullable(project.getFundingtree()).isPresent() &&
|
||||||
|
project.getFundingtree().size() > 0) {
|
||||||
|
g.setFunder(Utils.getFunderName(project.getFundingtree().get(0).getValue()));
|
||||||
|
// * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n]
|
||||||
|
g.setFunding_stream(getFundingStream(project.getFundingtree().get(0).getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
g
|
||||||
|
.setCurrency(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getCurrency())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(new String()));
|
||||||
|
g
|
||||||
|
.setFunded_amount(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getFundedamount())
|
||||||
|
.orElse(null));
|
||||||
|
if (Optional.ofNullable(project.getSubjects()).isPresent())
|
||||||
|
g
|
||||||
|
.setKeywords(
|
||||||
|
project
|
||||||
|
.getSubjects()
|
||||||
|
.stream()
|
||||||
|
.map(s -> s.getValue())
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
g
|
||||||
|
.setStart_date(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getStartdate())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(new String()));
|
||||||
|
g
|
||||||
|
.setEnd_date(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getEnddate())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(new String()));
|
||||||
|
g
|
||||||
|
.setWebsite(
|
||||||
|
Optional
|
||||||
|
.ofNullable(project.getWebsiteurl())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(new String()));
|
||||||
|
return g;
|
||||||
|
}
|
||||||
|
|
||||||
private static String getFundingStream(String fundingtree) throws DocumentException {
|
private static String getFundingStream(String fundingtree) throws DocumentException {
|
||||||
final Document doc;
|
final Document doc;
|
||||||
|
|
||||||
doc = new SAXReader().read(new StringReader(fundingtree));
|
doc = new SAXReader().read(new StringReader(fundingtree));
|
||||||
if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
|
if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
|
||||||
doc.selectNodes("//funding_level_0").size() > 0)
|
doc.selectNodes("//funding_level_0").size() > 0 &&
|
||||||
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText();
|
Optional.ofNullable(doc.selectNodes("//funding_level_0/name")).isPresent() &&
|
||||||
|
doc.selectNodes("//funding_level_0/name").size() > 0)
|
||||||
|
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0/name").get(0))).getText();
|
||||||
return new String();
|
return new String();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,6 +11,7 @@ import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.skgif.model.Identifier;
|
import eu.dnetlib.dhp.skgif.model.Identifier;
|
||||||
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
|
import eu.dnetlib.dhp.skgif.model.OrganizationTypes;
|
||||||
import eu.dnetlib.dhp.skgif.model.Prefixes;
|
import eu.dnetlib.dhp.skgif.model.Prefixes;
|
||||||
|
@ -68,47 +70,55 @@ public class DumpOrganization implements Serializable {
|
||||||
|
|
||||||
private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
|
private static void mapOrganization(SparkSession spark, String inputPath, String outputPath) {
|
||||||
Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
|
Dataset<Organization> organizations = Utils.readPath(spark, inputPath + "organization", Organization.class);
|
||||||
organizations
|
organizations = organizations
|
||||||
.filter(
|
.filter(
|
||||||
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
|
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference()
|
||||||
&& !o.getDataInfo().getInvisible())
|
&& !o.getDataInfo().getInvisible())
|
||||||
.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
|
.groupByKey((MapFunction<Organization, String>) p -> p.getId(), Encoders.STRING())
|
||||||
eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
|
.mapGroups(
|
||||||
// organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
|
(MapGroupsFunction<String, Organization, Organization>) (k, v) -> v.next(),
|
||||||
organization.setLocal_identifier(o.getId());
|
Encoders.bean(Organization.class));
|
||||||
organization
|
|
||||||
.setCountry(
|
organizations.map((MapFunction<Organization, eu.dnetlib.dhp.skgif.model.Organization>) o -> {
|
||||||
Optional
|
if (!Optional.ofNullable(o.getPid()).isPresent() || o.getPid().size() == 0)
|
||||||
.ofNullable(o.getCountry().getClassid())
|
return null;
|
||||||
.orElse(new String()));
|
eu.dnetlib.dhp.skgif.model.Organization organization = new eu.dnetlib.dhp.skgif.model.Organization();
|
||||||
organization
|
// organization.setLocal_identifier(Utils.getIdentifier(Prefixes.ORGANIZATION, o.getId()));
|
||||||
.setName(
|
organization.setLocal_identifier(o.getId());
|
||||||
Optional
|
organization
|
||||||
.ofNullable(o.getLegalname().getValue())
|
.setCountry(
|
||||||
.orElse(new String()));
|
Optional
|
||||||
organization
|
.ofNullable(o.getCountry().getClassid())
|
||||||
.setShort_name(
|
.orElse(new String()));
|
||||||
Optional
|
organization
|
||||||
.ofNullable(o.getLegalshortname())
|
.setName(
|
||||||
.map(v -> v.getValue())
|
Optional
|
||||||
.orElse(new String()));
|
.ofNullable(o.getLegalname().getValue())
|
||||||
organization
|
.orElse(new String()));
|
||||||
.setIdentifiers(
|
organization
|
||||||
o
|
.setShort_name(
|
||||||
.getPid()
|
Optional
|
||||||
.stream()
|
.ofNullable(o.getLegalshortname())
|
||||||
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
.map(v -> v.getValue())
|
||||||
.collect(Collectors.toList()));
|
.orElse(new String()));
|
||||||
organization
|
organization
|
||||||
.setOther_names(
|
.setIdentifiers(
|
||||||
o
|
o
|
||||||
.getAlternativeNames()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.map(a -> a.getValue())
|
.map(p -> Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
organization.setType(getOrganizationType(o));
|
organization
|
||||||
return organization;
|
.setOther_names(
|
||||||
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
|
o
|
||||||
|
.getAlternativeNames()
|
||||||
|
.stream()
|
||||||
|
.map(a -> a.getValue())
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
organization.setType(getOrganizationType(o));
|
||||||
|
return organization;
|
||||||
|
}, Encoders.bean(eu.dnetlib.dhp.skgif.model.Organization.class))
|
||||||
|
.filter((FilterFunction<eu.dnetlib.dhp.skgif.model.Organization>) o -> o != null)
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
|
|
@ -0,0 +1,408 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.*;
|
||||||
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
|
import eu.dnetlib.dhp.skgif.model.*;
|
||||||
|
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 06/02/24
|
||||||
|
*/
|
||||||
|
public class DumpResearchProduct implements Serializable {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(DumpResearchProduct.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
String jsonConfiguration = IOUtils
|
||||||
|
.toString(
|
||||||
|
DumpResearchProduct.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/dump/skgif/emit_biblio_parameters.json"));
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
final String inputPath = parser.get("sourcePath");
|
||||||
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
|
final String outputPath = parser.get("outputPath");
|
||||||
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
final String workingDir = parser.get("workingDir");
|
||||||
|
log.info("workingDir: {}", workingDir);
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
Utils.removeOutputDir(spark, outputPath + "ResearchProduct");
|
||||||
|
emitFromResult(spark, inputPath, outputPath, workingDir);
|
||||||
|
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// per ogni result emetto id + journal se esiste + istanza + hosted by dell'istanza
|
||||||
|
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
|
||||||
|
String workingDir) {
|
||||||
|
dumpResearchProduct(spark, inputPath, workingDir, outputPath);
|
||||||
|
moveDumpedProducts(spark, workingDir, outputPath);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void moveDumpedProducts(SparkSession spark, String workingDir, String outputPath) {
|
||||||
|
Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
|
||||||
|
for (EntityType e : ModelSupport.entityTypes.keySet()) {
|
||||||
|
if (ModelSupport.isResult(e))
|
||||||
|
researchProducts = researchProducts
|
||||||
|
.union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class));
|
||||||
|
}
|
||||||
|
researchProducts
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath + "ResearchProduct");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <R extends Result> void dumpResearchProduct(SparkSession spark, String inputPath, String workingDir,
|
||||||
|
String outputPath) {
|
||||||
|
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||||
|
if (ModelSupport.isResult(e)) {
|
||||||
|
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||||
|
|
||||||
|
if (e.name().equalsIgnoreCase("publication")) {
|
||||||
|
dumpPublication(spark, inputPath, workingDir, e, resultClazz);
|
||||||
|
} else {
|
||||||
|
dumpOtherResults(spark, inputPath, workingDir, e, resultClazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
includeRelevantOrganization(spark, workingDir, e);
|
||||||
|
includeFunding(spark, workingDir, e);
|
||||||
|
includeRelatedProducts(spark, workingDir, e);
|
||||||
|
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void includeRelatedProducts(SparkSession spark, String workingDir, EntityType e) {
|
||||||
|
Dataset<ResearchProduct> pprWitGrants = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(ResearchProduct.class).schema())
|
||||||
|
.json(workingDir + e.name() + "/temp_researchproductgrant")
|
||||||
|
.as(Encoders.bean(ResearchProduct.class));
|
||||||
|
Dataset<ProductsRelation> relatedResults = Utils
|
||||||
|
.readPath(spark, workingDir + "/relations/related_products", ProductsRelation.class);
|
||||||
|
pprWitGrants
|
||||||
|
.joinWith(
|
||||||
|
relatedResults, pprWitGrants.col("local_identifier").equalTo(relatedResults.col("resultId")),
|
||||||
|
"left")
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<ResearchProduct, ProductsRelation>, ResearchProduct>) t2 -> {
|
||||||
|
if (t2._2() == null)
|
||||||
|
return t2._1();
|
||||||
|
t2._1().setRelated_products(t2._2().getRelated_products());
|
||||||
|
return t2._1();
|
||||||
|
|
||||||
|
}, Encoders.bean(ResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/researchproduct");
|
||||||
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_researchproductgrant");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void includeFunding(SparkSession spark, String workingDir, EntityType e) {
|
||||||
|
Dataset<ResearchProduct> prrWithAffiliation = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(ResearchProduct.class).schema())
|
||||||
|
.json(workingDir + e.name() + "/temp_researchproductaff")
|
||||||
|
.as(Encoders.bean(ResearchProduct.class));
|
||||||
|
|
||||||
|
Dataset<GrantRelation> grants = Utils
|
||||||
|
.readPath(spark, workingDir + "relations/funding", GrantRelation.class);
|
||||||
|
|
||||||
|
// Dataset<PartialResearchProduct> pprWitGrants =
|
||||||
|
prrWithAffiliation
|
||||||
|
.joinWith(
|
||||||
|
grants, prrWithAffiliation.col("local_identifier").equalTo(grants.col("resultId")), "left")
|
||||||
|
.map((MapFunction<Tuple2<ResearchProduct, GrantRelation>, ResearchProduct>) t2 -> {
|
||||||
|
if (t2._2() == null)
|
||||||
|
return t2._1();
|
||||||
|
t2._1().setFunding(t2._2().getFunding());
|
||||||
|
return t2._1();
|
||||||
|
}, Encoders.bean(ResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/temp_researchproductgrant");
|
||||||
|
|
||||||
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_researchproductaff");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void includeRelevantOrganization(SparkSession spark, String workingDir, EntityType e) {
|
||||||
|
Dataset<ExtendingOrganization> affiliations = Utils
|
||||||
|
.readPath(
|
||||||
|
spark, workingDir + "relations/result_relevant_organizations", ExtendingOrganization.class);
|
||||||
|
Dataset<ResearchProduct> partialResearchProduct = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(ResearchProduct.class).schema())
|
||||||
|
.json(workingDir + e.name() + "/temp_researchProduct")
|
||||||
|
.as(Encoders.bean(ResearchProduct.class));
|
||||||
|
// Dataset<PartialResearchProduct> prrWithAffiliation =
|
||||||
|
partialResearchProduct
|
||||||
|
.joinWith(
|
||||||
|
affiliations,
|
||||||
|
partialResearchProduct.col("local_identifier").equalTo(affiliations.col("entityId")),
|
||||||
|
"left")
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<ResearchProduct, ExtendingOrganization>, ResearchProduct>) t2 -> {
|
||||||
|
if (t2._2() == null)
|
||||||
|
return t2._1();
|
||||||
|
t2._1().setRelevant_organizations(t2._2().getRelevant_organization());
|
||||||
|
return t2._1();
|
||||||
|
}, Encoders.bean(ResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/temp_researchproductaff");
|
||||||
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_researchProduct");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <R extends Result> void dumpOtherResults(SparkSession spark, String inputPath, String workingDir, EntityType e, Class<R> resultClazz) {
|
||||||
|
Dataset<R> results = Utils.readPath(spark, inputPath + e.name(), resultClazz);
|
||||||
|
results.map((MapFunction<R, ResearchProduct>) r -> {
|
||||||
|
ArrayList<String> journalHbIds = new ArrayList<>();
|
||||||
|
|
||||||
|
ResearchProduct rp = ResultMapper.map(r);
|
||||||
|
rp
|
||||||
|
.setManifestations(
|
||||||
|
r
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.map(i -> getManifestation(i, journalHbIds, r))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
return rp;
|
||||||
|
}, Encoders.bean(ResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/temp_researchProduct");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <R extends Result> void dumpPublication(SparkSession spark, String inputPath, String workingDir, EntityType e, Class<R> resultClazz) {
|
||||||
|
Dataset<Tuple2<String, String>> resultHostedBy = Utils
|
||||||
|
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||||
|
.flatMap(
|
||||||
|
(FlatMapFunction<R, Tuple2<String, String>>) p -> p
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.map(i -> new Tuple2<>(p.getId(), i.getHostedby().getKey()))
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.iterator(),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
||||||
|
|
||||||
|
Dataset<Row> journalIds = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Datasource.class).schema())
|
||||||
|
.json(inputPath + "/datasource")
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"eoscdatasourcetype.classid == 'Journal archive' ")
|
||||||
|
.select("id");
|
||||||
|
|
||||||
|
Dataset<Row> journalHostedByPerResult = resultHostedBy
|
||||||
|
.join(
|
||||||
|
journalIds,
|
||||||
|
resultHostedBy.col("_2").equalTo(journalIds.col("id")), "leftsemi")
|
||||||
|
.selectExpr("_1 as id", "_2 as journalHostedBy");
|
||||||
|
Dataset<Publication> results = Utils.readPath(spark, inputPath + e.name(), Publication.class);
|
||||||
|
results
|
||||||
|
.joinWith(
|
||||||
|
journalHostedByPerResult, results
|
||||||
|
.col("id")
|
||||||
|
.equalTo(journalHostedByPerResult.col("id")),
|
||||||
|
"left")
|
||||||
|
.groupByKey(
|
||||||
|
(MapFunction<Tuple2<Publication, Row>, String>) t2 -> t2._1().getId(), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Tuple2<Publication, Row>, ResearchProduct>) (k, v) -> {
|
||||||
|
ArrayList<String> journalHbIds = new ArrayList<>();
|
||||||
|
Tuple2<Publication, Row> first = v.next();
|
||||||
|
if (Optional.ofNullable(first._2()).isPresent())
|
||||||
|
journalHbIds.add(first._2().getAs("journalHostedBy"));
|
||||||
|
v.forEachRemaining(value -> journalHbIds.add(value._2().getAs("journalHostedBy")));
|
||||||
|
Publication p = first._1();
|
||||||
|
ResearchProduct rp = ResultMapper.map(p);
|
||||||
|
rp
|
||||||
|
.setManifestations(
|
||||||
|
p
|
||||||
|
.getInstance()
|
||||||
|
.stream()
|
||||||
|
.map(i -> getManifestation(i, journalHbIds, p))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
return rp;
|
||||||
|
}, Encoders.bean(ResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/temp_researchProduct");
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
private static <R extends Result> Manifestation getManifestation(Instance i, ArrayList<String> journalHbIds, R p) {
|
||||||
|
Manifestation m = new Manifestation();
|
||||||
|
m.setProduct_local_type(i.getInstancetype().getClassname());
|
||||||
|
m.setProduct_local_type_schema(i.getInstancetype().getSchemename());
|
||||||
|
m.setPeer_review(getPeerReviewd(i));
|
||||||
|
m.setAccess_right(getAccessRigth(i));
|
||||||
|
m
|
||||||
|
.setLicence(
|
||||||
|
getLicence(i));
|
||||||
|
if (Optional.ofNullable(i.getUrl()).isPresent() && i.getUrl().size() > 0)
|
||||||
|
m.setUrl(i.getUrl().get(0));
|
||||||
|
else
|
||||||
|
m.setUrl(null);
|
||||||
|
if (Optional.ofNullable(i.getPid()).isPresent() && i.getPid().size() > 0) {
|
||||||
|
m.setPid(i.getPid().get(0).getValue());
|
||||||
|
}
|
||||||
|
if (Optional.ofNullable(i.getDateofacceptance()).isPresent())
|
||||||
|
m
|
||||||
|
.setDates(
|
||||||
|
Arrays
|
||||||
|
.asList(
|
||||||
|
Dates.newInstance(i.getDateofacceptance().getValue(), "publishing")));
|
||||||
|
|
||||||
|
if (p instanceof Publication) {
|
||||||
|
if (journalHbIds.contains(i.getHostedby().getKey())
|
||||||
|
&& Optional.ofNullable(((Publication) p).getJournal()).isPresent()) {
|
||||||
|
Biblio biblio = getBiblio(((Publication) p).getJournal());
|
||||||
|
if (Optional.ofNullable(p.getPublisher()).isPresent())
|
||||||
|
biblio.setPublisher(p.getPublisher().getValue());
|
||||||
|
m.setBiblio(biblio);
|
||||||
|
if (Optional.ofNullable(((Publication) p).getJournal().getIssnPrinted()).isPresent())
|
||||||
|
m
|
||||||
|
.setVenue(
|
||||||
|
MinVenue
|
||||||
|
.newInstance(
|
||||||
|
Utils
|
||||||
|
.getIdentifier(Prefixes.VENUE, ((Publication) p).getJournal().getIssnPrinted()),
|
||||||
|
i.getHostedby().getValue()));
|
||||||
|
else if (Optional.ofNullable(((Publication) p).getJournal().getIssnOnline()).isPresent())
|
||||||
|
m
|
||||||
|
.setVenue(
|
||||||
|
MinVenue
|
||||||
|
.newInstance(
|
||||||
|
Utils.getIdentifier(Prefixes.VENUE, ((Publication) p).getJournal().getIssnOnline()),
|
||||||
|
i.getHostedby().getValue()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m
|
||||||
|
.setHosting_datasource(
|
||||||
|
MinVenue
|
||||||
|
.newInstance(
|
||||||
|
// Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()),
|
||||||
|
i.getHostedby().getKey(),
|
||||||
|
i.getHostedby().getValue()));
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Biblio getBiblio(Journal epm) {
|
||||||
|
Biblio biblio = new Biblio();
|
||||||
|
if (Optional.ofNullable(epm.getEdition()).isPresent())
|
||||||
|
biblio.setEdition(epm.getEdition());
|
||||||
|
if (Optional.ofNullable(epm.getIss()).isPresent())
|
||||||
|
biblio.setIssue(epm.getIss());
|
||||||
|
if (Optional.ofNullable(epm.getVol()).isPresent())
|
||||||
|
biblio.setVolume(epm.getVol());
|
||||||
|
if (Optional.ofNullable(epm.getEp()).isPresent())
|
||||||
|
biblio.setEnd_page(epm.getEp());
|
||||||
|
if (Optional.ofNullable(epm.getSp()).isPresent())
|
||||||
|
biblio.setStart_page(epm.getSp());
|
||||||
|
return biblio;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
private static String getLicence(Instance i) {
|
||||||
|
return Optional
|
||||||
|
.ofNullable(i.getLicense())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getAccessRigth(Instance i) {
|
||||||
|
if (Optional.ofNullable(i.getAccessright()).isPresent())
|
||||||
|
switch (i.getAccessright().getClassid()) {
|
||||||
|
case "OPEN":
|
||||||
|
case "OPEN DATA":
|
||||||
|
case "OPEN SOURCE":
|
||||||
|
return AccessRight.OPEN.label;
|
||||||
|
|
||||||
|
case "CLOSED":
|
||||||
|
return AccessRight.CLOSED.label;
|
||||||
|
|
||||||
|
case "RESTRICTED":
|
||||||
|
return AccessRight.RESTRICTED.label;
|
||||||
|
|
||||||
|
case "EMBARGO":
|
||||||
|
case "12MONTHS":
|
||||||
|
case "6MONTHS":
|
||||||
|
return AccessRight.EMBARGO.label;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return AccessRight.UNAVAILABLE.label;
|
||||||
|
|
||||||
|
}
|
||||||
|
return AccessRight.UNAVAILABLE.label;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getPeerReviewd(Instance i) {
|
||||||
|
if (Optional.ofNullable(i.getRefereed()).isPresent())
|
||||||
|
|
||||||
|
switch (i.getRefereed().getClassid()) {
|
||||||
|
case "0000":
|
||||||
|
return PeerReview.UNAVAILABLE.label;
|
||||||
|
|
||||||
|
case "0001":
|
||||||
|
return PeerReview.PEER_REVIEWED.label;
|
||||||
|
|
||||||
|
case "0002":
|
||||||
|
return PeerReview.NON_PEER_REVIEWED.label;
|
||||||
|
}
|
||||||
|
return PeerReview.UNAVAILABLE.label;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -8,6 +8,8 @@ import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import javax.xml.crypto.Data;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
@ -30,6 +32,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.skgif.model.*;
|
import eu.dnetlib.dhp.skgif.model.*;
|
||||||
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
||||||
|
import eu.dnetlib.dhp.skgif.model.Organization;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -87,7 +90,7 @@ public class DumpResult implements Serializable {
|
||||||
|
|
||||||
// selection of the relevant relations from result type to other entity. Only teh semantic relevant ones are
|
// selection of the relevant relations from result type to other entity. Only teh semantic relevant ones are
|
||||||
// considered
|
// considered
|
||||||
selectRelations(spark, inputPath, workingDir);
|
// selectRelations(spark, inputPath, workingDir);
|
||||||
|
|
||||||
// merge of relations and manifestation for the same result
|
// merge of relations and manifestation for the same result
|
||||||
getRelationAndManifestation(spark, workingDir, inputPath);
|
getRelationAndManifestation(spark, workingDir, inputPath);
|
||||||
|
@ -138,8 +141,18 @@ public class DumpResult implements Serializable {
|
||||||
// }
|
// }
|
||||||
|
|
||||||
private static void getRelationAndManifestation(SparkSession spark, String workingDir, String inputPath) {
|
private static void getRelationAndManifestation(SparkSession spark, String workingDir, String inputPath) {
|
||||||
Dataset<RelationPerProduct> aggRelations = Utils
|
// Dataset<RelationPerProduct> aggRelations = Utils
|
||||||
.readPath(spark, workingDir + "aggrelation", RelationPerProduct.class);
|
// .readPath(spark, workingDir + "aggrelation", RelationPerProduct.class);
|
||||||
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add(
|
||||||
|
"eoscdatasourcetype", new StructType()
|
||||||
|
.add("classid", DataTypes.StringType))
|
||||||
|
.add("id", DataTypes.StringType)
|
||||||
|
|
||||||
|
;
|
||||||
|
|
||||||
ModelSupport.entityTypes
|
ModelSupport.entityTypes
|
||||||
.keySet()
|
.keySet()
|
||||||
|
@ -147,165 +160,194 @@ public class DumpResult implements Serializable {
|
||||||
.filter(ModelSupport::isResult)
|
.filter(ModelSupport::isResult)
|
||||||
.forEach(e -> {
|
.forEach(e -> {
|
||||||
Utils.removeOutputDir(spark, workingDir + e.name() + "/partialresearchproduct");
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/partialresearchproduct");
|
||||||
|
log.info("executing on {}", e.name());
|
||||||
|
Dataset<Row> datasource = spark
|
||||||
|
.read()
|
||||||
|
.schema(rp)
|
||||||
|
.json(inputPath + "/datasource")
|
||||||
|
.filter(("datainfo.deletedbyinference != true and eoscdatasourcetype.classid == 'Journal archive'"))
|
||||||
|
.drop("datainfo", "eoscdatasourcetype");
|
||||||
|
|
||||||
Dataset<Datasource> datasource = Utils
|
// Dataset<EmitPerManifestation> man = Utils
|
||||||
.readPath(spark, inputPath + "/datasource", Datasource.class)
|
// .readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);
|
||||||
.filter(
|
Dataset<Row> man = spark
|
||||||
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEoscdatasourcetype()).isPresent() &&
|
.read()
|
||||||
d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
|
.schema(Encoders.bean(EmitPerManifestation.class).schema())
|
||||||
|
.json(workingDir + e.name() + "/manifestation");
|
||||||
|
// Dataset<PartialResearchProduct> partialResearchProduct =
|
||||||
|
|
||||||
Dataset<EmitPerManifestation> man = Utils
|
man
|
||||||
.readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);
|
.joinWith(datasource, man.col("hostedby").equalTo(datasource.col("id")), "left")
|
||||||
|
|
||||||
Dataset<PartialResearchProduct> partialResearchProduct = man
|
|
||||||
.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
|
|
||||||
.groupByKey(
|
.groupByKey(
|
||||||
(MapFunction<Tuple2<EmitPerManifestation, Datasource>, String>) t2 -> t2._1().getResultId(),
|
(MapFunction<Tuple2<Row, Row>, String>) t2 -> t2._1().getAs("resultId"),
|
||||||
Encoders.STRING())
|
Encoders.STRING())
|
||||||
.mapGroups(
|
.mapGroups(
|
||||||
(MapGroupsFunction<String, Tuple2<EmitPerManifestation, Datasource>, PartialResearchProduct>) (
|
(MapGroupsFunction<String, Tuple2<Row, Row>, PartialResearchProduct>) (
|
||||||
k, v) -> {
|
k, v) -> {
|
||||||
PartialResearchProduct prp = new PartialResearchProduct();
|
PartialResearchProduct prp = new PartialResearchProduct();
|
||||||
prp.setResultId(k);
|
prp.setResultId(k);
|
||||||
List<Manifestation> manifestationList = new ArrayList<>();
|
List<Manifestation> manifestationList = new ArrayList<>();
|
||||||
while (v.hasNext())
|
Tuple2<Row, Row> first = v.next();
|
||||||
manifestationList.add(getManifestation(v.next()));
|
manifestationList.add(getManifestation(first));
|
||||||
|
v.forEachRemaining(value -> manifestationList.add(getManifestation(value)));
|
||||||
prp.setManifestations(manifestationList);
|
prp.setManifestations(manifestationList);
|
||||||
return prp;
|
return prp;
|
||||||
}, Encoders.bean(PartialResearchProduct.class));
|
}, Encoders.bean(PartialResearchProduct.class))
|
||||||
partialResearchProduct
|
.write()
|
||||||
.joinWith(
|
.mode(SaveMode.Overwrite)
|
||||||
aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")),
|
.option("compression", "gzip")
|
||||||
"left")
|
.json(workingDir + e.name() + "/temp_partitalresearchproduct");
|
||||||
.map(
|
|
||||||
(MapFunction<Tuple2<PartialResearchProduct, RelationPerProduct>, PartialResearchProduct>) t2 -> {
|
Dataset<PartialResearchProduct> partialResearchProduct = spark
|
||||||
PartialResearchProduct prp = t2._1();
|
.read()
|
||||||
if (Optional.ofNullable(t2._2()).isPresent()) {
|
.schema(Encoders.bean(PartialResearchProduct.class).schema())
|
||||||
prp
|
.json(workingDir + e.name() + "/temp_partitalresearchproduct")
|
||||||
.setRelated_products(
|
.as(Encoders.bean(PartialResearchProduct.class));
|
||||||
t2
|
|
||||||
._2()
|
Dataset<ExtendingOrganization> affiliations = Utils
|
||||||
.getRelatedProduct()
|
.readPath(
|
||||||
.keySet()
|
spark, workingDir + "relations/result_relevant_organizations", ExtendingOrganization.class);
|
||||||
.stream()
|
// Dataset<PartialResearchProduct> prrWithAffiliation =
|
||||||
.map(
|
partialResearchProduct
|
||||||
key -> Relations.newInstance(key, t2._2().getRelatedProduct().get(key)))
|
.joinWith(
|
||||||
.collect(Collectors.toList()));
|
affiliations, partialResearchProduct.col("resultId").equalTo(affiliations.col("entityId")),
|
||||||
prp.setRelevant_organizations(t2._2().getOrganizations());
|
"left")
|
||||||
prp.setFunding(t2._2().getFunding());
|
.map(
|
||||||
}
|
(MapFunction<Tuple2<PartialResearchProduct, ExtendingOrganization>, PartialResearchProduct>) t2 -> {
|
||||||
return prp;
|
if (t2._2() == null)
|
||||||
|
return t2._1();
|
||||||
|
t2._1().setRelevant_organizations(t2._2().getRelevant_organization());
|
||||||
|
return t2._1();
|
||||||
|
}, Encoders.bean(PartialResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/temp_partitalresearchproductaff");
|
||||||
|
|
||||||
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_partitalresearchproduct");
|
||||||
|
|
||||||
|
Dataset<PartialResearchProduct> prrWithAffiliation = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(PartialResearchProduct.class).schema())
|
||||||
|
.json(workingDir + e.name() + "/temp_partitalresearchproductaff")
|
||||||
|
.as(Encoders.bean(PartialResearchProduct.class));
|
||||||
|
Dataset<GrantRelation> grants = Utils
|
||||||
|
.readPath(spark, workingDir + "relations/funding", GrantRelation.class);
|
||||||
|
|
||||||
|
// Dataset<PartialResearchProduct> pprWitGrants =
|
||||||
|
prrWithAffiliation
|
||||||
|
.joinWith(grants, prrWithAffiliation.col("resultId").equalTo(grants.col("resultId")), "left")
|
||||||
|
.map((MapFunction<Tuple2<PartialResearchProduct, GrantRelation>, PartialResearchProduct>) t2 -> {
|
||||||
|
if (t2._2() == null)
|
||||||
|
return t2._1();
|
||||||
|
t2._1().setFunding(t2._2().getFunding());
|
||||||
|
return t2._1();
|
||||||
|
}, Encoders.bean(PartialResearchProduct.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + e.name() + "/temp_partitalresearchproductgrant");
|
||||||
|
|
||||||
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_partitalresearchproductaff");
|
||||||
|
Dataset<PartialResearchProduct> pprWitGrants = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(PartialResearchProduct.class).schema())
|
||||||
|
.json(workingDir + e.name() + "/temp_partitalresearchproductgrant")
|
||||||
|
.as(Encoders.bean(PartialResearchProduct.class));
|
||||||
|
Dataset<ProductsRelation> relatedResults = Utils
|
||||||
|
.readPath(spark, workingDir + "/relations/related_products", ProductsRelation.class);
|
||||||
|
pprWitGrants
|
||||||
|
.joinWith(
|
||||||
|
relatedResults, pprWitGrants.col("resultId").equalTo(relatedResults.col("resultId")),
|
||||||
|
"left")
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<PartialResearchProduct, ProductsRelation>, PartialResearchProduct>) t2 -> {
|
||||||
|
if (t2._2() == null)
|
||||||
|
return t2._1();
|
||||||
|
t2._1().setRelated_products(t2._2().getRelated_products());
|
||||||
|
return t2._1();
|
||||||
|
|
||||||
}, Encoders.bean(PartialResearchProduct.class))
|
}, Encoders.bean(PartialResearchProduct.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(workingDir + e.name() + "/partialresearchproduct");
|
.json(workingDir + e.name() + "/partialresearchproduct");
|
||||||
|
Utils.removeOutputDir(spark, workingDir + e.name() + "/temp_partitalresearchproductgrant");
|
||||||
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Manifestation getManifestation(Tuple2<EmitPerManifestation, Datasource> t2) {
|
private static Manifestation getManifestation(Tuple2<Row, Row> t2) {
|
||||||
|
|
||||||
// se il lato sinistro c'e' allora ho la biblio e la venue
|
// se il lato sinistro c'e' allora ho la biblio e la venue
|
||||||
// se non c'e' allora ho solo gli altri valori
|
// se non c'e' allora ho solo gli altri valori
|
||||||
EmitPerManifestation epm = t2._1();
|
Row epm = t2._1();
|
||||||
Manifestation manifestation = new Manifestation();
|
Manifestation manifestation = new Manifestation();
|
||||||
manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
|
manifestation.setProduct_local_type(epm.getAs("product_local_type"));
|
||||||
manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
|
manifestation.setProduct_local_type_schema(epm.getAs("product_local_type_schema"));
|
||||||
if (Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
|
if (Optional.ofNullable(epm.getAs("publishing_date")).isPresent())
|
||||||
manifestation
|
manifestation
|
||||||
.setDates(
|
.setDates(
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
|
Dates.newInstance(epm.getAs("publishing_date"), "publishing")));
|
||||||
if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
|
manifestation.setPeer_review(epm.getAs("peer_reviewed"));
|
||||||
switch (epm.getInstance().getRefereed().getClassid()) {
|
|
||||||
case "0000":
|
|
||||||
manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
|
|
||||||
break;
|
|
||||||
case "0001":
|
|
||||||
manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
|
|
||||||
break;
|
|
||||||
case "0002":
|
|
||||||
manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
manifestation.setMetadata_curation("unavailable");
|
manifestation.setMetadata_curation("unavailable");
|
||||||
if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
|
manifestation.setAccess_right(epm.getAs("access_right"));
|
||||||
switch (epm.getInstance().getAccessright().getClassid()) {
|
manifestation.setLicence(epm.getAs("licence"));
|
||||||
case "OPEN":
|
manifestation.setUrl(epm.getAs("url"));
|
||||||
case "OPEN DATA":
|
manifestation.setPid(epm.getAs("pid"));
|
||||||
case "OPEN SOURCE":
|
|
||||||
manifestation.setAccess_right(AccessRight.OPEN.label);
|
|
||||||
break;
|
|
||||||
case "CLOSED":
|
|
||||||
manifestation.setAccess_right(AccessRight.CLOSED.label);
|
|
||||||
break;
|
|
||||||
case "RESTRICTED":
|
|
||||||
manifestation.setAccess_right(AccessRight.RESTRICTED.label);
|
|
||||||
break;
|
|
||||||
case "EMBARGO":
|
|
||||||
case "12MONTHS":
|
|
||||||
case "6MONTHS":
|
|
||||||
manifestation.setAccess_right(AccessRight.EMBARGO.label);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
manifestation
|
|
||||||
.setLicence(
|
|
||||||
Optional
|
|
||||||
.ofNullable(epm.getInstance().getLicense())
|
|
||||||
.map(value -> value.getValue())
|
|
||||||
.orElse(null));
|
|
||||||
if (Optional.ofNullable(epm.getInstance().getUrl()).isPresent() && epm.getInstance().getUrl().size() > 0)
|
|
||||||
manifestation
|
|
||||||
.setUrl(epm.getInstance().getUrl().get(0));
|
|
||||||
else
|
|
||||||
manifestation.setUrl(null);
|
|
||||||
|
|
||||||
if (Optional.ofNullable(epm.getInstance().getPid()).isPresent() && epm.getInstance().getPid().size() > 0) {
|
|
||||||
manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
|
|
||||||
}
|
|
||||||
if (Optional.ofNullable(t2._2()).isPresent()) {
|
if (Optional.ofNullable(t2._2()).isPresent()) {
|
||||||
|
Biblio biblio = getBiblio(epm);
|
||||||
|
// if (biblio == null)
|
||||||
|
// log.info("null biblio fo {} ", epm.getAs("resultId"));
|
||||||
manifestation.setBiblio(getBiblio(epm));
|
manifestation.setBiblio(getBiblio(epm));
|
||||||
if (Optional.ofNullable(t2._2().getJournal().getIssnPrinted()).isPresent())
|
if (Optional.ofNullable(epm.getAs("journal")).isPresent() &&
|
||||||
|
Optional.ofNullable(epm.getAs("journal.issnPrinted")).isPresent())
|
||||||
manifestation
|
manifestation
|
||||||
.setVenue(
|
.setVenue(
|
||||||
MinVenue
|
MinVenue
|
||||||
.newInstance(
|
.newInstance(
|
||||||
Utils.getIdentifier(Prefixes.VENUE, t2._2().getJournal().getIssnPrinted()),
|
Utils.getIdentifier(Prefixes.VENUE, epm.getAs("journal.issnPrinted")),
|
||||||
t2._1().getJournal().getName()));
|
epm.getAs("hostedbyvalue")));
|
||||||
else if (Optional.ofNullable(t2._2().getJournal().getIssnOnline()).isPresent())
|
else if (Optional.ofNullable(epm.getAs("journal")).isPresent() &&
|
||||||
|
Optional.ofNullable(epm.getAs("journal.issnOnline")).isPresent())
|
||||||
manifestation
|
manifestation
|
||||||
.setVenue(
|
.setVenue(
|
||||||
MinVenue
|
MinVenue
|
||||||
.newInstance(
|
.newInstance(
|
||||||
Utils.getIdentifier(Prefixes.VENUE, t2._1().getJournal().getIssnOnline()),
|
Utils.getIdentifier(Prefixes.VENUE, epm.getAs("journal.issnOnline")),
|
||||||
t2._1().getJournal().getName()));
|
epm.getAs("hostedbyvalue")));
|
||||||
}
|
}
|
||||||
manifestation
|
manifestation
|
||||||
.setHosting_datasource(
|
.setHosting_datasource(
|
||||||
MinVenue
|
MinVenue
|
||||||
.newInstance(
|
.newInstance(
|
||||||
// Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()),
|
// Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()),
|
||||||
epm.getInstance().getHostedby().getKey(),
|
epm.getAs("hostedBy"),
|
||||||
epm.getInstance().getHostedby().getValue()));
|
epm.getAs("hostedbyvalue")));
|
||||||
|
|
||||||
return manifestation;
|
return manifestation;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Biblio getBiblio(EmitPerManifestation epm) {
|
private static Biblio getBiblio(Row epm) {
|
||||||
Biblio biblio = new Biblio();
|
Biblio biblio = new Biblio();
|
||||||
biblio.setEdition(epm.getJournal().getEdition());
|
if (!Optional.ofNullable(epm.getAs("journal")).isPresent()) {
|
||||||
biblio.setIssue(epm.getJournal().getIss());
|
return null;
|
||||||
biblio.setPublisher(epm.getPublisher());
|
}
|
||||||
biblio.setVolume(epm.getJournal().getVol());
|
if (Optional.ofNullable(epm.getAs("journal.edition")).isPresent())
|
||||||
biblio.setEnd_page(epm.getJournal().getEp());
|
biblio.setEdition(epm.getAs("journal.edition"));
|
||||||
biblio.setStart_page(epm.getJournal().getSp());
|
if (Optional.ofNullable(epm.getAs("journal.iss")).isPresent())
|
||||||
|
biblio.setIssue(epm.getAs("journal.iss"));
|
||||||
|
if (Optional.ofNullable(epm.getAs("publisher")).isPresent())
|
||||||
|
biblio.setPublisher(epm.getAs("publisher"));
|
||||||
|
if (Optional.ofNullable(epm.getAs("journal.vol")).isPresent())
|
||||||
|
biblio.setVolume(epm.getAs("journal.vol"));
|
||||||
|
if (Optional.ofNullable(epm.getAs("journal.ep")).isPresent())
|
||||||
|
biblio.setEnd_page(epm.getAs("journal.ep"));
|
||||||
|
if (Optional.ofNullable(epm.getAs("journal.sp")).isPresent())
|
||||||
|
biblio.setStart_page(epm.getAs("journal.sp"));
|
||||||
return biblio;
|
return biblio;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -369,6 +411,25 @@ public class DumpResult implements Serializable {
|
||||||
RelationType.PART.label,
|
RelationType.PART.label,
|
||||||
RelationType.VERSION.label);
|
RelationType.VERSION.label);
|
||||||
|
|
||||||
|
Dataset<Row> relation = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Relation.class).schema())
|
||||||
|
.json(inputPath + "relation")
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"relClass == 'hasAuthorInstitution")
|
||||||
|
.select("source", "target");
|
||||||
|
|
||||||
|
Dataset<Row> organization = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Organization.class).schema())
|
||||||
|
.json(inputPath + "organization")
|
||||||
|
.filter("datainfo.deletedbyinference != true")
|
||||||
|
.select("id", "pid", "legalname.value");
|
||||||
|
|
||||||
|
// result = spark.read().schema(Encoders.bean(Result.class).schema())
|
||||||
|
// .json(inputPath + )
|
||||||
|
|
||||||
// relationsProducts
|
// relationsProducts
|
||||||
// .stream()
|
// .stream()
|
||||||
// .forEach(r -> buildRelationPerProducts(spark, inputPath, workingDir, r));
|
// .forEach(r -> buildRelationPerProducts(spark, inputPath, workingDir, r));
|
||||||
|
|
|
@ -85,6 +85,8 @@ public class DumpVenue implements Serializable {
|
||||||
manifestationDataset, datasourceDataset.col("id").equalTo(manifestationDataset.col("hostedby")),
|
manifestationDataset, datasourceDataset.col("id").equalTo(manifestationDataset.col("hostedby")),
|
||||||
"left")
|
"left")
|
||||||
.map((MapFunction<Tuple2<Datasource, EmitPerManifestation>, Venue>) t2 -> {
|
.map((MapFunction<Tuple2<Datasource, EmitPerManifestation>, Venue>) t2 -> {
|
||||||
|
if (!Optional.ofNullable(t2._1().getJournal()).isPresent())
|
||||||
|
return null;
|
||||||
Venue venue = new Venue();
|
Venue venue = new Venue();
|
||||||
Datasource d = t2._1();
|
Datasource d = t2._1();
|
||||||
if (Optional.ofNullable(d.getJournal()).isPresent()
|
if (Optional.ofNullable(d.getJournal()).isPresent()
|
||||||
|
@ -105,6 +107,7 @@ public class DumpVenue implements Serializable {
|
||||||
venue.setContributions(null);
|
venue.setContributions(null);
|
||||||
return venue;
|
return venue;
|
||||||
}, Encoders.bean(Venue.class))
|
}, Encoders.bean(Venue.class))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
|
|
@ -15,20 +15,18 @@ import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.types.StructType;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
|
||||||
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EncloseMinElement;
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
import eu.dnetlib.dhp.skgif.model.*;
|
import eu.dnetlib.dhp.skgif.model.*;
|
||||||
|
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -72,9 +70,9 @@ public class EmitFromEntities implements Serializable {
|
||||||
spark -> {
|
spark -> {
|
||||||
Utils.removeOutputDir(spark, outputPath);
|
Utils.removeOutputDir(spark, outputPath);
|
||||||
emitFromResult(spark, inputPath, outputPath, workingDir);
|
emitFromResult(spark, inputPath, outputPath, workingDir);
|
||||||
emitFromDatasource(spark, inputPath, workingDir);
|
// emitFromDatasource(spark, inputPath, workingDir);
|
||||||
emitFromOrganization(spark, inputPath, workingDir);
|
// emitFromOrganization(spark, inputPath, workingDir);
|
||||||
emitFromProject(spark, inputPath, workingDir);
|
// emitFromProject(spark, inputPath, workingDir);
|
||||||
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -196,13 +194,53 @@ public class EmitFromEntities implements Serializable {
|
||||||
// per ogni result emetto id + journal se esiste + istanza + hosted by dell'istanza
|
// per ogni result emetto id + journal se esiste + istanza + hosted by dell'istanza
|
||||||
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
|
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
|
||||||
String workingDir) {
|
String workingDir) {
|
||||||
emitManifestation(spark, inputPath, workingDir);
|
// emitManifestation(spark, inputPath, workingDir);
|
||||||
emitPerson(spark, inputPath, outputPath, workingDir);
|
emitPerson(spark, inputPath, outputPath, workingDir);
|
||||||
emitTopic(spark, inputPath, outputPath, workingDir);
|
emitTopic(spark, inputPath, outputPath, workingDir);
|
||||||
emitMinProduct(spark, inputPath, workingDir);
|
emitDatasourcePublisher(spark, inputPath, workingDir);
|
||||||
|
// emitMinProduct(spark, inputPath, workingDir);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//the publisher is at the level of the result as well as the information for the journal. We do not know which instance
|
||||||
|
// hostedby.key is the one for the journal
|
||||||
|
private static void emitDatasourcePublisher(SparkSession spark, String inputPath, String workingDir) {
|
||||||
|
Dataset<Row> journalIds = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Datasource.class).schema())
|
||||||
|
.json((inputPath + "datasource"))
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference !=true false and " +
|
||||||
|
"eoscdatasourcetype.classid == 'Journal archive' ")
|
||||||
|
.select("id");
|
||||||
|
|
||||||
|
Dataset<Publication> result = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Publication.class).schema())
|
||||||
|
.json(inputPath + "publication")
|
||||||
|
.filter("datainfo.deletedbyinference != true ")
|
||||||
|
.as(Encoders.bean(Publication.class));
|
||||||
|
|
||||||
|
Dataset<Row> datasourcePublisher = result.flatMap((FlatMapFunction<Publication, Tuple2<String, String>>) r -> {
|
||||||
|
ArrayList<Tuple2<String, String>> dsPub = new ArrayList<>();
|
||||||
|
if (Optional.ofNullable(r.getJournal()).isPresent() &&
|
||||||
|
Optional.ofNullable(r.getPublisher()).isPresent()) {
|
||||||
|
for (Instance i : r.getInstance())
|
||||||
|
dsPub.add(new Tuple2<>(i.getHostedby().getKey(), r.getPublisher().getValue()));
|
||||||
|
}
|
||||||
|
return dsPub.iterator();
|
||||||
|
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||||
|
.selectExpr("_1 as hostedby", "_2 as publisher");
|
||||||
|
|
||||||
|
datasourcePublisher
|
||||||
|
.join(journalIds, datasourcePublisher.col("hostedby").equalTo(journalIds.col("id")), "leftsemi")
|
||||||
|
.distinct()
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + "/datasourcePublisher");
|
||||||
|
}
|
||||||
|
|
||||||
private static <R extends Result> void emitMinProduct(SparkSession spark, String inputPath, String workingDir) {
|
private static <R extends Result> void emitMinProduct(SparkSession spark, String inputPath, String workingDir) {
|
||||||
Utils.removeOutputDir(spark, workingDir + "minProduct");
|
Utils.removeOutputDir(spark, workingDir + "minProduct");
|
||||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||||
|
@ -365,8 +403,10 @@ public class EmitFromEntities implements Serializable {
|
||||||
.flatMap((FlatMapFunction<R, EmitPerManifestation>) p -> p.getInstance().stream().map(i -> {
|
.flatMap((FlatMapFunction<R, EmitPerManifestation>) p -> p.getInstance().stream().map(i -> {
|
||||||
EmitPerManifestation epb = new EmitPerManifestation();
|
EmitPerManifestation epb = new EmitPerManifestation();
|
||||||
epb.setResultId(p.getId());
|
epb.setResultId(p.getId());
|
||||||
epb.setInstance(i);
|
setInstanceFields(epb, i);
|
||||||
|
// epb.setInstance(i);
|
||||||
epb.setHostedBy(i.getHostedby().getKey());
|
epb.setHostedBy(i.getHostedby().getKey());
|
||||||
|
epb.setHostedbyvalue(i.getHostedby().getValue());
|
||||||
epb
|
epb
|
||||||
.setPublisher(
|
.setPublisher(
|
||||||
Optional
|
Optional
|
||||||
|
@ -414,4 +454,69 @@ public class EmitFromEntities implements Serializable {
|
||||||
.json(workingDir + "/datasourcePublisher");
|
.json(workingDir + "/datasourcePublisher");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void setInstanceFields(EmitPerManifestation epb, Instance i) {
|
||||||
|
epb.setProduct_local_type(i.getInstancetype().getClassname());
|
||||||
|
epb.setProduct_local_type_schema(i.getInstancetype().getSchemename());
|
||||||
|
epb.setPeer_reviewed(getPeerReviewd(i));
|
||||||
|
epb.setAccess_right(getAccessRigth(i));
|
||||||
|
epb
|
||||||
|
.setLicence(
|
||||||
|
Optional
|
||||||
|
.ofNullable(i.getLicense())
|
||||||
|
.map(value -> value.getValue())
|
||||||
|
.orElse(null));
|
||||||
|
if (Optional.ofNullable(i.getUrl()).isPresent() && i.getUrl().size() > 0)
|
||||||
|
epb.setUrl(i.getUrl().get(0));
|
||||||
|
else
|
||||||
|
epb.setUrl(null);
|
||||||
|
if (Optional.ofNullable(i.getPid()).isPresent() && i.getPid().size() > 0) {
|
||||||
|
epb.setPid(i.getPid().get(0).getValue());
|
||||||
|
}
|
||||||
|
if (Optional.ofNullable(i.getDateofacceptance()).isPresent())
|
||||||
|
epb.setPublishing_date(i.getDateofacceptance().getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getAccessRigth(Instance i) {
|
||||||
|
if (Optional.ofNullable(i.getAccessright()).isPresent())
|
||||||
|
switch (i.getAccessright().getClassid()) {
|
||||||
|
case "OPEN":
|
||||||
|
case "OPEN DATA":
|
||||||
|
case "OPEN SOURCE":
|
||||||
|
return AccessRight.OPEN.label;
|
||||||
|
|
||||||
|
case "CLOSED":
|
||||||
|
return AccessRight.CLOSED.label;
|
||||||
|
|
||||||
|
case "RESTRICTED":
|
||||||
|
return AccessRight.RESTRICTED.label;
|
||||||
|
|
||||||
|
case "EMBARGO":
|
||||||
|
case "12MONTHS":
|
||||||
|
case "6MONTHS":
|
||||||
|
return AccessRight.EMBARGO.label;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return AccessRight.UNAVAILABLE.label;
|
||||||
|
|
||||||
|
}
|
||||||
|
return AccessRight.UNAVAILABLE.label;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getPeerReviewd(Instance i) {
|
||||||
|
if (Optional.ofNullable(i.getRefereed()).isPresent())
|
||||||
|
|
||||||
|
switch (i.getRefereed().getClassid()) {
|
||||||
|
case "0000":
|
||||||
|
return PeerReview.UNAVAILABLE.label;
|
||||||
|
|
||||||
|
case "0001":
|
||||||
|
return PeerReview.PEER_REVIEWED.label;
|
||||||
|
|
||||||
|
case "0002":
|
||||||
|
return PeerReview.NON_PEER_REVIEWED.label;
|
||||||
|
}
|
||||||
|
return PeerReview.UNAVAILABLE.label;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,516 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.types.DataTypes;
|
||||||
|
import org.apache.spark.sql.types.StructType;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ProductsRelation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
|
import eu.dnetlib.dhp.skgif.model.*;
|
||||||
|
import scala.Tuple5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 16/03/24
|
||||||
|
*/
|
||||||
|
public class SelectRelation implements Serializable {
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(SelectRelation.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
String jsonConfiguration = IOUtils
|
||||||
|
.toString(
|
||||||
|
DumpResult.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/dump/skgif/select_relation_parameters.json"));
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
final String inputPath = parser.get("sourcePath");
|
||||||
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
|
final String workingDir = parser.get("workingDir");
|
||||||
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
|
final String relationPath = parser.get("relationPath");
|
||||||
|
log.info("relationPath: {}", relationPath);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
// Utils.removeOutputDir(spark, workingDir + "aggrelation");
|
||||||
|
|
||||||
|
// selectAffiliationRelations(spark, inputPath, workingDir, outputPath);
|
||||||
|
createOrganizationExtention(
|
||||||
|
spark, inputPath, RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label,
|
||||||
|
workingDir + "relations/result_relevant_organizations", relationPath);
|
||||||
|
selectFundingRelations(spark, inputPath, workingDir, relationPath);
|
||||||
|
selectProductRelation(spark, inputPath, workingDir, relationPath);
|
||||||
|
// selectDatasourceOrganizationRelation(spark, inputPath, workingDir, outputPath);
|
||||||
|
createOrganizationExtention(
|
||||||
|
spark, inputPath, RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label,
|
||||||
|
workingDir + "relations/datasource_providing_organization", relationPath);
|
||||||
|
createOrganizationExtention(
|
||||||
|
spark, inputPath, RelationType.PROJECT_HAS_PARTICIPANT_ORGANIZATION.label,
|
||||||
|
workingDir + "relations/project_partecipating_organization", relationPath);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void createOrganizationExtention(SparkSession spark, String inputPath, String relationSem,
|
||||||
|
String outputPath, String relationPath) {
|
||||||
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("source", DataTypes.StringType)
|
||||||
|
.add("target", DataTypes.StringType)
|
||||||
|
.add("relClass", DataTypes.StringType);
|
||||||
|
|
||||||
|
Dataset<Row> relation = spark
|
||||||
|
.read()
|
||||||
|
.schema(rp)
|
||||||
|
.json(relationPath)
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"relClass == '" + relationSem + "'")
|
||||||
|
.drop("datainfo", "relClass");
|
||||||
|
|
||||||
|
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
|
||||||
|
|
||||||
|
relation
|
||||||
|
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
|
||||||
|
.drop("target")
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
|
||||||
|
ExtendingOrganization ar = new ExtendingOrganization();
|
||||||
|
ar.setEntityId(k);
|
||||||
|
addRelevantOrganization(ar, v);
|
||||||
|
return ar;
|
||||||
|
}, Encoders.bean(ExtendingOrganization.class))
|
||||||
|
// .show(false);
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void selectDatasourceOrganizationRelation(SparkSession spark, String inputPath, String workingDir,
|
||||||
|
String outputPath) {
|
||||||
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("source", DataTypes.StringType)
|
||||||
|
.add("target", DataTypes.StringType)
|
||||||
|
.add("relClass", DataTypes.StringType);
|
||||||
|
|
||||||
|
Dataset<Row> relation = spark
|
||||||
|
.read()
|
||||||
|
.schema(rp)
|
||||||
|
.json(inputPath + "relation")
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"relClass == '" + RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION + "'")
|
||||||
|
.drop("datainfo", "relClass");
|
||||||
|
|
||||||
|
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
|
||||||
|
|
||||||
|
relation
|
||||||
|
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
|
||||||
|
.drop("target")
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
|
||||||
|
ExtendingOrganization ar = new ExtendingOrganization();
|
||||||
|
ar.setEntityId(k);
|
||||||
|
addRelevantOrganization(ar, v);
|
||||||
|
return ar;
|
||||||
|
}, Encoders.bean(ExtendingOrganization.class))
|
||||||
|
// .show(false);
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Append)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json("/tmp/miriam/prova/providingOrganization");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void selectProductRelation(SparkSession spark, String inputPath, String workingDir,
|
||||||
|
String relationPath) {
|
||||||
|
|
||||||
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("source", DataTypes.StringType)
|
||||||
|
.add("target", DataTypes.StringType)
|
||||||
|
.add("relClass", DataTypes.StringType);
|
||||||
|
|
||||||
|
Dataset<Row> relation = spark
|
||||||
|
.read()
|
||||||
|
.schema(rp)
|
||||||
|
.json(relationPath)
|
||||||
|
.filter("datainfo.deletedbyinference != true")
|
||||||
|
.filter(
|
||||||
|
"relClass == '" + RelationType.DOCUMENTS.label + "' or " +
|
||||||
|
"relClass == '" + RelationType.CITATION.label + "' or " +
|
||||||
|
"relClass == '" + RelationType.PART.label + "' or " +
|
||||||
|
"relClass == '" + RelationType.SUPPLEMENT.label + "' or " +
|
||||||
|
"relClass == '" + RelationType.VERSION.label + "'")
|
||||||
|
.drop("datainfo");
|
||||||
|
|
||||||
|
Dataset<Row> result = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Result.class).schema())
|
||||||
|
.json(inputPath + "publication")
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"datainfo.invisible != true")
|
||||||
|
.selectExpr("id", "title[0].value as title", "pid");
|
||||||
|
|
||||||
|
result.createOrReplaceTempView("res");
|
||||||
|
|
||||||
|
String query = "select id, pide.qualifier.classid as schema, pide.value as pid, title " +
|
||||||
|
"from res " +
|
||||||
|
"lateral view explode (pid) p as pide ";
|
||||||
|
|
||||||
|
Dataset<MinProduct> minProduct = spark
|
||||||
|
.sql(query)
|
||||||
|
// .show(false);
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, MinProduct>) (k, v) -> {
|
||||||
|
MinProduct mp = new MinProduct();
|
||||||
|
mp.setLocal_identifier(k);
|
||||||
|
Row r = v.next();
|
||||||
|
mp.setTitle(r.getAs("title"));
|
||||||
|
addProductPid(mp, r);
|
||||||
|
v.forEachRemaining(row -> addProductPid(mp, row));
|
||||||
|
return mp;
|
||||||
|
}, Encoders.bean(MinProduct.class));
|
||||||
|
|
||||||
|
relation
|
||||||
|
.join(minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
|
||||||
|
.selectExpr("source", "local_identifier", "title", "doi", "pmcid", "pmid", "arxivid", "relClass as sem")
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, ProductsRelation>) (k, v) -> {
|
||||||
|
ProductsRelation pr = new ProductsRelation();
|
||||||
|
pr.setResultId(k);
|
||||||
|
addResulRelations(pr, v);
|
||||||
|
return pr;
|
||||||
|
}, Encoders.bean(ProductsRelation.class))
|
||||||
|
// .show(false);
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + "relations/related_products");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addResulRelations(ProductsRelation pr, Iterator<Row> v) {
|
||||||
|
pr.setRelated_products(new ArrayList<>());
|
||||||
|
Map<String, ArrayList<MinProduct>> hashMap = new HashMap<>();
|
||||||
|
while (v.hasNext()) {
|
||||||
|
Row next = v.next();
|
||||||
|
String sem = next.getAs("sem");
|
||||||
|
if (!hashMap.containsKey(sem))
|
||||||
|
hashMap.put(sem, new ArrayList<>());
|
||||||
|
hashMap.get(sem).add(getMinProduct(next));
|
||||||
|
}
|
||||||
|
|
||||||
|
hashMap
|
||||||
|
.keySet()
|
||||||
|
.stream()
|
||||||
|
.forEach(key -> pr.getRelated_products().add(Relations.newInstance(key, hashMap.get(key))));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MinProduct getMinProduct(Row next) {
|
||||||
|
MinProduct mp = new MinProduct();
|
||||||
|
mp.setLocal_identifier(next.getAs("local_identifier"));
|
||||||
|
if (Optional.ofNullable(next.getAs("doi")).isPresent())
|
||||||
|
mp.setDoi(next.getAs("doi"));
|
||||||
|
if (Optional.ofNullable(next.getAs("pmid")).isPresent())
|
||||||
|
mp.setPmid(next.getAs("pmid"));
|
||||||
|
if (Optional.ofNullable(next.getAs("pmcid")).isPresent())
|
||||||
|
mp.setPmcid(next.getAs("pmcid"));
|
||||||
|
if (Optional.ofNullable(next.getAs("arxivid")).isPresent())
|
||||||
|
mp.setArxivid(next.getAs("arxivid"));
|
||||||
|
return mp;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addProductPid(MinProduct mp, Row next) {
|
||||||
|
String schema = next.getAs("schema");
|
||||||
|
if (Optional.ofNullable(schema).isPresent()) {
|
||||||
|
switch (schema) {
|
||||||
|
case "doi":
|
||||||
|
mp.setDoi(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "pmcid":
|
||||||
|
mp.setPmcid(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "pmid":
|
||||||
|
mp.setPmid(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "arXiv":
|
||||||
|
mp.setArxivid(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void selectFundingRelations(SparkSession spark, String inputPath, String workingDir,
|
||||||
|
String relationPath) {
|
||||||
|
final StructType tp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("id", DataTypes.StringType);
|
||||||
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("source", DataTypes.StringType)
|
||||||
|
.add("target", DataTypes.StringType)
|
||||||
|
.add("relClass", DataTypes.StringType);
|
||||||
|
|
||||||
|
Dataset<Row> relation = spark
|
||||||
|
.read()
|
||||||
|
.schema(rp)
|
||||||
|
.json(relationPath)
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"relClass == '" + RelationType.RESULT_OUTCOME_FUNDING.label + "'")
|
||||||
|
.drop("datainfo", "relClass");
|
||||||
|
Dataset<Row> projects = Utils
|
||||||
|
.readPath(spark, inputPath + "project", Project.class)
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
||||||
|
p.getFundingtree().size() > 0
|
||||||
|
&&
|
||||||
|
Utils
|
||||||
|
.getFunderName(p.getFundingtree().get(0).getValue())
|
||||||
|
.equalsIgnoreCase("European Commission"))
|
||||||
|
|
||||||
|
.map((MapFunction<Project, Tuple5<String, String, String, String, String>>) p -> {
|
||||||
|
String id = p.getId();
|
||||||
|
String acronym = "";
|
||||||
|
if (Optional.ofNullable(p.getAcronym()).isPresent())
|
||||||
|
acronym = p.getAcronym().getValue();
|
||||||
|
String title = "";
|
||||||
|
if (Optional.ofNullable(p.getTitle()).isPresent())
|
||||||
|
title = p.getTitle().getValue();
|
||||||
|
String funder = Utils.getFunderName(p.getFundingtree().get(0).getValue());
|
||||||
|
String code = p.getCode().getValue();
|
||||||
|
return new Tuple5<>(id, acronym, title, funder, code);
|
||||||
|
}, Encoders
|
||||||
|
.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING()))
|
||||||
|
.selectExpr("_1 as id", "_2 as acronym", "_3 as title", "_4 as funder", "_5 as code");
|
||||||
|
|
||||||
|
relation
|
||||||
|
.join(projects, relation.col("target").equalTo(projects.col("id")))
|
||||||
|
.drop("target")
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, GrantRelation>) (k, v) -> {
|
||||||
|
GrantRelation gr = new GrantRelation();
|
||||||
|
gr.setResultId(k);
|
||||||
|
addFunding(gr, v);
|
||||||
|
return gr;
|
||||||
|
}, Encoders.bean(GrantRelation.class))
|
||||||
|
// .show(false);
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(workingDir + "relations/funding");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addFunding(GrantRelation gr, Iterator<Row> v) {
|
||||||
|
gr.setFunding(new ArrayList<>());
|
||||||
|
while (v.hasNext()) {
|
||||||
|
gr.getFunding().add(getMinGrant(v.next()));
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MinGrant getMinGrant(Row next) {
|
||||||
|
MinGrant mn = new MinGrant();
|
||||||
|
mn.setCode(next.getAs("code"));
|
||||||
|
mn.setLocal_identifier(next.getAs("id"));
|
||||||
|
mn.setFunder(next.getAs("funder"));
|
||||||
|
if (Optional.ofNullable(next.getAs("acronym")).isPresent())
|
||||||
|
mn.setTitle(next.getAs("acronym"));
|
||||||
|
else
|
||||||
|
mn.setTitle(next.getAs("title"));
|
||||||
|
return mn;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void selectAffiliationRelations(SparkSession spark, String inputPath, String workingDir,
|
||||||
|
String outputPath) {
|
||||||
|
|
||||||
|
final StructType rp = new StructType()
|
||||||
|
.add(
|
||||||
|
"dataInfo", new StructType()
|
||||||
|
.add("deletedbyinference", DataTypes.BooleanType))
|
||||||
|
.add("source", DataTypes.StringType)
|
||||||
|
.add("target", DataTypes.StringType)
|
||||||
|
.add("relClass", DataTypes.StringType);
|
||||||
|
|
||||||
|
Dataset<Row> relation = spark
|
||||||
|
.read()
|
||||||
|
.schema(rp)
|
||||||
|
.json(inputPath + "relation")
|
||||||
|
.filter(
|
||||||
|
"datainfo.deletedbyinference != true and " +
|
||||||
|
"relClass == '" + RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label + "'")
|
||||||
|
.drop("datainfo", "relClass");
|
||||||
|
|
||||||
|
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
|
||||||
|
|
||||||
|
relation
|
||||||
|
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
|
||||||
|
.drop("target")
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
|
||||||
|
ExtendingOrganization ar = new ExtendingOrganization();
|
||||||
|
ar.setEntityId(k);
|
||||||
|
addRelevantOrganization(ar, v);
|
||||||
|
return ar;
|
||||||
|
}, Encoders.bean(ExtendingOrganization.class))
|
||||||
|
// .show(false);
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Append)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json("/tmp/miriam/prova/relevantOrganization");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset<MinOrganization> getMinOrganizationDataset(SparkSession spark, String inputPath) {
|
||||||
|
Dataset<Row> organization = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(Organization.class).schema())
|
||||||
|
.json(inputPath + "organization")
|
||||||
|
.filter("datainfo.deletedbyinference != true")
|
||||||
|
.selectExpr("id", "legalname.value as name", "pid");
|
||||||
|
|
||||||
|
organization.createOrReplaceTempView("org");
|
||||||
|
|
||||||
|
String query = "select id, pide.qualifier.classid as schema, pide.value as pid, name " +
|
||||||
|
"from org " +
|
||||||
|
"lateral view explode (pid) p as pide ";
|
||||||
|
|
||||||
|
Dataset<MinOrganization> minOrganizations = spark
|
||||||
|
.sql(query)
|
||||||
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
|
||||||
|
.mapGroups((MapGroupsFunction<String, Row, MinOrganization>) (k, v) -> {
|
||||||
|
MinOrganization mn = new MinOrganization();
|
||||||
|
mn.setLocal_identifier(k);
|
||||||
|
Row r = v.next();
|
||||||
|
mn.setName(r.getAs("name"));
|
||||||
|
addOrganizationPid(mn, r);
|
||||||
|
v.forEachRemaining(row -> addOrganizationPid(mn, row));
|
||||||
|
return mn;
|
||||||
|
}, Encoders.bean(MinOrganization.class));
|
||||||
|
return minOrganizations;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addOrganizationPid(MinOrganization mo, Row next) {
|
||||||
|
String schema = next.getAs("schema");
|
||||||
|
if (Optional.ofNullable(schema).isPresent()) {
|
||||||
|
switch (schema) {
|
||||||
|
case "ROR":
|
||||||
|
mo.setRor(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "ISNI":
|
||||||
|
mo.setIsni(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "FundRef":
|
||||||
|
mo.setFundRef(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "RingGold":
|
||||||
|
mo.setRinGold(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
case "Wikidata":
|
||||||
|
mo.setWikidata(next.getAs("pid"));
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addRelevantOrganization(ExtendingOrganization ar, Iterator<Row> v) {
|
||||||
|
ar.setRelevant_organization(new ArrayList<>());
|
||||||
|
while (v.hasNext())
|
||||||
|
ar.getRelevant_organization().add(getMinOrg(v.next()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MinOrganization getMinOrg(Row next) {
|
||||||
|
MinOrganization mo = new MinOrganization();
|
||||||
|
|
||||||
|
mo.setLocal_identifier(next.getAs("local_identifier"));
|
||||||
|
mo.setName(next.getAs("name"));
|
||||||
|
if (Optional.ofNullable(next.getAs("ror")).isPresent())
|
||||||
|
mo.setRor(next.getAs("ror"));
|
||||||
|
if (Optional.ofNullable(next.getAs("isni")).isPresent())
|
||||||
|
mo.setIsni(next.getAs("isni"));
|
||||||
|
if (Optional.ofNullable(next.getAs("fundRef")).isPresent())
|
||||||
|
mo.setFundRef(next.getAs("fundRef"));
|
||||||
|
if (Optional.ofNullable(next.getAs("rinGold")).isPresent())
|
||||||
|
mo.setRinGold(next.getAs("rinGold"));
|
||||||
|
if (Optional.ofNullable(next.getAs("wikidata")).isPresent())
|
||||||
|
mo.setWikidata(next.getAs("wikidata"));
|
||||||
|
// return mo;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if (Optional.ofNullable(pids).isPresent())
|
||||||
|
// pids.toStream().foreach(pid -> {
|
||||||
|
// if (Optional.ofNullable(pid.getQualifier()).isPresent() &&
|
||||||
|
// Optional.ofNullable(pid.getQualifier().getClassid()).isPresent())
|
||||||
|
// switch (pid.getQualifier().getClassid().toLowerCase()) {
|
||||||
|
// case "ror":
|
||||||
|
// mo.setRor(pid.getValue());
|
||||||
|
// break;
|
||||||
|
// case "isni":
|
||||||
|
// mo.setIsni(pid.getValue());
|
||||||
|
// break;
|
||||||
|
// case "fundref":
|
||||||
|
// mo.setFundRef(pid.getValue());
|
||||||
|
// break;
|
||||||
|
// case "ringgold":
|
||||||
|
// mo.setRinGold(pid.getValue());
|
||||||
|
// break;
|
||||||
|
// case "wikidata":
|
||||||
|
// mo.setWikidata(pid.getValue());
|
||||||
|
// break;
|
||||||
|
//
|
||||||
|
// }
|
||||||
|
// return null;
|
||||||
|
// });
|
||||||
|
return mo;
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,9 +3,7 @@ package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||||
import eu.dnetlib.dhp.skgif.model.Biblio;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author miriam.baglioni
|
* @author miriam.baglioni
|
||||||
|
@ -14,9 +12,94 @@ import eu.dnetlib.dhp.skgif.model.Biblio;
|
||||||
public class EmitPerManifestation implements Serializable {
|
public class EmitPerManifestation implements Serializable {
|
||||||
private String resultId;
|
private String resultId;
|
||||||
private String hostedBy;
|
private String hostedBy;
|
||||||
|
private String hostedbyvalue;
|
||||||
private Journal journal;
|
private Journal journal;
|
||||||
private Instance instance;
|
// private Instance instance;
|
||||||
private String publisher;
|
private String publisher;
|
||||||
|
private String product_local_type; // instance.getinstancetype.getclassname
|
||||||
|
private String product_local_type_schema; // getInstance().getInstancetype().getSchemename()
|
||||||
|
private String publishing_date;
|
||||||
|
private String peer_reviewed;
|
||||||
|
|
||||||
|
private String access_right;
|
||||||
|
|
||||||
|
private String licence;
|
||||||
|
|
||||||
|
private String url;
|
||||||
|
|
||||||
|
private String pid;
|
||||||
|
|
||||||
|
public String getProduct_local_type() {
|
||||||
|
return product_local_type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setProduct_local_type(String product_local_type) {
|
||||||
|
this.product_local_type = product_local_type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getProduct_local_type_schema() {
|
||||||
|
return product_local_type_schema;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setProduct_local_type_schema(String product_local_type_schema) {
|
||||||
|
this.product_local_type_schema = product_local_type_schema;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPublishing_date() {
|
||||||
|
return publishing_date;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPublishing_date(String publishing_date) {
|
||||||
|
this.publishing_date = publishing_date;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPeer_reviewed() {
|
||||||
|
return peer_reviewed;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPeer_reviewed(String peer_reviewed) {
|
||||||
|
this.peer_reviewed = peer_reviewed;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAccess_right() {
|
||||||
|
return access_right;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAccess_right(String access_right) {
|
||||||
|
this.access_right = access_right;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLicence() {
|
||||||
|
return licence;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLicence(String licence) {
|
||||||
|
this.licence = licence;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getUrl() {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUrl(String url) {
|
||||||
|
this.url = url;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPid() {
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPid(String pid) {
|
||||||
|
this.pid = pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getHostedbyvalue() {
|
||||||
|
return hostedbyvalue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHostedbyvalue(String hostedbyvalue) {
|
||||||
|
this.hostedbyvalue = hostedbyvalue;
|
||||||
|
}
|
||||||
|
|
||||||
public String getPublisher() {
|
public String getPublisher() {
|
||||||
return publisher;
|
return publisher;
|
||||||
|
@ -50,11 +133,4 @@ public class EmitPerManifestation implements Serializable {
|
||||||
this.journal = journal;
|
this.journal = journal;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Instance getInstance() {
|
|
||||||
return instance;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setInstance(Instance instance) {
|
|
||||||
this.instance = instance;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.skgif.model.MinOrganization;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 16/03/24
|
||||||
|
*/
|
||||||
|
public class ExtendingOrganization implements Serializable {
|
||||||
|
private String entityId;
|
||||||
|
private java.util.List<MinOrganization> relevant_organization;
|
||||||
|
|
||||||
|
public String getEntityId() {
|
||||||
|
return entityId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEntityId(String entityId) {
|
||||||
|
this.entityId = entityId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<MinOrganization> getRelevant_organization() {
|
||||||
|
return relevant_organization;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRelevant_organization(List<MinOrganization> relevant_organization) {
|
||||||
|
this.relevant_organization = relevant_organization;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.skgif.model.MinGrant;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 16/03/24
|
||||||
|
*/
|
||||||
|
public class GrantRelation implements Serializable {
|
||||||
|
private String resultId;
|
||||||
|
private java.util.List<MinGrant> funding;
|
||||||
|
|
||||||
|
public String getResultId() {
|
||||||
|
return resultId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setResultId(String resultId) {
|
||||||
|
this.resultId = resultId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<MinGrant> getFunding() {
|
||||||
|
return funding;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFunding(List<MinGrant> funding) {
|
||||||
|
this.funding = funding;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.dump.skgif.beans;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.skgif.model.Relations;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 16/03/24
|
||||||
|
*/
|
||||||
|
public class ProductsRelation implements Serializable {
|
||||||
|
private String resultId;
|
||||||
|
private java.util.List<Relations> related_products;
|
||||||
|
|
||||||
|
public String getResultId() {
|
||||||
|
return resultId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setResultId(String resultId) {
|
||||||
|
this.resultId = resultId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Relations> getRelated_products() {
|
||||||
|
return related_products;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRelated_products(List<Relations> related_products) {
|
||||||
|
this.related_products = related_products;
|
||||||
|
}
|
||||||
|
}
|
|
@ -18,5 +18,11 @@
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
"paramDescription": "the relationPath",
|
"paramDescription": "the relationPath",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "wd",
|
||||||
|
"paramLongName": "workingDir",
|
||||||
|
"paramDescription": "the relationPath",
|
||||||
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -62,110 +62,12 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
<start to="dump_result"/>
|
<start to="dump_organization"/>
|
||||||
|
|
||||||
|
|
||||||
<decision name="filter_set">
|
|
||||||
<switch>
|
|
||||||
<case to="filter">${wf:conf('filter') eq "true"}</case>
|
|
||||||
<default to="copy_graph"/>
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
<fork name="copy_graph">
|
|
||||||
<path start="copy_relation"/>
|
|
||||||
<path start="copy_organization"/>
|
|
||||||
<path start="copy_projects"/>
|
|
||||||
<path start="copy_datasources"/>
|
|
||||||
<path start="copy_publication"/>
|
|
||||||
<path start="copy_dataset"/>
|
|
||||||
<path start="copy_software"/>
|
|
||||||
<path start="copy_otherresearchproduct"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="copy_relation">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/relation</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_organization">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/organization</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_projects">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/project</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_datasources">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/datasource</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_publication">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/publication</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_dataset">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/dataset</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_software">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/software</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_otherresearchproduct">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/otherresearchproduct</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
<join name="copy_wait" to="emit_from_result"/>
|
|
||||||
<action name="filter">
|
<action name="filter">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -213,95 +115,46 @@
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/graph/</arg>-->
|
<arg>--workingDir</arg><arg>${workingDir}/graph/</arg>
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/graph/</arg>
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/graph/</arg>-->
|
||||||
<arg>--filterPath</arg><arg>${filterPath}</arg>
|
<arg>--filterPath</arg><arg>${filterPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<!-- <ok to="emit_from_result"/>-->
|
<ok to="select_relation"/>
|
||||||
<ok to="copy_graph_publication"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_publication">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/publication</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/publication</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_dataset"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_dataset">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/dataset</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/dataset</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_otherresearchproduct"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_otherresearchproduct">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/otherresearchproduct</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/otherresearchproduct</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_software"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_software">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/software</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/software</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_datasource"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_datasource">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/datasource</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/datasource</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_project"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_project">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/project</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/project</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_organization"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_organization">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/organization</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/organization</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_graph_relation"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="copy_graph_relation">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<arg>${nameNode}/user/miriam.baglioni/oa/graph/dump/temp/graph/relation</arg>
|
|
||||||
<arg>${nameNode}/${workingDir}/graph/relation</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="emit_from_result"/>
|
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<action name="select_relation">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>select relations </name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.SelectRelation</class>
|
||||||
|
<jar>dump-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=4
|
||||||
|
--executor-memory=4G
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=5G
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
|
</spark-opts>
|
||||||
|
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
|
||||||
|
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
||||||
|
<arg>--workingDir</arg><arg>${workingDir}/</arg>
|
||||||
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>-->
|
||||||
|
<!-- <arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>-->
|
||||||
|
</spark>
|
||||||
|
|
||||||
|
<ok to="emit_from_result"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
<action name="emit_from_result">
|
<action name="emit_from_result">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -319,43 +172,44 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
<arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
<arg>--workingDir</arg><arg>${workingDir}/</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>-->
|
||||||
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
<!-- <arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>-->
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="dump_result"/>
|
<ok to="dump_research_product"/>
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="dump_result">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Dump table results </name>
|
|
||||||
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpResult</class>
|
|
||||||
<jar>dump-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=4
|
|
||||||
--executor-memory=8G
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.executor.memoryOverhead=5G
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
|
||||||
</spark-opts>
|
|
||||||
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
|
||||||
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="dump_datasource"/>
|
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<!-- <action name="dump_result">-->
|
||||||
|
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
|
||||||
|
<!-- <master>yarn</master>-->
|
||||||
|
<!-- <mode>cluster</mode>-->
|
||||||
|
<!-- <name>Dump table results </name>-->
|
||||||
|
<!-- <class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpResult</class>-->
|
||||||
|
<!-- <jar>dump-${projectVersion}.jar</jar>-->
|
||||||
|
<!-- <spark-opts>-->
|
||||||
|
<!-- --executor-cores=4-->
|
||||||
|
<!-- --executor-memory=4G-->
|
||||||
|
<!-- --driver-memory=${sparkDriverMemory}-->
|
||||||
|
<!-- --conf spark.executor.memoryOverhead=5G-->
|
||||||
|
<!-- --conf spark.extraListeners=${spark2ExtraListeners}-->
|
||||||
|
<!-- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
|
||||||
|
<!-- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
|
||||||
|
<!-- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
|
||||||
|
<!-- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}-->
|
||||||
|
<!-- --conf spark.sql.shuffle.partitions=15000-->
|
||||||
|
<!-- </spark-opts>-->
|
||||||
|
<!--<!– <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>–>-->
|
||||||
|
<!-- <arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>-->
|
||||||
|
<!-- <arg>--outputPath</arg><arg>${outputPath}</arg>-->
|
||||||
|
<!--<!– <arg>--workingDir</arg><arg>${workingDir}/</arg>–>-->
|
||||||
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>-->
|
||||||
|
<!-- </spark>-->
|
||||||
|
<!-- <ok to="dump_datasource"/>-->
|
||||||
|
<!-- <error to="Kill"/>-->
|
||||||
|
<!-- </action>-->
|
||||||
<action name="dump_datasource">
|
<action name="dump_datasource">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -373,11 +227,11 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
<arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>
|
||||||
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
<!-- <arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>-->
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
<arg>--workingDir</arg><arg>${workingDir}/</arg>
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>-->
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="dump_venue"/>
|
<ok to="dump_venue"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -399,11 +253,11 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
<arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>
|
||||||
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
<!-- <arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>-->
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
<arg>--workingDir</arg><arg>${workingDir}/</arg>
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>-->
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="dump_organization"/>
|
<ok to="dump_organization"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -426,10 +280,10 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
||||||
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="dump_grant"/>
|
<ok to="dump_grant"/>
|
||||||
|
@ -453,15 +307,44 @@
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
<!-- <arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>-->
|
||||||
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
<arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
<!-- <arg>--workingDir</arg><arg>${workingDir}/</arg>-->
|
||||||
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
<arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<action name="dump_research_product">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Dump table results </name>
|
||||||
|
<class>eu.dnetlib.dhp.oa.graph.dump.skgif.DumpResearchProduct</class>
|
||||||
|
<jar>dump-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=4
|
||||||
|
--executor-memory=4G
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=5G
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${workingDir}/graph/</arg>
|
||||||
|
<!-- <arg>--sourcePath</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/graph/</arg>-->
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
|
<arg>--workingDir</arg><arg>${workingDir}/</arg>
|
||||||
|
<!-- <arg>--workingDir</arg><arg>/user/miriam.baglioni/oa/graph/dump/temp/working_dir/</arg>-->
|
||||||
|
</spark>
|
||||||
|
<ok to="dump_datasource"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -0,0 +1,27 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName":"s",
|
||||||
|
"paramLongName":"sourcePath",
|
||||||
|
"paramDescription": "the path of the sequencial file to read",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "ssm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "wd",
|
||||||
|
"paramLongName": "workingDir",
|
||||||
|
"paramDescription": "the relationPath",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rp",
|
||||||
|
"paramLongName": "relationPath",
|
||||||
|
"paramDescription": "the relationPath",
|
||||||
|
"paramRequired": false
|
||||||
|
}
|
||||||
|
]
|
|
@ -3,8 +3,10 @@ package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.io.StringReader;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -13,6 +15,9 @@ import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.dom4j.Document;
|
||||||
|
import org.dom4j.DocumentException;
|
||||||
|
import org.dom4j.io.SAXReader;
|
||||||
import org.junit.jupiter.api.AfterAll;
|
import org.junit.jupiter.api.AfterAll;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
@ -152,5 +157,51 @@ public class DumpGrantTest implements Serializable {
|
||||||
@Test
|
@Test
|
||||||
public void testDumpFunder() throws Exception {
|
public void testDumpFunder() throws Exception {
|
||||||
|
|
||||||
|
String fundingtree = "<fundingtree>" +
|
||||||
|
" <funder>" +
|
||||||
|
" <id>ec__________::EC</id>\n" +
|
||||||
|
" <shortname>EC</shortname>\n" +
|
||||||
|
" <name>European Commission</name>\n" +
|
||||||
|
" <jurisdiction>EU</jurisdiction>\n" +
|
||||||
|
" </funder>\n" +
|
||||||
|
" <funding_level_2>\n" +
|
||||||
|
" <id>ec__________::EC::FP7::SP2::ERC</id>\n" +
|
||||||
|
" <description>ERC</description>\n" +
|
||||||
|
" <name>ERC</name>\n" +
|
||||||
|
" <class>ec:program</class>\n" +
|
||||||
|
" <parent>\n" +
|
||||||
|
" <funding_level_1>\n" +
|
||||||
|
" <id>ec__________::EC::FP7::SP2</id>\n" +
|
||||||
|
" <description>SP2-Ideas</description>\n" +
|
||||||
|
" <name>SP2</name>\n" +
|
||||||
|
" <class>ec:specificprogram</class>\n" +
|
||||||
|
" <parent>\n" +
|
||||||
|
" <funding_level_0>\n" +
|
||||||
|
" <id>ec__________::EC::FP7</id>\n" +
|
||||||
|
" <description>SEVENTH FRAMEWORK PROGRAMME</description>\n" +
|
||||||
|
" <name>FP7</name>\n" +
|
||||||
|
" <parent />\n" +
|
||||||
|
" <class>ec:frameworkprogram</class>\n" +
|
||||||
|
" </funding_level_0>\n" +
|
||||||
|
" </parent>\n" +
|
||||||
|
" </funding_level_1>\n" +
|
||||||
|
" </parent>\n" +
|
||||||
|
" </funding_level_2>\n" +
|
||||||
|
" </fundingtree>".replace("\n", " ");
|
||||||
|
System.out.println(getFundingStream(fundingtree));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getFundingStream(String fundingtree) throws DocumentException {
|
||||||
|
final Document doc;
|
||||||
|
|
||||||
|
doc = new SAXReader().read(new StringReader(fundingtree));
|
||||||
|
if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() &&
|
||||||
|
doc.selectNodes("//funding_level_0").size() > 0 &&
|
||||||
|
Optional.ofNullable(doc.selectNodes("//funding_level_0/name")).isPresent() &&
|
||||||
|
doc.selectNodes("//funding_level_0/name").size() > 0)
|
||||||
|
return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0/name").get(0))).getText();
|
||||||
|
return new String();
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -867,4 +867,32 @@ public class DumpResultTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSelectRelation() throws Exception {
|
||||||
|
final String sourcePath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/graphForAPIExample/")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
final String workingDir = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/oa/graph/dump/skgif/workingDirApiExample/")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
SelectRelation
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath", sourcePath,
|
||||||
|
"-workingDir", workingDir,
|
||||||
|
"-outputPath", workingDir
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
//
|
||||||
|
// JavaRDD<ResearchProduct> researchProduct = sc
|
||||||
|
// .textFile(workingDir.toString() + "ResearchProduct")
|
||||||
|
// .map(item -> OBJECT_MAPPER.readValue(item, ResearchProduct.class));
|
||||||
|
//
|
||||||
|
// researchProduct.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue