package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ProductsRelation;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple5;

/**
* @author miriam.baglioni
* @Date 16/03/24
*/
public class SelectRelation implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SelectRelation.class);
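
/**
 * Entry point. Parses the job arguments (sourcePath, workingDir, relationPath and the optional
 * isSparkSessionManaged flag) and, within the Spark session, materialises the relation views
 * consumed by the SKG-IF dump: the organization extensions for results, datasources and
 * projects, the result-to-grant relations and the result-to-result relations.
 */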
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SelectRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/skgif/select_relation_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relationPath = parser.get("relationPath");
log.info("relationPath: {}", relationPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
createOrganizationExtension(
spark, inputPath, RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label,
workingDir + "relations/result_relevant_organizations", relationPath);
selectFundingRelations(spark, inputPath, workingDir, relationPath);
selectProductRelation(spark, inputPath, workingDir, relationPath);
createOrganizationExtension(
spark, inputPath, RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label,
workingDir + "relations/datasource_providing_organization", relationPath);
createOrganizationExtension(
spark, inputPath, RelationType.PROJECT_HAS_PARTICIPANT_ORGANIZATION.label,
workingDir + "relations/project_partecipating_organization", relationPath);
});
}
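
/**
 * Generic builder for the organization extensions. Keeps the relations of the given semantics
 * (relationSem) that are not deleted by inference, resolves their targets against the
 * organizations in the graph, and writes one ExtendingOrganization per source entity, carrying
 * the list of matched organizations, as gzipped JSON under outputPath.
 */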
private static void createOrganizationExtension(SparkSession spark, String inputPath, String relationSem,
String outputPath, String relationPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(relationPath)
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + relationSem + "'")
.drop("datainfo", "relClass");
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
relation
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
ExtendingOrganization ar = new ExtendingOrganization();
ar.setEntityId(k);
addRelevantOrganization(ar, v);
return ar;
}, Encoders.bean(ExtendingOrganization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
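
/**
 * Earlier, datasource-specific variant of createOrganizationExtension. It is currently not
 * invoked anywhere in this class and is presumably kept only for reference.
 */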
private static void selectDatasourceOrganizationRelation(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION + "'")
.drop("datainfo", "relClass");
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
relation
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
ExtendingOrganization ar = new ExtendingOrganization();
ar.setEntityId(k);
addRelevantOrganization(ar, v);
return ar;
}, Encoders.bean(ExtendingOrganization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
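
/**
 * Builds the result-to-result relations (Documents, Citation, Part, Supplement, Version).
 * Targets are resolved against the publications of the graph only: for each of them a
 * MinProduct is built with the first title and the known PIDs (doi, pmcid, pmid, arXiv), and
 * the relations are then grouped by source and by semantics into ProductsRelation records.
 */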
private static void selectProductRelation(SparkSession spark, String inputPath, String workingDir,
String relationPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(relationPath)
.filter("datainfo.deletedbyinference != true")
.filter(
"relClass == '" + RelationType.DOCUMENTS.label + "' or " +
"relClass == '" + RelationType.CITATION.label + "' or " +
"relClass == '" + RelationType.PART.label + "' or " +
"relClass == '" + RelationType.SUPPLEMENT.label + "' or " +
"relClass == '" + RelationType.VERSION.label + "'")
.drop("datainfo");
Dataset<Row> result = spark
.read()
.schema(Encoders.bean(Result.class).schema())
.json(inputPath + "publication")
.filter(
"datainfo.deletedbyinference != true and " +
"datainfo.invisible != true")
.selectExpr("id", "title[0].value as title", "pid");
result.createOrReplaceTempView("res");
String query = "select id, pide.qualifier.classid as schema, pide.value as pid, title " +
"from res " +
"lateral view explode (pid) p as pide ";
Dataset<MinProduct> minProduct = spark
.sql(query)
.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, MinProduct>) (k, v) -> {
MinProduct mp = new MinProduct();
mp.setLocal_identifier(k);
Row r = v.next();
mp.setTitle(r.getAs("title"));
addProductPid(mp, r);
v.forEachRemaining(row -> addProductPid(mp, row));
return mp;
}, Encoders.bean(MinProduct.class));
relation
.join(minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
.selectExpr("source", "local_identifier", "title", "doi", "pmcid", "pmid", "arxivid", "relClass as sem")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ProductsRelation>) (k, v) -> {
ProductsRelation pr = new ProductsRelation();
pr.setResultId(k);
addResultRelations(pr, v);
return pr;
}, Encoders.bean(ProductsRelation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "relations/related_products");
}
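
/**
 * Groups the joined rows of a single source result by relation semantics and adds one Relations
 * entry per semantics to the ProductsRelation.
 */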
private static void addResultRelations(ProductsRelation pr, Iterator<Row> v) {
pr.setRelated_products(new ArrayList<>());
Map<String, ArrayList<MinProduct>> groupedBySemantics = new HashMap<>();
while (v.hasNext()) {
Row next = v.next();
String sem = next.getAs("sem");
groupedBySemantics.computeIfAbsent(sem, s -> new ArrayList<>()).add(getMinProduct(next));
}
groupedBySemantics
.forEach((sem, products) -> pr.getRelated_products().add(Relations.newInstance(sem, products)));
}
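
/**
 * Copies the identifier, and the PID columns that are actually present, from a joined row into
 * a MinProduct.
 */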
private static MinProduct getMinProduct(Row next) {
MinProduct mp = new MinProduct();
mp.setLocal_identifier(next.getAs("local_identifier"));
if (Optional.ofNullable(next.getAs("doi")).isPresent())
mp.setDoi(next.getAs("doi"));
if (Optional.ofNullable(next.getAs("pmid")).isPresent())
mp.setPmid(next.getAs("pmid"));
if (Optional.ofNullable(next.getAs("pmcid")).isPresent())
mp.setPmcid(next.getAs("pmcid"));
if (Optional.ofNullable(next.getAs("arxivid")).isPresent())
mp.setArxivid(next.getAs("arxivid"));
return mp;
}
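
/**
 * Maps a PID row onto the corresponding MinProduct field, keyed on the PID classid; PIDs with
 * any other classid are ignored.
 */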
private static void addProductPid(MinProduct mp, Row next) {
String schema = next.getAs("schema");
if (Optional.ofNullable(schema).isPresent()) {
switch (schema) {
case "doi":
mp.setDoi(next.getAs("pid"));
break;
case "pmcid":
mp.setPmcid(next.getAs("pid"));
break;
case "pmid":
mp.setPmid(next.getAs("pid"));
break;
case "arXiv":
mp.setArxivid(next.getAs("pid"));
break;
}
}
}
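
/**
 * Builds the result-to-grant relations. Only projects funded by the European Commission are
 * considered: for each of them a tuple (id, acronym, title, funder, code) is built, joined with
 * the RESULT_OUTCOME_FUNDING relations and grouped by source result into GrantRelation records.
 */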
private static void selectFundingRelations(SparkSession spark, String inputPath, String workingDir,
String relationPath) {
final StructType tp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("id", DataTypes.StringType);
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(relationPath)
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + RelationType.RESULT_OUTCOME_FUNDING.label + "'")
.drop("datainfo", "relClass");
Dataset<Row> projects = Utils
.readPath(spark, inputPath + "project", Project.class)
.filter(
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
p.getFundingtree().size() > 0
&&
Utils
.getFunderName(p.getFundingtree().get(0).getValue())
.equalsIgnoreCase("European Commission"))
.map((MapFunction<Project, Tuple5<String, String, String, String, String>>) p -> {
String id = p.getId();
String acronym = "";
if (Optional.ofNullable(p.getAcronym()).isPresent())
acronym = p.getAcronym().getValue();
String title = "";
if (Optional.ofNullable(p.getTitle()).isPresent())
title = p.getTitle().getValue();
String funder = Utils.getFunderName(p.getFundingtree().get(0).getValue());
String code = p.getCode().getValue();
return new Tuple5<>(id, acronym, title, funder, code);
}, Encoders
.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING()))
.selectExpr("_1 as id", "_2 as acronym", "_3 as title", "_4 as funder", "_5 as code");
relation
.join(projects, relation.col("target").equalTo(projects.col("id")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, GrantRelation>) (k, v) -> {
GrantRelation gr = new GrantRelation();
gr.setResultId(k);
addFunding(gr, v);
return gr;
}, Encoders.bean(GrantRelation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "relations/funding");
}
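
/**
 * Collects all the grants of a single result into the GrantRelation.
 */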
private static void addFunding(GrantRelation gr, Iterator<Row> v) {
gr.setFunding(new ArrayList<>());
while (v.hasNext()) {
gr.getFunding().add(getMinGrant(v.next()));
}
}
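
/**
 * Builds a MinGrant from a joined project row. The grant title is the project acronym when one
 * is available, the full project title otherwise (note that upstream an absent acronym is
 * encoded as the empty string, not as null).
 */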
private static MinGrant getMinGrant(Row next) {
MinGrant mn = new MinGrant();
mn.setCode(next.getAs("code"));
mn.setLocal_identifier(next.getAs("id"));
mn.setFunder(next.getAs("funder"));
String acronym = next.getAs("acronym");
if (acronym != null && !acronym.isEmpty())
mn.setTitle(acronym);
else
mn.setTitle(next.getAs("title"));
return mn;
}
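
/**
 * Earlier, affiliation-specific variant of createOrganizationExtension. It is currently not
 * invoked anywhere in this class and is presumably kept only for reference.
 */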
private static void selectAffiliationRelations(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
final StructType rp = new StructType()
.add(
"dataInfo", new StructType()
.add("deletedbyinference", DataTypes.BooleanType))
.add("source", DataTypes.StringType)
.add("target", DataTypes.StringType)
.add("relClass", DataTypes.StringType);
Dataset<Row> relation = spark
.read()
.schema(rp)
.json(inputPath + "relation")
.filter(
"datainfo.deletedbyinference != true and " +
"relClass == '" + RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label + "'")
.drop("datainfo", "relClass");
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
relation
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
.drop("target")
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
ExtendingOrganization ar = new ExtendingOrganization();
ar.setEntityId(k);
addRelevantOrganization(ar, v);
return ar;
}, Encoders.bean(ExtendingOrganization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
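
/**
 * Reads the organizations of the graph, explodes their PID list and builds one MinOrganization
 * per organization, carrying its name and the known identifiers (ROR, ISNI, FundRef, RingGold,
 * Wikidata).
 */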
private static Dataset<MinOrganization> getMinOrganizationDataset(SparkSession spark, String inputPath) {
Dataset<Row> organization = spark
.read()
.schema(Encoders.bean(Organization.class).schema())
.json(inputPath + "organization")
.filter("datainfo.deletedbyinference != true")
.selectExpr("id", "legalname.value as name", "pid");
organization.createOrReplaceTempView("org");
String query = "select id, pide.qualifier.classid as schema, pide.value as pid, name " +
"from org " +
"lateral view explode (pid) p as pide ";
return spark
.sql(query)
.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Row, MinOrganization>) (k, v) -> {
MinOrganization mn = new MinOrganization();
mn.setLocal_identifier(k);
Row r = v.next();
mn.setName(r.getAs("name"));
addOrganizationPid(mn, r);
v.forEachRemaining(row -> addOrganizationPid(mn, row));
return mn;
}, Encoders.bean(MinOrganization.class));
}
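
/**
 * Maps a PID row onto the corresponding MinOrganization field, keyed on the PID classid; PIDs
 * with any other classid are ignored.
 */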
private static void addOrganizationPid(MinOrganization mo, Row next) {
String schema = next.getAs("schema");
if (Optional.ofNullable(schema).isPresent()) {
switch (schema) {
case "ROR":
mo.setRor(next.getAs("pid"));
break;
case "ISNI":
mo.setIsni(next.getAs("pid"));
break;
case "FundRef":
mo.setFundRef(next.getAs("pid"));
break;
case "RingGold":
mo.setRinGold(next.getAs("pid"));
break;
case "Wikidata":
mo.setWikidata(next.getAs("pid"));
break;
}
}
}
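
/**
 * Collects all the organizations joined to a single entity into the ExtendingOrganization.
 */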
private static void addRelevantOrganization(ExtendingOrganization ar, Iterator<Row> v) {
ar.setRelevant_organization(new ArrayList<>());
while (v.hasNext())
ar.getRelevant_organization().add(getMinOrg(v.next()));
}
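
/**
 * Copies the identifier, the name and the PID columns that are actually present from a joined
 * row into a MinOrganization.
 */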
private static MinOrganization getMinOrg(Row next) {
MinOrganization mo = new MinOrganization();
mo.setLocal_identifier(next.getAs("local_identifier"));
mo.setName(next.getAs("name"));
if (Optional.ofNullable(next.getAs("ror")).isPresent())
mo.setRor(next.getAs("ror"));
if (Optional.ofNullable(next.getAs("isni")).isPresent())
mo.setIsni(next.getAs("isni"));
if (Optional.ofNullable(next.getAs("fundRef")).isPresent())
mo.setFundRef(next.getAs("fundRef"));
if (Optional.ofNullable(next.getAs("rinGold")).isPresent())
mo.setRinGold(next.getAs("rinGold"));
if (Optional.ofNullable(next.getAs("wikidata")).isPresent())
mo.setWikidata(next.getAs("wikidata"));
return mo;
}
}