517 lines
18 KiB
Java
517 lines
18 KiB
Java
|
|
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
|
|
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
|
|
import java.io.Serializable;
|
|
import java.util.*;
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.spark.SparkConf;
|
|
import org.apache.spark.api.java.function.FilterFunction;
|
|
import org.apache.spark.api.java.function.MapFunction;
|
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
import org.apache.spark.sql.*;
|
|
import org.apache.spark.sql.Dataset;
|
|
import org.apache.spark.sql.types.DataTypes;
|
|
import org.apache.spark.sql.types.StructType;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
|
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
|
|
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ProductsRelation;
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
|
import eu.dnetlib.dhp.skgif.model.*;
|
|
import scala.Tuple5;
|
|
|
|
/**
 * Selects and pre-aggregates the relations needed for the SKG-IF dump:
 * result/datasource/project organization links, funding relations and
 * semantic relations between products.
 *
 * @author miriam.baglioni
 * @since 16/03/24
 */
|
|
public class SelectRelation implements Serializable {
|
|
private static final Logger log = LoggerFactory.getLogger(SelectRelation.class);
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
String jsonConfiguration = IOUtils
|
|
.toString(
|
|
DumpResult.class
|
|
.getResourceAsStream(
|
|
"/eu/dnetlib/dhp/oa/graph/dump/skgif/select_relation_parameters.json"));
|
|
|
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
parser.parseArgument(args);
|
|
|
|
Boolean isSparkSessionManaged = Optional
|
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
.map(Boolean::valueOf)
|
|
.orElse(Boolean.TRUE);
|
|
|
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
|
|
final String inputPath = parser.get("sourcePath");
|
|
log.info("inputPath: {}", inputPath);
|
|
|
|
final String workingDir = parser.get("workingDir");
|
|
log.info("workingDir: {}", workingDir);
|
|
|
|
final String relationPath = parser.get("relationPath");
|
|
log.info("relationPath: {}", relationPath);
|
|
|
|
SparkConf conf = new SparkConf();
|
|
|
|
runWithSparkSession(
|
|
conf,
|
|
isSparkSessionManaged,
|
|
spark -> {
|
|
// Utils.removeOutputDir(spark, workingDir + "aggrelation");
|
|
|
|
// selectAffiliationRelations(spark, inputPath, workingDir, outputPath);
|
|
createOrganizationExtention(
|
|
spark, inputPath, RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label,
|
|
workingDir + "relations/result_relevant_organizations", relationPath);
|
|
selectFundingRelations(spark, inputPath, workingDir, relationPath);
|
|
selectProductRelation(spark, inputPath, workingDir, relationPath);
|
|
// selectDatasourceOrganizationRelation(spark, inputPath, workingDir, outputPath);
|
|
createOrganizationExtention(
|
|
spark, inputPath, RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label,
|
|
workingDir + "relations/datasource_providing_organization", relationPath);
|
|
createOrganizationExtention(
|
|
spark, inputPath, RelationType.PROJECT_HAS_PARTICIPANT_ORGANIZATION.label,
|
|
workingDir + "relations/project_partecipating_organization", relationPath);
|
|
});
|
|
}
|
|
|
|
/**
 * Generic builder of the "extending organization" dumps: selects from
 * relationPath the relations having the given semantics, joins their targets
 * with the MinOrganization view of the organization entities, and writes one
 * ExtendingOrganization record per relation source listing all the joined
 * organizations.
 *
 * @param spark        the Spark session
 * @param inputPath    root path of the graph (must contain the organization entities)
 * @param relationSem  relClass value to select (e.g. affiliation, providing, participation label)
 * @param outputPath   destination path (overwritten, gzip-compressed json)
 * @param relationPath path of the relation entities
 */
private static void createOrganizationExtention(SparkSession spark, String inputPath, String relationSem,
	String outputPath, String relationPath) {
	// read only the fields needed for the selection
	final StructType rp = new StructType()
		.add(
			"dataInfo", new StructType()
				.add("deletedbyinference", DataTypes.BooleanType))
		.add("source", DataTypes.StringType)
		.add("target", DataTypes.StringType)
		.add("relClass", DataTypes.StringType);

	// NOTE(review): with SQL three-valued logic "!= true" also drops rows whose
	// deletedbyinference is null — confirm that is the intended behaviour
	Dataset<Row> relation = spark
		.read()
		.schema(rp)
		.json(relationPath)
		.filter(
			"datainfo.deletedbyinference != true and " +
				"relClass == '" + relationSem + "'")
		.drop("datainfo", "relClass");

	final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);

	// inner join: relations whose target is not a known organization are discarded
	relation
		.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
		.drop("target")
		.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
		.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
			ExtendingOrganization ar = new ExtendingOrganization();
			ar.setEntityId(k);
			addRelevantOrganization(ar, v);
			return ar;
		}, Encoders.bean(ExtendingOrganization.class))
		.write()
		.mode(SaveMode.Overwrite)
		.option("compression", "gzip")
		.json(outputPath);

}
|
|
|
|
private static void selectDatasourceOrganizationRelation(SparkSession spark, String inputPath, String workingDir,
|
|
String outputPath) {
|
|
final StructType rp = new StructType()
|
|
.add(
|
|
"dataInfo", new StructType()
|
|
.add("deletedbyinference", DataTypes.BooleanType))
|
|
.add("source", DataTypes.StringType)
|
|
.add("target", DataTypes.StringType)
|
|
.add("relClass", DataTypes.StringType);
|
|
|
|
Dataset<Row> relation = spark
|
|
.read()
|
|
.schema(rp)
|
|
.json(inputPath + "relation")
|
|
.filter(
|
|
"datainfo.deletedbyinference != true and " +
|
|
"relClass == '" + RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION + "'")
|
|
.drop("datainfo", "relClass");
|
|
|
|
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
|
|
|
|
relation
|
|
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
|
|
.drop("target")
|
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
|
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
|
|
ExtendingOrganization ar = new ExtendingOrganization();
|
|
ar.setEntityId(k);
|
|
addRelevantOrganization(ar, v);
|
|
return ar;
|
|
}, Encoders.bean(ExtendingOrganization.class))
|
|
// .show(false);
|
|
.write()
|
|
.mode(SaveMode.Append)
|
|
.option("compression", "gzip")
|
|
.json("/tmp/miriam/prova/providingOrganization");
|
|
|
|
}
|
|
|
|
/**
 * Selects the semantic relations between products (documents, cites, is part
 * of, supplements, versions), joins each relation target with a MinProduct
 * view built from the publications, and writes one ProductsRelation record
 * per relation source grouping the related products by semantics.
 *
 * @param spark        the Spark session
 * @param inputPath    root path of the graph (must contain the publication entities)
 * @param workingDir   output is written under workingDir + "relations/related_products"
 * @param relationPath path of the relation entities
 */
private static void selectProductRelation(SparkSession spark, String inputPath, String workingDir,
	String relationPath) {

	// read only the fields needed for the selection
	final StructType rp = new StructType()
		.add(
			"dataInfo", new StructType()
				.add("deletedbyinference", DataTypes.BooleanType))
		.add("source", DataTypes.StringType)
		.add("target", DataTypes.StringType)
		.add("relClass", DataTypes.StringType);

	// relClass is kept (not dropped) because it becomes the "sem" column below
	Dataset<Row> relation = spark
		.read()
		.schema(rp)
		.json(relationPath)
		.filter("datainfo.deletedbyinference != true")
		.filter(
			"relClass == '" + RelationType.DOCUMENTS.label + "' or " +
				"relClass == '" + RelationType.CITATION.label + "' or " +
				"relClass == '" + RelationType.PART.label + "' or " +
				"relClass == '" + RelationType.SUPPLEMENT.label + "' or " +
				"relClass == '" + RelationType.VERSION.label + "'")
		.drop("datainfo");

	// NOTE(review): only publications feed the MinProduct view, so relations whose
	// target is a dataset/software/other product are lost in the join — confirm intended
	Dataset<Row> result = spark
		.read()
		.schema(Encoders.bean(Result.class).schema())
		.json(inputPath + "publication")
		.filter(
			"datainfo.deletedbyinference != true and " +
				"datainfo.invisible != true")
		.selectExpr("id", "title[0].value as title", "pid");

	result.createOrReplaceTempView("res");

	// one row per (result, pid); regrouped right below by result id
	String query = "select id, pide.qualifier.classid as schema, pide.value as pid, title " +
		"from res " +
		"lateral view explode (pid) p as pide ";

	Dataset<MinProduct> minProduct = spark
		.sql(query)
		.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
		.mapGroups((MapGroupsFunction<String, Row, MinProduct>) (k, v) -> {
			MinProduct mp = new MinProduct();
			mp.setLocal_identifier(k);
			// the title is taken from the first row; every row contributes its pid
			Row r = v.next();
			mp.setTitle(r.getAs("title"));
			addProductPid(mp, r);
			v.forEachRemaining(row -> addProductPid(mp, row));
			return mp;
		}, Encoders.bean(MinProduct.class));

	relation
		.join(minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
		.selectExpr("source", "local_identifier", "title", "doi", "pmcid", "pmid", "arxivid", "relClass as sem")
		.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
		.mapGroups((MapGroupsFunction<String, Row, ProductsRelation>) (k, v) -> {
			ProductsRelation pr = new ProductsRelation();
			pr.setResultId(k);
			addResulRelations(pr, v);
			return pr;
		}, Encoders.bean(ProductsRelation.class))
		.write()
		.mode(SaveMode.Overwrite)
		.option("compression", "gzip")
		.json(workingDir + "relations/related_products");

}
|
|
|
|
private static void addResulRelations(ProductsRelation pr, Iterator<Row> v) {
|
|
pr.setRelated_products(new ArrayList<>());
|
|
Map<String, ArrayList<MinProduct>> hashMap = new HashMap<>();
|
|
while (v.hasNext()) {
|
|
Row next = v.next();
|
|
String sem = next.getAs("sem");
|
|
if (!hashMap.containsKey(sem))
|
|
hashMap.put(sem, new ArrayList<>());
|
|
hashMap.get(sem).add(getMinProduct(next));
|
|
}
|
|
|
|
hashMap
|
|
.keySet()
|
|
.stream()
|
|
.forEach(key -> pr.getRelated_products().add(Relations.newInstance(key, hashMap.get(key))));
|
|
}
|
|
|
|
private static MinProduct getMinProduct(Row next) {
|
|
MinProduct mp = new MinProduct();
|
|
mp.setLocal_identifier(next.getAs("local_identifier"));
|
|
if (Optional.ofNullable(next.getAs("doi")).isPresent())
|
|
mp.setDoi(next.getAs("doi"));
|
|
if (Optional.ofNullable(next.getAs("pmid")).isPresent())
|
|
mp.setPmid(next.getAs("pmid"));
|
|
if (Optional.ofNullable(next.getAs("pmcid")).isPresent())
|
|
mp.setPmcid(next.getAs("pmcid"));
|
|
if (Optional.ofNullable(next.getAs("arxivid")).isPresent())
|
|
mp.setArxivid(next.getAs("arxivid"));
|
|
return mp;
|
|
}
|
|
|
|
private static void addProductPid(MinProduct mp, Row next) {
|
|
String schema = next.getAs("schema");
|
|
if (Optional.ofNullable(schema).isPresent()) {
|
|
switch (schema) {
|
|
case "doi":
|
|
mp.setDoi(next.getAs("pid"));
|
|
break;
|
|
case "pmcid":
|
|
mp.setPmcid(next.getAs("pid"));
|
|
break;
|
|
case "pmid":
|
|
mp.setPmid(next.getAs("pid"));
|
|
break;
|
|
case "arXiv":
|
|
mp.setArxivid(next.getAs("pid"));
|
|
break;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
private static void selectFundingRelations(SparkSession spark, String inputPath, String workingDir,
|
|
String relationPath) {
|
|
final StructType tp = new StructType()
|
|
.add(
|
|
"dataInfo", new StructType()
|
|
.add("deletedbyinference", DataTypes.BooleanType))
|
|
.add("id", DataTypes.StringType);
|
|
final StructType rp = new StructType()
|
|
.add(
|
|
"dataInfo", new StructType()
|
|
.add("deletedbyinference", DataTypes.BooleanType))
|
|
.add("source", DataTypes.StringType)
|
|
.add("target", DataTypes.StringType)
|
|
.add("relClass", DataTypes.StringType);
|
|
|
|
Dataset<Row> relation = spark
|
|
.read()
|
|
.schema(rp)
|
|
.json(relationPath)
|
|
.filter(
|
|
"datainfo.deletedbyinference != true and " +
|
|
"relClass == '" + RelationType.RESULT_OUTCOME_FUNDING.label + "'")
|
|
.drop("datainfo", "relClass");
|
|
Dataset<Row> projects = Utils
|
|
.readPath(spark, inputPath + "project", Project.class)
|
|
.filter(
|
|
(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
|
|
p.getFundingtree().size() > 0
|
|
&&
|
|
Utils
|
|
.getFunderName(p.getFundingtree().get(0).getValue())
|
|
.equalsIgnoreCase("European Commission"))
|
|
|
|
.map((MapFunction<Project, Tuple5<String, String, String, String, String>>) p -> {
|
|
String id = p.getId();
|
|
String acronym = "";
|
|
if (Optional.ofNullable(p.getAcronym()).isPresent())
|
|
acronym = p.getAcronym().getValue();
|
|
String title = "";
|
|
if (Optional.ofNullable(p.getTitle()).isPresent())
|
|
title = p.getTitle().getValue();
|
|
String funder = Utils.getFunderName(p.getFundingtree().get(0).getValue());
|
|
String code = p.getCode().getValue();
|
|
return new Tuple5<>(id, acronym, title, funder, code);
|
|
}, Encoders
|
|
.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING()))
|
|
.selectExpr("_1 as id", "_2 as acronym", "_3 as title", "_4 as funder", "_5 as code");
|
|
|
|
relation
|
|
.join(projects, relation.col("target").equalTo(projects.col("id")))
|
|
.drop("target")
|
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
|
.mapGroups((MapGroupsFunction<String, Row, GrantRelation>) (k, v) -> {
|
|
GrantRelation gr = new GrantRelation();
|
|
gr.setResultId(k);
|
|
addFunding(gr, v);
|
|
return gr;
|
|
}, Encoders.bean(GrantRelation.class))
|
|
// .show(false);
|
|
.write()
|
|
.mode(SaveMode.Overwrite)
|
|
.option("compression", "gzip")
|
|
.json(workingDir + "relations/funding");
|
|
}
|
|
|
|
private static void addFunding(GrantRelation gr, Iterator<Row> v) {
|
|
gr.setFunding(new ArrayList<>());
|
|
while (v.hasNext()) {
|
|
gr.getFunding().add(getMinGrant(v.next()));
|
|
|
|
}
|
|
}
|
|
|
|
private static MinGrant getMinGrant(Row next) {
|
|
MinGrant mn = new MinGrant();
|
|
mn.setCode(next.getAs("code"));
|
|
mn.setLocal_identifier(next.getAs("id"));
|
|
mn.setFunder(next.getAs("funder"));
|
|
if (Optional.ofNullable(next.getAs("acronym")).isPresent())
|
|
mn.setTitle(next.getAs("acronym"));
|
|
else
|
|
mn.setTitle(next.getAs("title"));
|
|
return mn;
|
|
}
|
|
|
|
private static void selectAffiliationRelations(SparkSession spark, String inputPath, String workingDir,
|
|
String outputPath) {
|
|
|
|
final StructType rp = new StructType()
|
|
.add(
|
|
"dataInfo", new StructType()
|
|
.add("deletedbyinference", DataTypes.BooleanType))
|
|
.add("source", DataTypes.StringType)
|
|
.add("target", DataTypes.StringType)
|
|
.add("relClass", DataTypes.StringType);
|
|
|
|
Dataset<Row> relation = spark
|
|
.read()
|
|
.schema(rp)
|
|
.json(inputPath + "relation")
|
|
.filter(
|
|
"datainfo.deletedbyinference != true and " +
|
|
"relClass == '" + RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label + "'")
|
|
.drop("datainfo", "relClass");
|
|
|
|
final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);
|
|
|
|
relation
|
|
.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
|
|
.drop("target")
|
|
.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
|
|
.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
|
|
ExtendingOrganization ar = new ExtendingOrganization();
|
|
ar.setEntityId(k);
|
|
addRelevantOrganization(ar, v);
|
|
return ar;
|
|
}, Encoders.bean(ExtendingOrganization.class))
|
|
// .show(false);
|
|
.write()
|
|
.mode(SaveMode.Append)
|
|
.option("compression", "gzip")
|
|
.json("/tmp/miriam/prova/relevantOrganization");
|
|
|
|
}
|
|
|
|
/**
 * Builds the MinOrganization view of the organization entities: one record
 * per non-deleted organization carrying its id, legal name and the pids
 * recognized by addOrganizationPid (ROR, ISNI, FundRef, RingGold, Wikidata).
 *
 * @param spark     the Spark session
 * @param inputPath root path of the graph (must contain the organization entities)
 * @return the dataset of MinOrganization
 */
private static Dataset<MinOrganization> getMinOrganizationDataset(SparkSession spark, String inputPath) {
	Dataset<Row> organization = spark
		.read()
		.schema(Encoders.bean(Organization.class).schema())
		.json(inputPath + "organization")
		.filter("datainfo.deletedbyinference != true")
		.selectExpr("id", "legalname.value as name", "pid");

	organization.createOrReplaceTempView("org");

	// one row per (organization, pid); regrouped right below by organization id.
	// NOTE(review): organizations with an empty pid array are dropped by the
	// lateral view explode — confirm that is intended
	String query = "select id, pide.qualifier.classid as schema, pide.value as pid, name " +
		"from org " +
		"lateral view explode (pid) p as pide ";

	return spark
		.sql(query)
		.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
		.mapGroups((MapGroupsFunction<String, Row, MinOrganization>) (k, v) -> {
			MinOrganization mn = new MinOrganization();
			mn.setLocal_identifier(k);
			// the name is taken from the first row; every row contributes its pid
			Row r = v.next();
			mn.setName(r.getAs("name"));
			addOrganizationPid(mn, r);
			v.forEachRemaining(row -> addOrganizationPid(mn, row));
			return mn;
		}, Encoders.bean(MinOrganization.class));

}
|
|
|
|
private static void addOrganizationPid(MinOrganization mo, Row next) {
|
|
String schema = next.getAs("schema");
|
|
if (Optional.ofNullable(schema).isPresent()) {
|
|
switch (schema) {
|
|
case "ROR":
|
|
mo.setRor(next.getAs("pid"));
|
|
break;
|
|
case "ISNI":
|
|
mo.setIsni(next.getAs("pid"));
|
|
break;
|
|
case "FundRef":
|
|
mo.setFundRef(next.getAs("pid"));
|
|
break;
|
|
case "RingGold":
|
|
mo.setRinGold(next.getAs("pid"));
|
|
break;
|
|
case "Wikidata":
|
|
mo.setWikidata(next.getAs("pid"));
|
|
break;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
private static void addRelevantOrganization(ExtendingOrganization ar, Iterator<Row> v) {
|
|
ar.setRelevant_organization(new ArrayList<>());
|
|
while (v.hasNext())
|
|
ar.getRelevant_organization().add(getMinOrg(v.next()));
|
|
}
|
|
|
|
private static MinOrganization getMinOrg(Row next) {
|
|
MinOrganization mo = new MinOrganization();
|
|
|
|
mo.setLocal_identifier(next.getAs("local_identifier"));
|
|
mo.setName(next.getAs("name"));
|
|
if (Optional.ofNullable(next.getAs("ror")).isPresent())
|
|
mo.setRor(next.getAs("ror"));
|
|
if (Optional.ofNullable(next.getAs("isni")).isPresent())
|
|
mo.setIsni(next.getAs("isni"));
|
|
if (Optional.ofNullable(next.getAs("fundRef")).isPresent())
|
|
mo.setFundRef(next.getAs("fundRef"));
|
|
if (Optional.ofNullable(next.getAs("rinGold")).isPresent())
|
|
mo.setRinGold(next.getAs("rinGold"));
|
|
if (Optional.ofNullable(next.getAs("wikidata")).isPresent())
|
|
mo.setWikidata(next.getAs("wikidata"));
|
|
// return mo;
|
|
// }
|
|
//
|
|
// if (Optional.ofNullable(pids).isPresent())
|
|
// pids.toStream().foreach(pid -> {
|
|
// if (Optional.ofNullable(pid.getQualifier()).isPresent() &&
|
|
// Optional.ofNullable(pid.getQualifier().getClassid()).isPresent())
|
|
// switch (pid.getQualifier().getClassid().toLowerCase()) {
|
|
// case "ror":
|
|
// mo.setRor(pid.getValue());
|
|
// break;
|
|
// case "isni":
|
|
// mo.setIsni(pid.getValue());
|
|
// break;
|
|
// case "fundref":
|
|
// mo.setFundRef(pid.getValue());
|
|
// break;
|
|
// case "ringgold":
|
|
// mo.setRinGold(pid.getValue());
|
|
// break;
|
|
// case "wikidata":
|
|
// mo.setWikidata(pid.getValue());
|
|
// break;
|
|
//
|
|
// }
|
|
// return null;
|
|
// });
|
|
return mo;
|
|
}
|
|
}
|