package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.GrantRelation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ProductsRelation;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple5;

/**
 * Selects the relations needed by the SKG-IF dump and materializes them, grouped by source entity,
 * under workingDir/relations.
 *
 * @author miriam.baglioni
 * @Date 16/03/24
 */
public class SelectRelation implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SelectRelation.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SelectRelation.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/skgif/select_relation_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String workingDir = parser.get("workingDir");
		log.info("workingDir: {}", workingDir);

		final String relationPath = parser.get("relationPath");
		log.info("relationPath: {}", relationPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				// organizations results are affiliated to
				createOrganizationExtension(
					spark, inputPath, RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label,
					workingDir + "relations/result_relevant_organizations", relationPath);
				// grants funding results
				selectFundingRelations(spark, inputPath, workingDir, relationPath);
				// products related to other products
				selectProductRelation(spark, inputPath, workingDir, relationPath);
				// organizations providing datasources
				createOrganizationExtension(
					spark, inputPath, RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label,
					workingDir + "relations/datasource_providing_organization", relationPath);
				// organizations participating in projects
				createOrganizationExtension(
					spark, inputPath, RelationType.PROJECT_HAS_PARTICIPANT_ORGANIZATION.label,
					workingDir + "relations/project_partecipating_organization", relationPath);
			});
	}
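
	/**
	 * Generic extraction step: selects from the relation dump the triples having the given semantics
	 * (relClass), resolves the target against the minimal organization view and groups the matched
	 * organizations by source entity identifier.
	 */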
	private static void createOrganizationExtension(SparkSession spark, String inputPath, String relationSem,
		String outputPath, String relationPath) {
		// minimal relation schema: only the provenance flag, the endpoints and the semantics are needed
		final StructType rp = new StructType()
			.add(
				"dataInfo", new StructType()
					.add("deletedbyinference", DataTypes.BooleanType))
			.add("source", DataTypes.StringType)
			.add("target", DataTypes.StringType)
			.add("relClass", DataTypes.StringType);

		Dataset<Row> relation = spark
			.read()
			.schema(rp)
			.json(relationPath)
			.filter(
				"dataInfo.deletedbyinference != true and " +
					"relClass == '" + relationSem + "'")
			.drop("dataInfo", "relClass");

		final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);

		relation
			.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
			.drop("target")
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
				ExtendingOrganization ar = new ExtendingOrganization();
				ar.setEntityId(k);
				addRelevantOrganization(ar, v);
				return ar;
			}, Encoders.bean(ExtendingOrganization.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	/**
	 * Superseded by {@link #createOrganizationExtension(SparkSession, String, String, String, String)},
	 * kept for reference: selects the organizations providing a datasource.
	 */
	private static void selectDatasourceOrganizationRelation(SparkSession spark, String inputPath, String workingDir,
		String outputPath) {
		final StructType rp = new StructType()
			.add(
				"dataInfo", new StructType()
					.add("deletedbyinference", DataTypes.BooleanType))
			.add("source", DataTypes.StringType)
			.add("target", DataTypes.StringType)
			.add("relClass", DataTypes.StringType);

		Dataset<Row> relation = spark
			.read()
			.schema(rp)
			.json(inputPath + "relation")
			.filter(
				"dataInfo.deletedbyinference != true and " +
					"relClass == '" + RelationType.DATASOURCE_PROVIDED_BY_ORGANIZATION.label + "'")
			.drop("dataInfo", "relClass");

		final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);

		relation
			.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
			.drop("target")
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
				ExtendingOrganization ar = new ExtendingOrganization();
				ar.setEntityId(k);
				addRelevantOrganization(ar, v);
				return ar;
			}, Encoders.bean(ExtendingOrganization.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}
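
	/**
	 * Selects the product-to-product relations (documents, citation, part, supplement and version
	 * semantics), resolves the target against a minimal product view (identifier, title, pids) and
	 * groups the related products by source product and relation semantics.
	 */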
	private static void selectProductRelation(SparkSession spark, String inputPath, String workingDir,
		String relationPath) {
		final StructType rp = new StructType()
			.add(
				"dataInfo", new StructType()
					.add("deletedbyinference", DataTypes.BooleanType))
			.add("source", DataTypes.StringType)
			.add("target", DataTypes.StringType)
			.add("relClass", DataTypes.StringType);

		Dataset<Row> relation = spark
			.read()
			.schema(rp)
			.json(relationPath)
			.filter("dataInfo.deletedbyinference != true")
			.filter(
				"relClass == '" + RelationType.DOCUMENTS.label + "' or " +
					"relClass == '" + RelationType.CITATION.label + "' or " +
					"relClass == '" + RelationType.PART.label + "' or " +
					"relClass == '" + RelationType.SUPPLEMENT.label + "' or " +
					"relClass == '" + RelationType.VERSION.label + "'")
			.drop("dataInfo");

		Dataset<Row> result = spark
			.read()
			.schema(Encoders.bean(Result.class).schema())
			.json(inputPath + "publication")
			.filter(
				"dataInfo.deletedbyinference != true and " +
					"dataInfo.invisible != true")
			.selectExpr("id", "title[0].value as title", "pid");

		result.createOrReplaceTempView("res");

		// explode the pid array so that each (id, pid) pair becomes a row
		String query = "select id, pide.qualifier.classid as schema, pide.value as pid, title " +
			"from res " +
			"lateral view explode (pid) p as pide ";

		Dataset<MinProduct> minProduct = spark
			.sql(query)
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, MinProduct>) (k, v) -> {
				MinProduct mp = new MinProduct();
				mp.setLocal_identifier(k);
				Row r = v.next();
				mp.setTitle(r.getAs("title"));
				addProductPid(mp, r);
				v.forEachRemaining(row -> addProductPid(mp, row));
				return mp;
			}, Encoders.bean(MinProduct.class));

		relation
			.join(minProduct, relation.col("target").equalTo(minProduct.col("local_identifier")))
			.selectExpr(
				"source", "local_identifier", "title", "doi", "pmcid", "pmid", "arxivid",
				"relClass as sem")
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, ProductsRelation>) (k, v) -> {
				ProductsRelation pr = new ProductsRelation();
				pr.setResultId(k);
				addResultRelations(pr, v);
				return pr;
			}, Encoders.bean(ProductsRelation.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "relations/related_products");
	}

	// groups the related products by relation semantics before adding them to the bean
	private static void addResultRelations(ProductsRelation pr, Iterator<Row> v) {
		pr.setRelated_products(new ArrayList<>());
		Map<String, List<MinProduct>> hashMap = new HashMap<>();
		while (v.hasNext()) {
			Row next = v.next();
			String sem = next.getAs("sem");
			if (!hashMap.containsKey(sem))
				hashMap.put(sem, new ArrayList<>());
			hashMap.get(sem).add(getMinProduct(next));
		}
		hashMap
			.keySet()
			.forEach(key -> pr.getRelated_products().add(Relations.newInstance(key, hashMap.get(key))));
	}

	private static MinProduct getMinProduct(Row next) {
		MinProduct mp = new MinProduct();
		mp.setLocal_identifier(next.getAs("local_identifier"));
		// the title is selected in the join above and should follow the product
		mp.setTitle(next.getAs("title"));
		if (Optional.ofNullable(next.getAs("doi")).isPresent())
			mp.setDoi(next.getAs("doi"));
		if (Optional.ofNullable(next.getAs("pmid")).isPresent())
			mp.setPmid(next.getAs("pmid"));
		if (Optional.ofNullable(next.getAs("pmcid")).isPresent())
			mp.setPmcid(next.getAs("pmcid"));
		if (Optional.ofNullable(next.getAs("arxivid")).isPresent())
			mp.setArxivid(next.getAs("arxivid"));
		return mp;
	}

	private static void addProductPid(MinProduct mp, Row next) {
		String schema = next.getAs("schema");
		if (Optional.ofNullable(schema).isPresent()) {
			switch (schema) {
				case "doi":
					mp.setDoi(next.getAs("pid"));
					break;
				case "pmcid":
					mp.setPmcid(next.getAs("pid"));
					break;
				case "pmid":
					mp.setPmid(next.getAs("pid"));
					break;
				case "arXiv":
					mp.setArxivid(next.getAs("pid"));
					break;
			}
		}
	}
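
	/**
	 * Selects the outcome relations linking results to grants, restricted to European Commission
	 * projects, and groups the matched grants by result identifier.
	 */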
	private static void selectFundingRelations(SparkSession spark, String inputPath, String workingDir,
		String relationPath) {
		final StructType rp = new StructType()
			.add(
				"dataInfo", new StructType()
					.add("deletedbyinference", DataTypes.BooleanType))
			.add("source", DataTypes.StringType)
			.add("target", DataTypes.StringType)
			.add("relClass", DataTypes.StringType);

		Dataset<Row> relation = spark
			.read()
			.schema(rp)
			.json(relationPath)
			.filter(
				"dataInfo.deletedbyinference != true and " +
					"relClass == '" + RelationType.RESULT_OUTCOME_FUNDING.label + "'")
			.drop("dataInfo", "relClass");

		// keep only European Commission projects, projected to the few fields needed by MinGrant
		Dataset<Row> projects = Utils
			.readPath(spark, inputPath + "project", Project.class)
			.filter(
				(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
					p.getFundingtree().size() > 0 &&
					Utils
						.getFunderName(p.getFundingtree().get(0).getValue())
						.equalsIgnoreCase("European Commission"))
			.map(
				(MapFunction<Project, Tuple5<String, String, String, String, String>>) p -> {
					String id = p.getId();
					String acronym = "";
					if (Optional.ofNullable(p.getAcronym()).isPresent())
						acronym = p.getAcronym().getValue();
					String title = "";
					if (Optional.ofNullable(p.getTitle()).isPresent())
						title = p.getTitle().getValue();
					String funder = Utils.getFunderName(p.getFundingtree().get(0).getValue());
					String code = p.getCode().getValue();
					return new Tuple5<>(id, acronym, title, funder, code);
				}, Encoders
					.tuple(
						Encoders.STRING(), Encoders.STRING(), Encoders.STRING(), Encoders.STRING(),
						Encoders.STRING()))
			.selectExpr("_1 as id", "_2 as acronym", "_3 as title", "_4 as funder", "_5 as code");

		relation
			.join(projects, relation.col("target").equalTo(projects.col("id")))
			.drop("target")
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, GrantRelation>) (k, v) -> {
				GrantRelation gr = new GrantRelation();
				gr.setResultId(k);
				addFunding(gr, v);
				return gr;
			}, Encoders.bean(GrantRelation.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "relations/funding");
	}

	private static void addFunding(GrantRelation gr, Iterator<Row> v) {
		gr.setFunding(new ArrayList<>());
		while (v.hasNext()) {
			gr.getFunding().add(getMinGrant(v.next()));
		}
	}

	private static MinGrant getMinGrant(Row next) {
		MinGrant mn = new MinGrant();
		mn.setCode(next.getAs("code"));
		mn.setLocal_identifier(next.getAs("id"));
		mn.setFunder(next.getAs("funder"));
		// prefer the acronym as display title, falling back to the full title
		if (Optional.ofNullable(next.getAs("acronym")).isPresent())
			mn.setTitle(next.getAs("acronym"));
		else
			mn.setTitle(next.getAs("title"));
		return mn;
	}

	/**
	 * Superseded by {@link #createOrganizationExtension(SparkSession, String, String, String, String)},
	 * kept for reference: selects the organizations results are affiliated to.
	 */
	private static void selectAffiliationRelations(SparkSession spark, String inputPath, String workingDir,
		String outputPath) {
		final StructType rp = new StructType()
			.add(
				"dataInfo", new StructType()
					.add("deletedbyinference", DataTypes.BooleanType))
			.add("source", DataTypes.StringType)
			.add("target", DataTypes.StringType)
			.add("relClass", DataTypes.StringType);

		Dataset<Row> relation = spark
			.read()
			.schema(rp)
			.json(inputPath + "relation")
			.filter(
				"dataInfo.deletedbyinference != true and " +
					"relClass == '" + RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label + "'")
			.drop("dataInfo", "relClass");

		final Dataset<MinOrganization> minOrganizations = getMinOrganizationDataset(spark, inputPath);

		relation
			.join(minOrganizations, relation.col("target").equalTo(minOrganizations.col("local_identifier")))
			.drop("target")
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("source"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, ExtendingOrganization>) (k, v) -> {
				ExtendingOrganization ar = new ExtendingOrganization();
				ar.setEntityId(k);
				addRelevantOrganization(ar, v);
				return ar;
			}, Encoders.bean(ExtendingOrganization.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}
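
	/**
	 * Builds the minimal organization view (local identifier, legal name, and the ROR, ISNI, FundRef,
	 * RingGold and Wikidata pids) shared by the organization-extension steps.
	 */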
	private static Dataset<MinOrganization> getMinOrganizationDataset(SparkSession spark, String inputPath) {
		Dataset<Row> organization = spark
			.read()
			.schema(Encoders.bean(Organization.class).schema())
			.json(inputPath + "organization")
			.filter("dataInfo.deletedbyinference != true")
			.selectExpr("id", "legalname.value as name", "pid");

		organization.createOrReplaceTempView("org");

		// explode the pid array so that each (id, pid) pair becomes a row
		String query = "select id, pide.qualifier.classid as schema, pide.value as pid, name " +
			"from org " +
			"lateral view explode (pid) p as pide ";

		return spark
			.sql(query)
			.groupByKey((MapFunction<Row, String>) r -> r.getAs("id"), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Row, MinOrganization>) (k, v) -> {
				MinOrganization mn = new MinOrganization();
				mn.setLocal_identifier(k);
				Row r = v.next();
				mn.setName(r.getAs("name"));
				addOrganizationPid(mn, r);
				v.forEachRemaining(row -> addOrganizationPid(mn, row));
				return mn;
			}, Encoders.bean(MinOrganization.class));
	}

	private static void addOrganizationPid(MinOrganization mo, Row next) {
		String schema = next.getAs("schema");
		if (Optional.ofNullable(schema).isPresent()) {
			switch (schema) {
				case "ROR":
					mo.setRor(next.getAs("pid"));
					break;
				case "ISNI":
					mo.setIsni(next.getAs("pid"));
					break;
				case "FundRef":
					mo.setFundRef(next.getAs("pid"));
					break;
				case "RingGold":
					mo.setRinGold(next.getAs("pid"));
					break;
				case "Wikidata":
					mo.setWikidata(next.getAs("pid"));
					break;
			}
		}
	}

	private static void addRelevantOrganization(ExtendingOrganization ar, Iterator<Row> v) {
		ar.setRelevant_organization(new ArrayList<>());
		while (v.hasNext())
			ar.getRelevant_organization().add(getMinOrg(v.next()));
	}

	private static MinOrganization getMinOrg(Row next) {
		MinOrganization mo = new MinOrganization();
		mo.setLocal_identifier(next.getAs("local_identifier"));
		mo.setName(next.getAs("name"));
		if (Optional.ofNullable(next.getAs("ror")).isPresent())
			mo.setRor(next.getAs("ror"));
		if (Optional.ofNullable(next.getAs("isni")).isPresent())
			mo.setIsni(next.getAs("isni"));
		if (Optional.ofNullable(next.getAs("fundRef")).isPresent())
			mo.setFundRef(next.getAs("fundRef"));
		if (Optional.ofNullable(next.getAs("rinGold")).isPresent())
			mo.setRinGold(next.getAs("rinGold"));
		if (Optional.ofNullable(next.getAs("wikidata")).isPresent())
			mo.setWikidata(next.getAs("wikidata"));
		return mo;
	}
}