From 4d53402712905eceefae3ba75428e4f77629b950 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 8 Jul 2021 10:26:21 +0200 Subject: [PATCH] extended ebiLinks to create a dataset before generation of OAF --- .../eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala index 4c304848b..950278aa0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala @@ -2,10 +2,12 @@ package eu.dnetlib.dhp.sx.graph.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.graph.bio import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkEBILinksToOaf { @@ -29,6 +31,11 @@ object SparkEBILinksToOaf { import spark.implicits._ implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + + val ebi_rdd:Dataset[EBILinkItem] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => BioDBToOAF.extractEBILinksFromDump(s))).as[EBILinkItem] + + ebi_rdd.write.mode(SaveMode.Overwrite).save(s"${sourcePath}_dataset") + val ebLinks:Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links!= null) ebLinks.flatMap(j =>BioDBToOAF.parse_ebi_links(j.links))