From 1d9fdb7367253d4c5f1bc835c2faf36391256aa2 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 28 Oct 2020 12:30:32 +0100 Subject: [PATCH] fixed spark memory issue in SparkSplitOafTODLIEntities --- .../sx/graph/SparkSplitOafTODLIEntities.scala | 109 +++++++++++------- .../input_extract_entities_parameters.json | 3 +- .../dhp/sx/graph/step2/oozie_app/workflow.xml | 81 ++++++++++++- 3 files changed, 149 insertions(+), 44 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala index d0df28b2df..822b162632 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSplitOafTODLIEntities.scala @@ -4,7 +4,6 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} import eu.dnetlib.dhp.sx.ebi.EBIAggregator -import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal} import org.apache.commons.io.IOUtils import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.LoggerFactory @@ -18,38 +17,38 @@ object SparkSplitOafTODLIEntities { } - def main(args: Array[String]): Unit = { - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))) - val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass) - parser.parseArgument(args) - val workingPath: String = parser.get("workingPath") - logger.info(s"Working dir path = $workingPath") + def extract_dataset(spark:SparkSession, workingPath:String) :Unit = { + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset] + + val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf].repartition(4000) + + val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset].repartition(1000) + + + OAFDataset + .filter(s => s != null && s.isInstanceOf[DLIDataset]) + .map(s =>s.asInstanceOf[DLIDataset]) + .union(ebi_dataset) + .map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder)) + .groupByKey(_._1)(Encoders.STRING) + .agg(EBIAggregator.getDLIDatasetAggregator().toColumn) + .map(p => p._2) + .repartition(2000) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset") + + } + + def extract_publication(spark:SparkSession, workingPath:String) :Unit = { implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] - implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset] - implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown] - implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation] - - - - val spark:SparkSession = SparkSession - .builder() - .appName(SparkSplitOafTODLIEntities.getClass.getSimpleName) - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .master(parser.get("master")) - .getOrCreate() - - - val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf] - val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset] - val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication] - val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation] - + val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication].repartition(1000) OAFDataset @@ -60,20 +59,17 @@ object SparkSplitOafTODLIEntities { .groupByKey(_._1)(Encoders.STRING) .agg(EBIAggregator.getDLIPublicationAggregator().toColumn) .map(p => p._2) - .repartition(1000) + .repartition(2000) .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication") - OAFDataset - .filter(s => s != null && s.isInstanceOf[DLIDataset]) - .map(s =>s.asInstanceOf[DLIDataset]) - .union(ebi_dataset) - .map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder)) - .groupByKey(_._1)(Encoders.STRING) - .agg(EBIAggregator.getDLIDatasetAggregator().toColumn) - .map(p => p._2) - .repartition(1000) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset") + } + def extract_unknown(spark:SparkSession, workingPath:String) :Unit = { + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown] + + val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf] OAFDataset .filter(s => s != null && s.isInstanceOf[DLIUnknown]) @@ -82,9 +78,18 @@ object SparkSplitOafTODLIEntities { .groupByKey(_._1)(Encoders.STRING) .agg(EBIAggregator.getDLIUnknownAggregator().toColumn) .map(p => p._2) - .repartition(1000) .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown") + } + + + def extract_relations(spark:SparkSession, workingPath:String) :Unit = { + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation] + + val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf] + val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000) OAFDataset .filter(s => s != null && s.isInstanceOf[Relation]) @@ -94,9 +99,35 @@ object SparkSplitOafTODLIEntities { .groupByKey(_._1)(Encoders.STRING) .agg(EBIAggregator.getRelationAggregator().toColumn) .map(p => p._2) - .repartition(1000) + .repartition(4000) .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation") + } + + + def main(args: Array[String]): Unit = { + val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))) + val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass) + parser.parseArgument(args) + + val workingPath: String = parser.get("workingPath") + val entity:String = parser.get("entity") + logger.info(s"Working dir path = $workingPath") + + val spark:SparkSession = SparkSession + .builder() + .appName(SparkSplitOafTODLIEntities.getClass.getSimpleName) + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .master(parser.get("master")) + .getOrCreate() + + + entity match { + case "publication" => extract_publication(spark, workingPath) + case "dataset" => extract_dataset(spark,workingPath) + case "relation" => extract_relations(spark, workingPath) + case "unknown" => extract_unknown(spark, workingPath) + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json index febcfc8981..7878931af3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json @@ -1,4 +1,5 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true} + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true}, + {"paramName":"e", "paramLongName":"entity", "paramDescription": "the work dir path", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml index fabe7510bb..9d06c42d62 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml @@ -14,30 +14,103 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + ${jobTracker} ${nameNode} yarn-cluster cluster - Extract DLI Entities + Extract DLI Entities (Publication) eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities dhp-graph-mapper-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 ${sparkExtraOPT} -mt yarn-cluster --workingPath${workingPath} + -epublication + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Extract DLI Entities (Dataset) + eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=5000 + ${sparkExtraOPT} + + -mt yarn-cluster + --workingPath${workingPath} + -edataset + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Extract DLI Entities (Unknown) + eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=5000 + ${sparkExtraOPT} + + -mt yarn-cluster + --workingPath${workingPath} + -eunknown + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Extract DLI Entities (Relation) + eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=5000 + ${sparkExtraOPT} + + -mt yarn-cluster + --workingPath${workingPath} + -erelation