From d9cbca83f7911b53190a9f999f5e02a7f00382ac Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Thu, 28 Oct 2021 16:13:24 +0200
Subject: [PATCH 1/4] moved filter on next phase

---
 .../dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
index 5ca7d9782..db93bc43f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
@@ -75,7 +75,7 @@ object SparkResolveRelation {
           if (targetResolved != null && targetResolved._1.nonEmpty)
             currentRelation.setTarget(targetResolved._1)
           currentRelation
-      }.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
+      }
       .write
       .mode(SaveMode.Overwrite)
       .save(s"$workingPath/relation_resolved")
@@ -88,6 +88,7 @@ object SparkResolveRelation {
     fs.rename(new Path(s"$graphBasePath/relation"), new Path(s"$workingPath/relation"))
 
     spark.read.load(s"$workingPath/relation_resolved").as[Relation]
+      .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
       .map(r => mapper.writeValueAsString(r))
       .write
       .option("compression", "gzip")

From 1225ba0b9223ae6049ea5c80772dcf72407a01b8 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 28 Oct 2021 16:18:17 +0200
Subject: [PATCH 2/4] [resolution] increasing number of partitions to avoid
 OOM

---
 .../eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
index 31cc53ae3..52f0e6e9d 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
@@ -24,7 +24,7 @@
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.shuffle.partitions=8000
+            --conf spark.sql.shuffle.partitions=15000
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
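Patch 1 defers the "unresolved" filter from the phase that writes relation_resolved to the phase that emits the final relation output, so the intermediate dataset keeps relations whose endpoints are still unresolved. A minimal, self-contained sketch of that two-phase pattern follows; the SimpleRel case class, the /tmp working path, and the JSON rendering are hypothetical stand-ins for the real Relation model and HDFS layout, not the project's code:

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

// Hypothetical stand-in for the real Relation model.
case class SimpleRel(source: String, target: String)

object ResolveFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._
    val workingPath = "/tmp/resolve_sketch" // illustrative path

    val resolved = Seq(
      SimpleRel("50|x", "50|y"),
      SimpleRel("unresolved::10.1000/182::doi", "50|y")
    ).toDS()

    // Phase 1: persist everything, unresolved endpoints included.
    resolved.write.mode(SaveMode.Overwrite).save(s"$workingPath/relation_resolved")

    // Phase 2: apply the "unresolved" filter only when emitting the final output.
    spark.read.load(s"$workingPath/relation_resolved").as[SimpleRel]
      .filter(r => !r.source.startsWith("unresolved") && !r.target.startsWith("unresolved"))
      .map(r => s"""{"source":"${r.source}","target":"${r.target}"}""")
      .write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingPath/relation")

    spark.stop()
  }
}
```

Keeping the unfiltered intermediate around means the unresolved relations remain inspectable between the two phases instead of being discarded at write time.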
From f78afb5ef99b27e070b835d8f179726186c2e31c Mon Sep 17 00:00:00 2001
From: Antonis Lempesis
Date: Mon, 1 Nov 2021 15:42:29 +0200
Subject: [PATCH 3/4] removed hardcoded reference

---
 .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
index b977302df..e892da0be 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql
@@ -65,7 +65,7 @@ FROM ${stats_db_name}.project_tmp p
 UPDATE ${stats_db_name}.publication_tmp
 SET delayed = 'yes'
 WHERE publication_tmp.id IN (SELECT distinct r.id
-                             FROM stats_wf_db_obs.result r,
+                             FROM ${stats_db_name}.result r,
                                   ${stats_db_name}.project_results pr,
                                   ${stats_db_name}.project_tmp p
                              WHERE r.id = pr.result
@@ -75,7 +75,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id
 UPDATE ${stats_db_name}.dataset_tmp
 SET delayed = 'yes'
 WHERE dataset_tmp.id IN (SELECT distinct r.id
-                         FROM stats_wf_db_obs.result r,
+                         FROM ${stats_db_name}.result r,
                              ${stats_db_name}.project_results pr,
                              ${stats_db_name}.project_tmp p
                          WHERE r.id = pr.result
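Patch 3 replaces the hardcoded stats_wf_db_obs database with the ${stats_db_name} placeholder that the surrounding statements in step11.sql already use; the workflow machinery substitutes the placeholder at run time. A rough sketch of how such placeholder resolution could look is below; the resolve helper, the parameter map, and the database name are purely illustrative assumptions, since the actual substitution is performed by the workflow engine, not by project code:

```scala
// Hypothetical sketch of ${...} placeholder substitution, as applied to
// statements like those in step11.sql. Names and values are illustrative.
object PlaceholderSketch {
  def resolve(template: String, params: Map[String, String]): String =
    params.foldLeft(template) { case (sql, (key, value)) =>
      sql.replace("${" + key + "}", value)
    }

  def main(args: Array[String]): Unit = {
    val sql = "UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes'"
    println(resolve(sql, Map("stats_db_name" -> "openaire_stats")))
    // prints: UPDATE openaire_stats.publication_tmp SET delayed = 'yes'
  }
}
```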
From 7bd224f051598cf0b9370e5b33f6862477026de1 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Tue, 2 Nov 2021 15:58:15 +0100
Subject: [PATCH 4/4] implement first version of scholexplorer integration for
 the generation of final graph

---
 .../dhp/sx/graph/SparkConvertRDDtoDataset.scala        |  2 +-
 .../eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala   | 10 +++++++---
 .../eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala |  2 +-
 .../dhp/sx/graph/finalGraph/oozie_app/workflow.xml     |  4 ++--
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
index cb41d6134..141b7b073 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
@@ -59,7 +59,7 @@ object SparkConvertRDDtoDataset {
 
     log.info("Converting Relation")
 
-    val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation]))
+    val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation])).filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
     spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala
index 0a7fc18fb..e4fcd2782 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala
@@ -51,10 +51,14 @@ object SparkCreateScholix {
 
     relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
       .map { input: ((String, Relation), (String, ScholixSummary)) =>
-        val rel: Relation = input._1._2
-        val source: ScholixSummary = input._2._2
-        (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
+        if (input._1!= null && input._2!= null) {
+          val rel: Relation = input._1._2
+          val source: ScholixSummary = input._2._2
+          (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
+        }
+        else null
       }(Encoders.tuple(Encoders.STRING, scholixEncoder))
+      .filter(r => r!= null)
       .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
 
     val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
index 4dafd4fa3..93c554e04 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@@ -289,7 +289,7 @@ object ScholixUtils {
     if (r.getInstance() == null || r.getInstance().isEmpty)
       return List()
     r.getInstance().asScala.filter(i => i.getUrl!= null && !i.getUrl.isEmpty)
-
+      .filter(i => i.getPid!= null && i.getUrl != null)
       .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
       .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
   }

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml
index d8eb1fc80..17996c82c 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml
@@ -54,7 +54,7 @@
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.shuffle.partitions=5000
+            --conf spark.sql.shuffle.partitions=20000
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
@@ -79,7 +79,7 @@
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.shuffle.partitions=6000
+            --conf spark.sql.shuffle.partitions=20000
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
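The SparkCreateScholix change in patch 4 guards a left join: relations with no matching summary arrive with a null right side, so the map returns null for them and a trailing filter drops those rows before the write. A minimal sketch of that pattern follows, using hypothetical simplified types (Rel, Summary) and a local SparkSession in place of the real Relation/ScholixSummary model:

```scala
import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical simplified stand-ins for Relation and ScholixSummary.
case class Rel(source: String, target: String)
case class Summary(id: String, title: String)

object LeftJoinGuardSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    val rels: Dataset[(String, Rel)] = Seq(
      ("50|a", Rel("50|a", "50|b")),
      ("50|c", Rel("50|c", "50|d")) // no summary exists for 50|c
    ).toDS()
    val summaries: Dataset[(String, Summary)] =
      Seq(("50|a", Summary("50|a", "A title"))).toDS()

    rels.joinWith(summaries, rels("_1") === summaries("_1"), "left")
      .map { case (rel, summary) =>
        // Unmatched relations arrive with summary == null.
        if (summary != null) (rel._2.target, summary._2.title) else null
      }
      .filter(_ != null) // drop rows that found no summary, as the patch does
      .show(false)

    spark.stop()
  }
}
```

Materializing nulls and filtering them afterwards keeps the map total over the joined dataset, which is why the patch pairs the null-returning branch with the .filter(r => r!= null) step.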