diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala
index b78f411ee..7a87861db 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala
@@ -60,14 +60,10 @@ object SparkCreateActionset {
 
     val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
 
-
-    entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
 
     entities
       .joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
       .map(p => p._1._2)
       .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
-
-
   }
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
index 97b3cdc99..2fc9623a8 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@@ -114,11 +114,7 @@ object SparkCreateBaselineDataFrame {
         val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}")
         val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
         val i = downloadBaselinePart(u._2)
-        val buffer = Array.fill[Byte](1024)(0)
-        while (i.read(buffer) > 0) {
-          fsDataOutputStream.write(buffer)
-        }
-        i.close()
+        IOUtils.copy(i, fsDataOutputStream)
         println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
         fsDataOutputStream.close()
       }
@@ -182,7 +178,7 @@ object SparkCreateBaselineDataFrame {
 
     downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
 
-    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline_ftp", 2000)
+    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
     val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
       val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
       new PMParser(xml)
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
index f5a98ba5e..4ed6dd8bf 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
@@ -8,6 +8,10 @@
         <property>
             <name>isLookupUrl</name>
            <description>The IS lookUp service endopoint</description>
         </property>
+        <property>
+            <name>targetPath</name>
+            <description>The target path</description>
+        </property>
     </parameters>
 
@@ -22,7 +26,7 @@
             <mode>cluster</mode>
             <name>Convert Baseline to OAF Dataset</name>
             <class>eu.dnetllib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
@@ -34,7 +38,7 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>
             <arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
-            <arg>--targetPath</arg><arg>${baselineWorkingPath}/transformed</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
             <arg>--master</arg><arg>yarn</arg>
             <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
             <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>