forked from D-Net/dnet-hadoop
Implemented changes on PubMed dataflow
This commit is contained in:
parent
02ef46535f
commit
e57294ac99
|
@ -6,7 +6,7 @@ import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
|
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
|
||||||
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
|
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
import scala.xml.pull.XMLEventReader
|
import scala.xml.pull.XMLEventReader
|
||||||
|
@ -14,6 +14,26 @@ import scala.xml.pull.XMLEventReader
|
||||||
object SparkCreateBaselineDataFrame {
|
object SparkCreateBaselineDataFrame {
|
||||||
|
|
||||||
|
|
||||||
|
/**
  * Typed aggregator that collapses the (key, PMArticle) pairs it receives into one PMArticle.
  *
  * The buffer starts out as an empty PMArticle. As soon as the buffer holds an article that is
  * non-null and has a non-null pmid, that article is kept and every later candidate is ignored;
  * until then the incoming candidate replaces the buffer.
  */
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
  new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {

    // Returns `preferred` when it is non-null and carries a pmid; otherwise yields `fallback`.
    // `fallback` is by-name so it is only evaluated when `preferred` is rejected.
    private def keepIfIdentified(preferred: PMArticle, fallback: => PMArticle): PMArticle =
      Option(preferred).filter(_.getPmid != null).getOrElse(fallback)

    /** Initial buffer: an empty article (no pmid set by this code). */
    override def zero: PMArticle = new PMArticle()

    /** Folds one input pair into the buffer, keeping the buffer if it is already identified. */
    override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle =
      keepIfIdentified(b, a._2)

    /** Combines two partial buffers, preferring the first one when it is identified. */
    override def merge(b1: PMArticle, b2: PMArticle): PMArticle =
      keepIfIdentified(b1, b2)

    /** The final result is the buffer itself. */
    override def finish(reduction: PMArticle): PMArticle = reduction

    /** Buffer values are encoded with Kryo. */
    override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]

    /** Output values are encoded with Kryo. */
    override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
  }
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
|
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
|
||||||
|
@ -24,6 +44,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
val sc = spark.sparkContext
|
val sc = spark.sparkContext
|
||||||
|
|
||||||
|
@ -39,10 +61,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
|
|
||||||
} ))
|
} ))
|
||||||
|
|
||||||
ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
ds.map(p => (p.getPmid,p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
||||||
|
.agg(pmArticleAggregator.toColumn)
|
||||||
|
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -115,7 +115,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
case "LastName" => {
|
case "LastName" => {
|
||||||
if (currentAuthor != null)
|
if (currentAuthor != null)
|
||||||
currentAuthor.setLastName(text.trim)
|
currentAuthor.setLastName(text.trim)
|
||||||
|
|
||||||
}
|
}
|
||||||
case "ForeName" => if (currentAuthor != null)
|
case "ForeName" => if (currentAuthor != null)
|
||||||
currentAuthor.setForeName(text.trim)
|
currentAuthor.setForeName(text.trim)
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
package eu.dnetlib.dhp.sx.ebi
|
package eu.dnetlib.dhp.sx.ebi
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import com.fasterxml.jackson.databind.SerializationFeature
|
||||||
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.sx.ebi.model.PMParser
|
import eu.dnetlib.dhp.sx.ebi.model.PMParser
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
|
@ -13,10 +16,12 @@ class TestEBI {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEBIData() = {
|
def testEBIData() = {
|
||||||
|
|
||||||
|
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||||
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
|
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
|
||||||
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
|
||||||
|
|
||||||
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
|
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue