BrBETA_dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToA...

package eu.dnetlib.dhp.oa.graph.hostedbymap

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DatasourceInfo, EntityInfo}

import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}


object SparkPrepareHostedByInfoToApply {


  implicit val mapEncoderDSInfo: Encoder[DatasourceInfo] = Encoders.bean(classOf[DatasourceInfo])
  implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

  def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = {
    var lst:List[EntityInfo] = List()


    if (j.getIssnLinking != null && !j.getIssnLinking.equals("")){
      lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
    }
    if (j.getIssnOnline != null && !j.getIssnOnline.equals("")){
      lst = EntityInfo.newInstance(id, j.getIssnOnline, name) :: lst
    }
    if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")){
      lst = EntityInfo.newInstance(id, j.getIssnPrinted, name) :: lst
    }
    lst
  }

  def prepareResultInfo(spark:SparkSession, publicationPath:String) : Dataset[EntityInfo] = {
    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])

    val mapper = new ObjectMapper()

    val dd : Dataset[Publication] = spark.read.textFile(publicationPath)
      .map(r => mapper.readValue(r, classOf[Publication]))

    dd.filter(p => p.getJournal != null ).flatMap(p => getList(p.getId, p.getJournal, ""))

  }


  def toEntityInfo(input:String): EntityInfo = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

    lazy val json: json4s.JValue = parse(input)
    val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
    toEntityItem(c.keys.head, c.values.head)
  }

  def explodeJournalInfo(input: DatasourceInfo): List[EntityInfo] = {
    var lst : List[EntityInfo] = List()
    if (input.getEissn != null && !input.getEissn.equals("")){
      lst = EntityInfo.newInstance(input.getId, input.getEissn, input.getOfficialname, input.getOpenAccess) :: lst
    }
    if (input.getLissn != null && !input.getLissn.equals("")){
      lst = EntityInfo.newInstance(input.getId, input.getLissn, input.getOfficialname, input.getOpenAccess) :: lst
    }
    if (input.getIssn != null && !input.getIssn.equals("")){
      lst = EntityInfo.newInstance(input.getId, input.getIssn, input.getOfficialname, input.getOpenAccess) :: lst
    }

    lst
  }

  def joinDsandHBM(left:Dataset[DatasourceInfo], right:Dataset[HostedByItemType]): Dataset[EntityInfo] = {
    left.joinWith(right,
      left.col("id").equalTo(right.col("id")), "left")
      .map(t2 => {
        val dsi : DatasourceInfo = t2._1
        if(t2._2 != null){
          val hbi : HostedByItemType = t2._2
          dsi.setOpenAccess(hbi.openAccess)
        }
        dsi
      }).flatMap(explodeJournalInfo)
  }

  def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = {

    EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)

  }

  def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
    Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journal_id").equalTo(hbm.col("journal_id")), "left")
      .map(t2 => {
        val res: EntityInfo = t2._1
        if(t2._2 != null ){
          val ds = t2._2
          res.setHb_id(ds.getId)
          res.setOpenaccess(ds.getOpenaccess)
          res.setName(ds.getName)
        }
        res
      }))
  }

  def main(args: Array[String]): Unit = {


    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()


    val graphPath = parser.get("graphPath")

    val outputPath = parser.get("outputPath")
    val hostedByMapPath = parser.get("hostedByMapPath")


    implicit val formats = DefaultFormats


    logger.info("Getting the Datasources")

    import spark.implicits._


    //STEP1: leggere la hostedbymap e trasformarla in entity info
    val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)

   //STEP2: creare la mappa publication id issn, eissn, lissn esplosa
    val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, "$graphPath/publication")

    //STEP3: join resultInfo con hostedByInfo sul journal_id dal result con left
    // e riduzione di tutti i result con lo stesso id in una unica entry con aggiunto l'id della datasource
    joinResHBM(resultInfoDataset, hostedByInfo)
      .write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)


  }


}
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`package eu.dnetlib.dhp.oa.graph.hostedbymap`

			`import com.fasterxml.jackson.databind.ObjectMapper`
			`import eu.dnetlib.dhp.application.ArgumentApplicationParser`
			`import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DatasourceInfo, EntityInfo}`
Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00
			`import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`import org.apache.commons.io.IOUtils`
			`import org.apache.spark.SparkConf`
			`import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}`
			`import org.json4s`
			`import org.json4s.DefaultFormats`
			`import org.json4s.jackson.JsonMethods.parse`
			`import org.slf4j.{Logger, LoggerFactory}`

Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00

Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`object SparkPrepareHostedByInfoToApply {`


Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`implicit val mapEncoderDSInfo: Encoder[DatasourceInfo] = Encoders.bean(classOf[DatasourceInfo])`
			`implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00
			`def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = {`
			`var lst:List[EntityInfo] = List()`


			`if (j.getIssnLinking != null && !j.getIssnLinking.equals("")){`
			`lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst`
			`}`
			`if (j.getIssnOnline != null && !j.getIssnOnline.equals("")){`
			`lst = EntityInfo.newInstance(id, j.getIssnOnline, name) :: lst`
			`}`
			`if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")){`
			`lst = EntityInfo.newInstance(id, j.getIssnPrinted, name) :: lst`
			`}`
			`lst`
			`}`

			`def prepareResultInfo(spark:SparkSession, publicationPath:String) : Dataset[EntityInfo] = {`
			`implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])`

			`val mapper = new ObjectMapper()`

			`val dd : Dataset[Publication] = spark.read.textFile(publicationPath)`
			`.map(r => mapper.readValue(r, classOf[Publication]))`

			`dd.filter(p => p.getJournal != null ).flatMap(p => getList(p.getId, p.getJournal, ""))`

			`}`


Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`def toEntityInfo(input:String): EntityInfo = {`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats`

			`lazy val json: json4s.JValue = parse(input)`
			`val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]`
Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`toEntityItem(c.keys.head, c.values.head)`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`}`

			`def explodeJournalInfo(input: DatasourceInfo): List[EntityInfo] = {`
			`var lst : List[EntityInfo] = List()`
			`if (input.getEissn != null && !input.getEissn.equals("")){`
			`lst = EntityInfo.newInstance(input.getId, input.getEissn, input.getOfficialname, input.getOpenAccess) :: lst`
			`}`
Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`if (input.getLissn != null && !input.getLissn.equals("")){`
			`lst = EntityInfo.newInstance(input.getId, input.getLissn, input.getOfficialname, input.getOpenAccess) :: lst`
			`}`
			`if (input.getIssn != null && !input.getIssn.equals("")){`
			`lst = EntityInfo.newInstance(input.getId, input.getIssn, input.getOfficialname, input.getOpenAccess) :: lst`
			`}`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00
			`lst`
			`}`

Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`def joinDsandHBM(left:Dataset[DatasourceInfo], right:Dataset[HostedByItemType]): Dataset[EntityInfo] = {`
			`left.joinWith(right,`
			`left.col("id").equalTo(right.col("id")), "left")`
			`.map(t2 => {`
			`val dsi : DatasourceInfo = t2._1`
			`if(t2._2 != null){`
			`val hbi : HostedByItemType = t2._2`
			`dsi.setOpenAccess(hbi.openAccess)`
			`}`
			`dsi`
			`}).flatMap(explodeJournalInfo)`
			`}`

			`def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = {`

			`EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)`

			`}`

			`def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {`
			`Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journal_id").equalTo(hbm.col("journal_id")), "left")`
			`.map(t2 => {`
			`val res: EntityInfo = t2._1`
			`if(t2._2 != null ){`
			`val ds = t2._2`
			`res.setHb_id(ds.getId)`
			`res.setOpenaccess(ds.getOpenaccess)`
			`res.setName(ds.getName)`
			`}`
			`res`
			`}))`
			`}`

Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`def main(args: Array[String]): Unit = {`


			`val logger: Logger = LoggerFactory.getLogger(getClass)`
			`val conf: SparkConf = new SparkConf()`
			`val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json")))`
			`parser.parseArgument(args)`
			`val spark: SparkSession =`
			`SparkSession`
			`.builder()`
			`.config(conf)`
			`.appName(getClass.getSimpleName)`
			`.master(parser.get("master")).getOrCreate()`


			`val graphPath = parser.get("graphPath")`

			`val outputPath = parser.get("outputPath")`
			`val hostedByMapPath = parser.get("hostedByMapPath")`


			`implicit val formats = DefaultFormats`


			`logger.info("Getting the Datasources")`

			`import spark.implicits._`


Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`//STEP1: leggere la hostedbymap e trasformarla in entity info`
			`val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00
Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`//STEP2: creare la mappa publication id issn, eissn, lissn esplosa`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, "$graphPath/publication")`

Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00			`//STEP3: join resultInfo con hostedByInfo sul journal_id dal result con left`
			`// e riduzione di tutti i result con lo stesso id in una unica entry con aggiunto l'id della datasource`
			`joinResHBM(resultInfoDataset, hostedByInfo)`
			`.write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)`
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00

			`}`

Hosted By Map - modification in the code to prepare the info needed to apply the HostedByMap. There is no need to join datasources with the hbm: all the information needed is in the hosted by map already 2021-08-02 19:32:59 +02:00
Hosted By Map - first attempt for the creation of intermedia information to be used to applu the hosted by map on the graph entities 2021-07-30 17:56:27 +02:00			`}`