package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.col
import org.apache.spark.sql._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._
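
/** Spark job that assembles the DoiBoost publication dataset: it joins the Crossref
  * publications with the UnpayWall, ORCID and MAG records sharing the same identifier,
  * merges the matched records, enriches them with the hosted-by map, and finally
  * derives publication/organization affiliation relations (and the corresponding
  * organization records) from the MAG affiliation tables.
  *
  * Arguments read via the parser (defined in generate_doiboost_params.json): master,
  * workingPath, hostedByMapPath, openaireOrganizationPath, affiliationPath,
  * paperAffiliationPath.
  */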
object SparkGenerateDoiBoost {
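
  /** Extracts the GRID identifiers from an OpenAIRE organization record serialized
    * as JSON, returning (organization id, "unresolved::grid::&lt;grid id&gt;") pairs.
    */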
  def extractIdGRID(input: String): List[(String, String)] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)

    val id: String = (json \ "id").extract[String]

    val grids: List[String] = for {
      JObject(pid)                            <- json \ "pid"
      JField("qualifier", JObject(qualifier)) <- pid
      JField("classid", JString(classid))     <- qualifier
      JField("value", JString(vl))            <- pid
      if classid == "GRID"
    } yield vl

    grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
  }

  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")
      )
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    import spark.implicits._

    val hostedByMapPath = parser.get("hostedByMapPath")
    val workingDirPath = parser.get("workingPath")
    val openaireOrganizationPath = parser.get("openaireOrganizationPath")
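
    // Aggregator that collapses all the (id, Publication) pairs sharing the same key
    // into a single Publication, merging fields and author lists as it goes.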
    val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication]
      with Serializable {

      override def zero: Publication = new Publication

      override def reduce(b: Publication, a: (String, Publication)): Publication = {
        if (b == null) {
          if (a != null && a._2 != null) {
            a._2.setId(a._1)
            return a._2
          }
        } else {
          if (a != null && a._2 != null) {
            b.mergeFrom(a._2)
            b.setId(a._1)
            val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor)
            b.setAuthor(authors)
            return b
          }
        }
        new Publication
      }

      override def merge(b1: Publication, b2: Publication): Publication = {
        if (b1 == null) {
          if (b2 != null)
            return b2
        } else {
          if (b2 != null) {
            b1.mergeFrom(b2)
            val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
            b1.setAuthor(authors)
            if (b2.getId != null && b2.getId.nonEmpty)
              b1.setId(b2.getId)
            return b1
          }
        }
        new Publication
      }

      override def finish(reduction: Publication): Publication = reduction

      override def bufferEncoder: Encoder[Publication] = Encoders.kryo[Publication]

      override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
    }

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
      Encoders.tuple(Encoders.STRING, mapEncoderPub)
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]

    logger.info("Phase 2) Join Crossref with UnpayWall")

    val crossrefPublication: Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))

    val uwPublication: Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
    def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
      val crossrefPub = item._1._2
      if (item._2 != null) {
        val otherPub = item._2._2
        if (otherPub != null) {
          crossrefPub.mergeFrom(otherPub)
          crossrefPub.setAuthor(AuthorMerger.mergeAuthor(crossrefPub.getAuthor, otherPub.getAuthor))
        }
      }
      crossrefPub
    }

    crossrefPublication
      .joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
      .map(applyMerge)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/firstJoin")

    logger.info("Phase 3) Join Result with ORCID")
    val fj: Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
    val orcidPublication: Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
    fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
      .map(applyMerge)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/secondJoin")

    logger.info("Phase 4) Join Result with MAG")
    val sj: Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))

    val magPublication: Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))

    sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
      .map(applyMerge)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/doiBoostPublication")
    val doiBoostPublication: Dataset[(String, Publication)] = spark.read
      .load(s"$workingDirPath/doiBoostPublication")
      .as[Publication]
      .filter(p => DoiBoostMappingUtil.filterPublication(p))
      .map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)

    val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(
      spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)
    )

    doiBoostPublication
      .joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
      .map(DoiBoostMappingUtil.fixPublication)
      .map(p => (p.getId, p))
      .groupByKey(_._1)
      .agg(crossrefAggregator.toColumn)
      .map(p => p._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/doiBoostPublicationFiltered")

    val affiliationPath = parser.get("affiliationPath")
    val paperAffiliationPath = parser.get("paperAffiliationPath")

    val affiliation = spark.read
      .load(affiliationPath)
      .select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))

    val paperAffiliation = spark.read
      .load(paperAffiliationPath)
      .select(col("AffiliationId").alias("affId"), col("PaperId"))
    val a: Dataset[DoiBoostAffiliation] = paperAffiliation
      .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
      .select(
        col("_1.PaperId"),
        col("_2.AffiliationId"),
        col("_2.GridId"),
        col("_2.OfficialPage"),
        col("_2.DisplayName")
      )
      .as[DoiBoostAffiliation]
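
    // Key the filtered publications by their MAG identifier, discarding those
    // that carry no MAG id among their original identifiers.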
    val magPubs: Dataset[(String, Publication)] = spark.read
      .load(s"$workingDirPath/doiBoostPublicationFiltered")
      .as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(
        tupleForJoinEncoder
      )
      .filter(s => s._1 != null)
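
    // Emit a pair of affiliation relations (publication -> organization and back)
    // for every publication/affiliation match; organizations with a GRID id get an
    // "unresolved::grid::" placeholder that is resolved against OpenAIRE below.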
    magPubs
      .joinWith(a, magPubs("_1").equalTo(a("PaperId")))
      .flatMap(item => {
        val pub: Publication = item._1._2
        val affiliation = item._2
        val affId: String =
          if (affiliation.GridId.isDefined)
            s"unresolved::grid::${affiliation.GridId.get.toLowerCase}"
          else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
        val r: Relation = new Relation
        r.setSource(pub.getId)
        r.setTarget(affId)
        r.setRelType(ModelConstants.RESULT_ORGANIZATION)
        r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
        r.setSubRelType(ModelConstants.AFFILIATION)
        r.setDataInfo(pub.getDataInfo)
        r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
        val r1: Relation = new Relation
        r1.setTarget(pub.getId)
        r1.setSource(affId)
        r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
        r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
        r1.setSubRelType(ModelConstants.AFFILIATION)
        r1.setDataInfo(pub.getDataInfo)
        r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
        List(r, r1)
      })(mapEncoderRel)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
    val unresolvedRels: Dataset[(String, Relation)] = spark.read
      .load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
      .as[Relation]
      .map(r => {
        if (r.getSource.startsWith("unresolved"))
          (r.getSource, r)
        else if (r.getTarget.startsWith("unresolved"))
          (r.getTarget, r)
        else
          ("resolved", r)
      })(Encoders.tuple(Encoders.STRING, mapEncoderRel))
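
    // Build the lookup table mapping each "unresolved::grid::" key to the id of the
    // corresponding OpenAIRE organization.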
    val openaireOrganization: Dataset[(String, String)] = spark.read
      .text(openaireOrganizationPath)
      .as[String]
      .flatMap(s => extractIdGRID(s))
      .groupByKey(_._2)
      .reduceGroups((x, y) => if (x != null) x else y)
      .map(_._2)
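
    // Replace the unresolved endpoints with the matching OpenAIRE organization ids
    // and drop the relations that could not be resolved.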
    unresolvedRels
      .joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
      .map { x =>
        val currentRels = x._1._2
        val currentOrgs = x._2
        if (currentOrgs != null)
          if (currentRels.getSource.startsWith("unresolved"))
            currentRels.setSource(currentOrgs._1)
          else
            currentRels.setTarget(currentOrgs._1)
        currentRels
      }
      .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/doiBoostPublicationAffiliation")
    magPubs
      .joinWith(a, magPubs("_1").equalTo(a("PaperId")))
      .map(item => {
        val affiliation = item._2
        if (affiliation.GridId.isEmpty) {
          val o = new Organization
          o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
          o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
          o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
          o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
          if (affiliation.DisplayName.nonEmpty)
            o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
          if (affiliation.OfficialPage.isDefined)
            o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
          o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
          o
        } else
          null
      })
      .filter(o => o != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/doiBoostOrganization")
  }
}