package eu.dnetlib.doiboost.mag

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

object SparkProcessMAG {
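
  /** Filters out papers without a DOI and, for papers sharing the same normalized DOI,
    * keeps only the most recent record (ConversionUtil.choiceLatestMagArtitcle).
    */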
  def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
    d.where(col("Doi").isNotNull)
      .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
      .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
      .map(_._2)(Encoders.product[MagPapers])
      .map(mp => {
        MagPapers(
          mp.PaperId,
          mp.Rank,
          DoiBoostMappingUtil.normalizeDoi(mp.Doi),
          mp.DocType,
          mp.PaperTitle,
          mp.OriginalTitle,
          mp.BookTitle,
          mp.Year,
          mp.Date,
          mp.Publisher,
          mp.JournalId,
          mp.ConferenceSeriesId,
          mp.ConferenceInstanceId,
          mp.Volume,
          mp.Issue,
          mp.FirstPage,
          mp.LastPage,
          mp.ReferenceCount,
          mp.CitationCount,
          mp.EstimatedCitation,
          mp.OriginalVenue,
          mp.FamilyId,
          mp.CreatedDate
        )
      })(Encoders.product[MagPapers])
  }

  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
      )
    )
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    val workingPath = parser.get("workingPath")
    val targetPath = parser.get("targetPath")

    import spark.implicits._
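    // Publication is a Java bean (not a Scala Product), so Kryo-based encoders are registered explicitly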
    implicit val mapEncoderPubs: Encoder[Publication] =
      org.apache.spark.sql.Encoders.kryo[Publication]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
      Encoders.tuple(Encoders.STRING, mapEncoderPubs)

logger.info("Phase 1) make uninue DOI in Papers:")
|
2020-12-07 19:59:33 +01:00
|
|
|
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
2020-05-20 17:05:46 +02:00
|
|
|
|
|
|
|
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
|
2021-06-29 18:51:11 +02:00
|
|
|
val distinctPaper: Dataset[MagPapers] = getDistinctResults(d)
|
2020-12-07 19:59:33 +01:00
|
|
|
|
|
|
|
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")

    logger.info("Phase 0) Enrich Publication with description")
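    // The source abstracts are stored as inverted indexes; transformPaperAbstract rebuilds them as plain text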
    val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
    pa.map(ConversionUtil.transformPaperAbstract)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/PaperAbstract")

    logger.info("Phase 3) Group Author by PaperId")
    val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]

    val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
    val paperAuthorAffiliation =
      spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]

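    // Denormalize: attach the author record and, when available, the affiliation name to each
    // (PaperId, AuthorId) pair, then collect the full author list per paper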
    paperAuthorAffiliation
      .joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
      .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
        (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
      }
      .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
      .map(s => {
        val mpa = s._1._2
        val af = s._2
        if (af != null) {
          MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
        } else
          mpa
      })
      .groupBy("PaperId")
      .agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_1_paper_authors")

    logger.info(
      "Phase 4) create First Version of publication Entity with Paper Journal and Authors"
    )

    val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]

    val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]

    val paperWithAuthors =
      spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]

    val firstJoin =
      papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
    firstJoin
      .joinWith(
        paperWithAuthors,
        firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
        "left"
      )
      .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_2")
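
    // Re-read the publications keyed by their MAG identifier and enrich them with conference
    // instance details (name, location, start and end date)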
    var magPubs: Dataset[(String, Publication)] =
      spark.read
        .load(s"$workingPath/merge_step_2")
        .as[Publication]
        .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
        .as[(String, Publication)]

    val conference = spark.read
      .load(s"$sourcePath/ConferenceInstances")
      .select(
        $"ConferenceInstanceId".as("ci"),
        $"DisplayName",
        $"Location",
        $"StartDate",
        $"EndDate"
      )

    val conferenceInstance = conference
      .joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
      .select(
        $"_1.ci",
        $"_1.DisplayName",
        $"_1.Location",
        $"_1.StartDate",
        $"_1.EndDate",
        $"_2.PaperId"
      )
      .as[MagConferenceInstance]

    magPubs
      .joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_3")
    val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]

    magPubs = spark.read
      .load(s"$workingPath/merge_step_3")
      .as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
      .as[(String, Publication)]

    magPubs
      .joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithDescription(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_4")

    logger.info("Phase 7) Enrich Publication with FieldOfStudy")

    magPubs = spark.read
      .load(s"$workingPath/merge_step_4")
      .as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
      .as[(String, Publication)]

    val fos = spark.read
      .load(s"$sourcePath/FieldsOfStudy")
      .select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")

    val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
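
    // Collect, for each paper, the list of its fields of study together with their score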
    val paperField = pfos
      .joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
      .select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
      .groupBy($"PaperId")
      .agg(
        collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
          .as("subjects")
      )
      .as[MagFieldOfStudy]

    magPubs
      .joinWith(paperField, col("_1").equalTo(paperField("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithSubject(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/mag_publication")
    spark.read
      .load(s"$workingPath/mag_publication")
      .as[Publication]
      .filter(p => p.getId != null)
      .groupByKey(p => p.getId)
      .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
      .map(_._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$targetPath/magPublication")

  }
}