forked from D-Net/dnet-hadoop
implemented generation of ActionSet
This commit is contained in:
parent 2408083566
commit 25f52e19a4
eu.dnetlib.doiboost.DoiBoostMappingUtil (updated)

@@ -1,10 +1,14 @@  (adds the AtomicAction, StringUtils, ObjectMapper and slf4j imports; adds Oaf and Relation to the schema.oaf import)
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._
import scala.io.Source

@@ -12,8 +16,12 @@  (adds the DoiBoostAffiliation case class)

case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}

case class DoiBoostAffiliation(PaperId: Long, AffiliationId: Long, GridId: String) {}

object DoiBoostMappingUtil {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  //STATIC STRING
  val MAG = "microsoft"
  val MAG_NAME = "Microsoft Academic Graph"

@@ -30,6 +38,31 @@  (adds toActionSet)

  val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")

  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: Dataset =>
        val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
        a.setClazz(classOf[Dataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }
  }

  def retrieveHostedByMap(): Map[String, HostedByItemType] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    // ... body unchanged, elided by the diff ...

@@ -52,210 +85,249 @@  (adds generateGridAffiliationId, fixResult and generateDSId; rewrites fixPublication, which previously returned the publication untouched when getJournal was null and keyed the hostedby on the raw item.id; createMAGCollectedFrom now sets MAG_NAME instead of MAG as its value; the remaining methods only gain consistent spacing)
  }

  def generateGridAffiliationId(gridId: String): String = {
    s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
  }

  def fixResult(result: Dataset): Dataset = {
    val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
    if (instanceType.isDefined) {
      result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
    }
    result.getInstance().asScala.foreach(i => {
      val hb = new KeyValue
      hb.setValue("Unknown Repository")
      hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
      i.setHostedby(hb)
    })
    result
  }

  def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
    val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
    val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
    val lissn = if (publication.getJournal == null) null else publication.getJournal.getIssnLinking

    val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
    if (instanceType.isDefined) {
      publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
    }

    val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
    publication.getInstance().asScala.foreach(i => {
      val hb = new KeyValue
      if (item != null) {
        hb.setValue(item.officialName)
        hb.setKey(generateDSId(item.id))
        if (item.openAccess)
          i.setAccessright(createQualifier("Open", "dnet:access_modes"))
        publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
      }
      else {
        hb.setValue("Unknown Repository")
        hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
      }
      i.setHostedby(hb)
    })

    val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright != null && i.getAccessright.getClassid != null).map(f => f.getAccessright.getClassid)
    if (ar.nonEmpty) {
      if (ar.contains("Open")) {
        publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
      }
      else {
        publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes"))
      }
    }
    publication
  }

  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|${b}::${DHPUtils.md5(a)}"
  }

  def generateDataInfo(): DataInfo = {
    generateDataInfo("0.9")
  }

  def filterPublication(publication: Publication): Boolean = {

    //Case empty publication
    if (publication == null)
      return false

    //Case publication with no title
    if (publication.getTitle == null || publication.getTitle.size == 0)
      return false

    val s = publication.getTitle.asScala.count(p => p.getValue != null
      && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))

    if (s == 0)
      return false

    // fixes #4360 (test publisher)
    val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
    if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
      return false
    }

    //Publication with no Author
    if (publication.getAuthor == null || publication.getAuthor.size() == 0)
      return false

    //filter invalid author
    val authors = publication.getAuthor.asScala.map(s => {
      if (s.getFullname.nonEmpty) {
        s.getFullname
      }
      else
        s"${s.getName} ${s.getSurname}"
    })

    val c = authors.count(isValidAuthorName)
    if (c == 0)
      return false

    // fixes #4368
    if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
      return false

    true
  }

  def isValidAuthorName(fullName: String): Boolean = {
    if (fullName == null || fullName.isEmpty)
      return false
    if (invalidName.contains(fullName.toLowerCase.trim))
      return false
    true
  }

  def generateDataInfo(trust: String): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
  }

  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp
  }

  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp.setDataInfo(dataInfo)
    sp
  }

  def createCrossrefCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(CROSSREF)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
    cf
  }

  def createUnpayWallCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(UNPAYWALL)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
    cf
  }

  def createORIDCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(ORCID)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
    cf
  }

  def generateIdentifier(oaf: Result, doi: String): String = {
    val id = DHPUtils.md5(doi.toLowerCase)
    if (oaf.isInstanceOf[Dataset])
      return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
    s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
  }

  def createMAGCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(MAG_NAME)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
    cf
  }

  def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
    val q = new Qualifier
    q.setClassid(clsName)
    q.setClassname(clsValue)
    q.setSchemeid(schName)
    q.setSchemename(schValue)
    q
  }

  def createQualifier(cls: String, sch: String): Qualifier = {
    createQualifier(cls, cls, sch, sch)
  }

  def asField[T](value: T): Field[T] = {
    val tmp = new Field[T]
    tmp.setValue(value)
    tmp
  }

}
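A minimal usage sketch of the new toActionSet helper (not part of the commit; the object name ToActionSetExample and the Publication id are made-up placeholders): each supported Oaf entity is wrapped in an AtomicAction, serialized with the Jackson ObjectMapper, and keyed by its canonical class name, while unsupported types yield null.

import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil

object ToActionSetExample {
  def main(args: Array[String]): Unit = {
    val p = new Publication
    p.setId("50|doi_________::example") // hypothetical identifier, for illustration only
    val (clazz, json) = DoiBoostMappingUtil.toActionSet(p)
    println(clazz) // eu.dnetlib.dhp.schema.oaf.Publication
    println(json)  // the JSON-serialized AtomicAction[Publication]
  }
}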
eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet (new file)

@@ -0,0 +1,80 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import org.apache.hadoop.io.Text
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.{SequenceFileOutputFormat, TextOutputFormat}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}


object SparkGenerateDOIBoostActionSet {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

//    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
//    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
//    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
//
//    implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
//
//    val dbPublicationPath = parser.get("dbPublicationPath")
//    val dbDatasetPath = parser.get("dbDatasetPath")
//    val crossRefRelation = parser.get("crossRefRelation")
//    val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
    val workingDirPath = parser.get("targetPath")

//    spark.read.load(dbDatasetPath).as[OafDataset]
//      .map(d => DoiBoostMappingUtil.fixResult(d))
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(dbPublicationPath).as[Publication]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(crossRefRelation).as[Relation]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(dbaffiliationRelationPath).as[Relation]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

    implicit val mapEncoderPub: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

    val d: Dataset[(String, String)] = spark.read.load(s"$workingDirPath/actionSet").as[(String, String)]

    // SequenceFileOutputFormat  (stray, unused reference in the committed file; commented out here)
    d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414", classOf[Text], classOf[Text], classOf[TextOutputFormat[Text, Text]], classOf[GzipCodec])

  }

}
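A small sketch, not taken from this changeset, assuming the $workingDirPath/actionSet dataset read above really holds the (class name, JSON) tuples produced by toActionSet; the object name PreviewActionSet and the args(0) convention are illustrative only.

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

object PreviewActionSet {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("PreviewActionSet").master("local[*]").getOrCreate()
    implicit val tupleEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

    val workingDirPath = args(0) // hypothetical: the working directory is passed as the only argument
    spark.read.load(s"$workingDirPath/actionSet").as[(String, String)]
      .take(5)
      .foreach { case (clazz, json) => println(s"$clazz -> ${json.take(120)}") }
  }
}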
eu.dnetlib.doiboost.SparkGenerateDoiBoost (updated)

@@ -1,11 +1,14 @@  (adds the Relation, ConversionUtil, functions.col and JavaConverters imports)
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

object SparkGenerateDoiBoost {

@@ -22,7 +25,7 @@
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    import spark.implicits._
    val crossrefPublicationPath = parser.get("crossrefPublicationPath")
    val crossrefDatasetPath = parser.get("crossrefDatasetPath")
    val uwPublicationPath = parser.get("uwPublicationPath")

@@ -32,15 +35,16 @@  (the Phase 1 repartition steps, previously commented out, are now active; a kryo encoder for Relation is added)
    logger.info("Phase 1) repartition and move all the dataset in a same working folder")
    spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
    spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
    spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
    spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
    spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]

    logger.info("Phase 2) Join Crossref with UnpayWall")

@@ -73,11 +77,49 @@  (the filtered publications are now also passed through fixPublication with the hosted-by map, and the MAG paper/affiliation tables are joined to emit affiliation relations)
    val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]

    val map = DoiBoostMappingUtil.retrieveHostedByMap()

    doiBoostPublication.filter(p => DoiBoostMappingUtil.filterPublication(p)).map(p => DoiBoostMappingUtil.fixPublication(p, map)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")

    val affiliationPath = parser.get("affiliationPath")
    val paperAffiliationPath = parser.get("paperAffiliationPath")

    val affiliation = spark.read.load(affiliationPath).where(col("GridId").isNotNull).select(col("AffiliationId"), col("GridId"))

    val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))

    val a: Dataset[DoiBoostAffiliation] = paperAffiliation
      .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))).select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId")).as[DoiBoostAffiliation]

    val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)

    magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
      val pub: Publication = item._1._2
      val affiliation = item._2
      val r: Relation = new Relation
      r.setSource(pub.getId)
      r.setTarget(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
      r.setRelType("resultOrganization")
      r.setRelClass("hasAuthorInstitution")
      r.setSubRelType("affiliation")
      r.setDataInfo(pub.getDataInfo)
      r.setCollectedfrom(pub.getCollectedfrom)
      val r1: Relation = new Relation
      r1.setTarget(pub.getId)
      r1.setSource(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
      r1.setRelType("resultOrganization")
      r1.setRelClass("isAuthorInstitutionOf")
      r1.setSubRelType("affiliation")
      r1.setDataInfo(pub.getDataInfo)
      r1.setCollectedfrom(pub.getCollectedfrom)
      List(r, r1)
    })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
  }

}
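A tiny sketch, not part of the commit, of the organization identifier the two affiliation relations above point to; the GRID id "grid.4691.a" and the object name GridIdExample are example values.

import eu.dnetlib.doiboost.DoiBoostMappingUtil

object GridIdExample {
  def main(args: Array[String]): Unit = {
    // lower-cases and trims the GRID id, then md5-hashes it under the "10|grid________::" prefix,
    // which is the form used as relation source/target above
    println(DoiBoostMappingUtil.generateGridAffiliationId("grid.4691.a"))
  }
}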
New parameter file for SparkGenerateDOIBoostActionSet (loaded from /eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json; the descriptions of crossRefRelation and dbaffiliationRelationPath are corrected here, as the committed file reused the UnpayWall/MAG wording)

@@ -0,0 +1,8 @@
[
  {"paramName": "m", "paramLongName": "master", "paramDescription": "the master name", "paramRequired": true},
  {"paramName": "dp", "paramLongName": "dbPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true},
  {"paramName": "dd", "paramLongName": "dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
  {"paramName": "cr", "paramLongName": "crossRefRelation", "paramDescription": "the Crossref Relation Path", "paramRequired": true},
  {"paramName": "da", "paramLongName": "dbaffiliationRelationPath", "paramDescription": "the Affiliation Relation Path", "paramRequired": true},
  {"paramName": "w", "paramLongName": "targetPath", "paramDescription": "the Working Path", "paramRequired": true}
]
Parameter file for SparkGenerateDoiBoost (updated: affiliationPath and paperAffiliationPath added)

@@ -1,9 +1,11 @@
[
  {"paramName": "m", "paramLongName": "master", "paramDescription": "the master name", "paramRequired": true},
  {"paramName": "cp", "paramLongName": "crossrefPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true},
  {"paramName": "cd", "paramLongName": "crossrefDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
  {"paramName": "up", "paramLongName": "uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
  {"paramName": "mp", "paramLongName": "magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
  {"paramName": "op", "paramLongName": "orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
  {"paramName": "ap", "paramLongName": "affiliationPath", "paramDescription": "the Affiliation Path", "paramRequired": true},
  {"paramName": "pa", "paramLongName": "paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
  {"paramName": "w", "paramLongName": "workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
]
DOIBoost Oozie workflow (updated)

@@ -20,6 +20,18 @@  (adds the affiliationPath and paperAffiliationPath properties)
            <name>orcidPublicationPath</name>
            <description>the ORCID Publication Path</description>
        </property>

        <property>
            <name>affiliationPath</name>
            <description>the Affiliation Path</description>
        </property>

        <property>
            <name>paperAffiliationPath</name>
            <description>the paperAffiliation Path</description>
        </property>

        <property>
            <name>workingDirPath</name>
            <description>the Working Path</description>

@@ -40,7 +52,7 @@  (the start node previously pointed at CreateDOIBoost)
    <start to="GenerateActionSet"/>

    <kill name="Kill">

@@ -75,36 +87,40 @@  (CreateDOIBoost now forwards the affiliation paths and hands over to the new GenerateActionSet action; the commented-out PreprocessMag action has been removed)
            <arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
            <arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
            <arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
            <arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
            <arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="GenerateActionSet"/>
        <error to="Kill"/>
    </action>

    <action name="GenerateActionSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Generate DOIBoost ActionSet</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--dbPublicationPath</arg><arg>${workingDirPath}/doiBoostPublicationFiltered</arg>
            <arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
            <arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg>
            <arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
            <arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
eu.dnetlib.dhp.doiboost.DoiBoostHostedByMapTest (updated)

@@ -1,7 +1,11 @@
package eu.dnetlib.dhp.doiboost

import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test

class DoiBoostHostedByMapTest {

@@ -13,4 +17,46 @@  (adds the testFilter and idDSGeneration tests)
  }

  @Test
  def testFilter(): Unit = {
    val conf: SparkConf = new SparkConf()
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master("local[*]").getOrCreate()

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)

    val pub = spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]

    val mapper = new ObjectMapper()

    val map = DoiBoostMappingUtil.retrieveHostedByMap()

    println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
  }

  @Test
  def idDSGeneration(): Unit = {
    val s = "doajarticles::0066-782X"

    println(DoiBoostMappingUtil.generateDSId(s))
  }

}
File diff suppressed because one or more lines are too long