implemented generation of ActionSet
This commit is contained in:
parent 2408083566
commit 25f52e19a4
@@ -1,10 +1,14 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._
import scala.io.Source
@@ -12,8 +16,12 @@ import scala.io.Source

case class HostedByItemType(id: String, officialName: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}

case class DoiBoostAffiliation(PaperId: Long, AffiliationId: Long, GridId: String) {}

object DoiBoostMappingUtil {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  //STATIC STRING
  val MAG = "microsoft"
  val MAG_NAME = "Microsoft Academic Graph"
@@ -30,6 +38,31 @@

  val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")

  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: Dataset =>
        val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
        a.setClazz(classOf[Dataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }
  }

  def retrieveHostedByMap(): Map[String, HostedByItemType] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
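For reference, a minimal sketch of what toActionSet yields for a publication; the id value here is hypothetical, not taken from the commit:

  // sketch only: the id is a made-up example
  val p = new Publication
  p.setId("50|doiboost____::deadbeef")
  val (clazz, json) = DoiBoostMappingUtil.toActionSet(p)
  // clazz == "eu.dnetlib.dhp.schema.oaf.Publication"
  // json  == the Jackson-serialized AtomicAction wrapping p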
@@ -52,210 +85,249 @@

  }

  def generateGridAffiliationId(gridId: String): String = {
    s"10|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
  }

  def fixResult(result: Dataset): Dataset = {
    val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
    if (instanceType.isDefined) {
      result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
    }
    result.getInstance().asScala.foreach(i => {
      val hb = new KeyValue
      hb.setValue("Unknown Repository")
      hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
      i.setHostedby(hb)
    })
    result
  }
  def fixPublication(publication: Publication, hostedByMap: Map[String, HostedByItemType]): Publication = {
    if (publication.getJournal == null)
      return publication

    val issn = publication.getJournal.getIssnPrinted
    val eissn = publication.getJournal.getIssnOnline
    val lissn = publication.getJournal.getIssnLinking
    val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)

    if (instanceType.isDefined) {
      publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
    }

    val item = retrieveHostedByItem(issn, eissn, lissn, hostedByMap)
    publication.getInstance().asScala.foreach(i => {
      val hb = new KeyValue
      if (item != null) {
        hb.setValue(item.officialName)
        hb.setKey(generateDSId(item.id))
        if (item.openAccess)
          i.setAccessright(createQualifier("Open", "dnet:access_modes"))
      }
      else {
        hb.setValue("Unknown Repository")
        hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
      }
      i.setHostedby(hb)
    })

    val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright != null && i.getAccessright.getClassid != null).map(f => f.getAccessright.getClassid)
    if (ar.nonEmpty) {
      if (ar.contains("Open")) {
        publication.setBestaccessright(createQualifier("Open", "dnet:access_modes"))
      }
      else {
        publication.setBestaccessright(createQualifier(ar.head, "dnet:access_modes"))
      }
    }
    publication
  }
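The fold at the end of fixPublication reduces the instance-level access-right classids to a single publication-level bestaccessright. A minimal standalone sketch of the same rule, with illustrative names that are not part of the commit:

  // "Open" wins if any instance is open; otherwise the first classid is kept
  def bestAccessRight(classids: Seq[String]): Option[String] =
    if (classids.isEmpty) None
    else if (classids.contains("Open")) Some("Open")
    else Some(classids.head)

  // bestAccessRight(Seq("Restricted", "Open")) == Some("Open")
  // bestAccessRight(Seq("Closed", "Embargo"))  == Some("Closed")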
  def generateDSId(input: String): String = {

    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|${b}::${DHPUtils.md5(a)}"
  }

  def generateDataInfo(): DataInfo = {
    generateDataInfo("0.9")
  }
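A worked example of generateDSId, matching the idDSGeneration test at the bottom of this diff: only the part after the :: separator is hashed, while the namespace prefix is kept verbatim.

  // generateDSId("doajarticles::0066-782X")
  //   == s"10|doajarticles::${DHPUtils.md5("0066-782X")}"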
  def filterPublication(publication: Publication): Boolean = {

    //Case empty publication
    if (publication == null)
      return false

    //Case publication with no title
    if (publication.getTitle == null || publication.getTitle.size == 0)
      return false

    val s = publication.getTitle.asScala.count(p => p.getValue != null
      && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))

    if (s == 0)
      return false

    // fixes #4360 (test publisher)
    val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null

    if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
      return false
    }

    //Publication with no Author
    if (publication.getAuthor == null || publication.getAuthor.size() == 0)
      return false

    //filter invalid author
    val authors = publication.getAuthor.asScala.map(s => {
      if (s.getFullname.nonEmpty) {
        s.getFullname
      }
      else
        s"${s.getName} ${s.getSurname}"
    })

    val c = authors.count(isValidAuthorName)
    if (c == 0)
      return false

    // fixes #4368
    if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
      return false

    true
  }
  def isValidAuthorName(fullName: String): Boolean = {
    if (fullName == null || fullName.isEmpty)
      return false
    if (invalidName.contains(fullName.toLowerCase.trim))
      return false
    true
  }
  def generateDataInfo(trust: String): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
  }
  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp
  }

  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp.setDataInfo(dataInfo)
    sp
  }
  def createCrossrefCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(CROSSREF)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
    cf
  }

  def createUnpayWallCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(UNPAYWALL)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
    cf
  }

  def createORIDCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(ORCID)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
    cf
  }
  def generateIdentifier(oaf: Result, doi: String): String = {
    val id = DHPUtils.md5(doi.toLowerCase)
    if (oaf.isInstanceOf[Dataset])
      return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
    s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
  }
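The two branches of generateIdentifier differ only in the entity-type prefix (60 for datasets, 50 for everything else). Assuming doiBoostNSPREFIX and SEPARATOR hold "doiboost____" and "::" — both are defined outside this hunk, so the exact values are an assumption:

  // generateIdentifier(someDataset, "10.1000/x")     == "60|doiboost____::" + DHPUtils.md5("10.1000/x")
  // generateIdentifier(somePublication, "10.1000/x") == "50|doiboost____::" + DHPUtils.md5("10.1000/x")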
  def createMAGCollectedFrom(): KeyValue = {
    val cf = new KeyValue
    cf.setValue(MAG_NAME)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
    cf
  }
  def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
    val q = new Qualifier
    q.setClassid(clsName)
    q.setClassname(clsValue)
    q.setSchemeid(schName)
    q.setSchemename(schValue)
    q
  }

  def createQualifier(cls: String, sch: String): Qualifier = {
    createQualifier(cls, cls, sch, sch)
  }

  def asField[T](value: T): Field[T] = {
    val tmp = new Field[T]
    tmp.setValue(value)
    tmp
  }

}
@@ -0,0 +1,80 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import org.apache.hadoop.io.Text
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.{SequenceFileOutputFormat, TextOutputFormat}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

object SparkGenerateDOIBoostActionSet {
  val logger: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

//    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
//    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
//    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
//
//    implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
//
//    val dbPublicationPath = parser.get("dbPublicationPath")
//    val dbDatasetPath = parser.get("dbDatasetPath")
//    val crossRefRelation = parser.get("crossRefRelation")
//    val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
    val workingDirPath = parser.get("targetPath")

//    spark.read.load(dbDatasetPath).as[OafDataset]
//      .map(d => DoiBoostMappingUtil.fixResult(d))
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(dbPublicationPath).as[Publication]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(crossRefRelation).as[Relation]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
//
//    spark.read.load(dbaffiliationRelationPath).as[Relation]
//      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")

    implicit val mapEncoderPub: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

    val d: Dataset[(String, String)] = spark.read.load(s"$workingDirPath/actionSet").as[(String, String)]

    d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414", classOf[Text], classOf[Text], classOf[TextOutputFormat[Text, Text]], classOf[GzipCodec])
  }
}
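One detail worth flagging: the rawset output directory (rawset_a8c2f90b-a3ae-4d6e-8187-47a437156e18_1590223414) is hard-coded. A possible follow-up, sketched as a suggestion and not part of this commit, would generate the name per run:

  import java.util.UUID

  // hypothetical helper: build a unique rawset directory name for each run
  def rawsetName(): String = s"rawset_${UUID.randomUUID()}_${System.currentTimeMillis() / 1000}"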
@@ -1,11 +1,14 @@
package eu.dnetlib.doiboost

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._

object SparkGenerateDoiBoost {
@@ -22,7 +25,7 @@ object SparkGenerateDoiBoost {
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    import spark.implicits._
    val crossrefPublicationPath = parser.get("crossrefPublicationPath")
    val crossrefDatasetPath = parser.get("crossrefDatasetPath")
    val uwPublicationPath = parser.get("uwPublicationPath")
@@ -32,15 +35,16 @@ object SparkGenerateDoiBoost {

    logger.info("Phase 1) repartition and move all the dataset in a same working folder")
    spark.read.load(crossrefPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefPublication")
    spark.read.load(crossrefDatasetPath).as(Encoders.bean(classOf[OafDataset])).map(s => s)(Encoders.kryo[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/crossrefDataset")
    spark.read.load(uwPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/uwPublication")
    spark.read.load(orcidPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/orcidPublication")
    spark.read.load(magPublicationPath).as(Encoders.bean(classOf[Publication])).map(s => s)(Encoders.kryo[Publication]).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/magPublication")

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]

    logger.info("Phase 2) Join Crossref with UnpayWall")
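The .as(Encoders.bean(...)).map(s => s)(Encoders.kryo[...]) chain above re-encodes the bean-backed parquet rows into kryo binary form, so the joins below can carry whole Oaf objects. A generic sketch of the same trick; the helper name is illustrative, not part of the commit:

  // load a bean-encoded parquet dataset and force a kryo representation
  def reEncode[T](spark: SparkSession, path: String, clazz: Class[T]): Dataset[T] = {
    implicit val kryoEncoder: Encoder[T] = Encoders.kryo(clazz)
    spark.read.load(path).as(Encoders.bean(clazz)).map(s => s)
  }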
@@ -73,11 +77,49 @@

    val doiBoostPublication: Dataset[Publication] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]

    val map = DoiBoostMappingUtil.retrieveHostedByMap()

    doiBoostPublication.filter(p => DoiBoostMappingUtil.filterPublication(p)).map(p => DoiBoostMappingUtil.fixPublication(p, map)).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")

    val affiliationPath = parser.get("affiliationPath")
    val paperAffiliationPath = parser.get("paperAffiliationPath")

    val affiliation = spark.read.load(affiliationPath).where(col("GridId").isNotNull).select(col("AffiliationId"), col("GridId"))

    val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))

    val a: Dataset[DoiBoostAffiliation] = paperAffiliation
      .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))).select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId")).as[DoiBoostAffiliation]

    val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)

    magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
      val pub: Publication = item._1._2
      val affiliation = item._2
      val r: Relation = new Relation
      r.setSource(pub.getId)
      r.setTarget(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
      r.setRelType("resultOrganization")
      r.setRelClass("hasAuthorInstitution")
      r.setSubRelType("affiliation")
      r.setDataInfo(pub.getDataInfo)
      r.setCollectedfrom(pub.getCollectedfrom)
      val r1: Relation = new Relation
      r1.setTarget(pub.getId)
      r1.setSource(DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId))
      r1.setRelType("resultOrganization")
      r1.setRelClass("isAuthorInstitutionOf")
      r1.setSubRelType("affiliation")
      r1.setDataInfo(pub.getDataInfo)
      r1.setCollectedfrom(pub.getCollectedfrom)
      List(r, r1)
    })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
  }

}
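The two relations built above differ only in direction and relClass. A hypothetical helper, not in the commit, would remove the duplication:

  // build one affiliation relation; called once per direction
  def buildAffiliationRelation(source: String, target: String, relClass: String, pub: Publication): Relation = {
    val r = new Relation
    r.setSource(source)
    r.setTarget(target)
    r.setRelType("resultOrganization")
    r.setSubRelType("affiliation")
    r.setRelClass(relClass)
    r.setDataInfo(pub.getDataInfo)
    r.setCollectedfrom(pub.getCollectedfrom)
    r
  }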
@@ -0,0 +1,8 @@
[
  {"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true},
  {"paramName": "dp", "paramLongName":"dbPublicationPath", "paramDescription": "the DOIBoost Publication Path", "paramRequired": true},
  {"paramName": "dd", "paramLongName":"dbDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
  {"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the Crossref Relation Path", "paramRequired": true},
  {"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the Affiliation Relation Path", "paramRequired": true},
  {"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
]
@@ -1,9 +1,11 @@
[
  {"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true},
  {"paramName": "cp", "paramLongName":"crossrefPublicationPath", "paramDescription": "the Crossref Publication Path", "paramRequired": true},
  {"paramName": "cd", "paramLongName":"crossrefDatasetPath", "paramDescription": "the Crossref Dataset Path", "paramRequired": true},
  {"paramName": "up", "paramLongName":"uwPublicationPath", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
  {"paramName": "mp", "paramLongName":"magPublicationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
  {"paramName": "op", "paramLongName":"orcidPublicationPath", "paramDescription": "the ORCID Publication Path", "paramRequired": true},
  {"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affiliation Path", "paramRequired": true},
  {"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
  {"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
]
@@ -20,6 +20,18 @@
        <name>orcidPublicationPath</name>
        <description>the ORCID Publication Path</description>
    </property>

    <property>
        <name>affiliationPath</name>
        <description>the Affiliation Path</description>
    </property>

    <property>
        <name>paperAffiliationPath</name>
        <description>the paperAffiliation Path</description>
    </property>

    <property>
        <name>workingDirPath</name>
        <description>the Working Path</description>
@@ -40,7 +52,7 @@

    <start to="GenerateActionSet"/>

    <kill name="Kill">
@@ -75,36 +87,40 @@
            <arg>--uwPublicationPath</arg><arg>${uwPublicationPath}</arg>
            <arg>--magPublicationPath</arg><arg>${magPublicationPath}</arg>
            <arg>--orcidPublicationPath</arg><arg>${orcidPublicationPath}</arg>
            <arg>--affiliationPath</arg><arg>${affiliationPath}</arg>
            <arg>--paperAffiliationPath</arg><arg>${paperAffiliationPath}</arg>
            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="GenerateActionSet"/>
        <error to="Kill"/>
    </action>

    <action name="GenerateActionSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Generate DOIBoost ActionSet</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--dbPublicationPath</arg><arg>${workingDirPath}/doiBoostPublicationFiltered</arg>
            <arg>--dbDatasetPath</arg><arg>${workingDirPath}/crossrefDataset</arg>
            <arg>--crossRefRelation</arg><arg>/data/doiboost/input/crossref/relations</arg>
            <arg>--dbaffiliationRelationPath</arg><arg>${workingDirPath}/doiBoostPublicationAffiliation</arg>
            <arg>--targetPath</arg><arg>${workingDirPath}/actionDataSet</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <!-- <action name="PreprocessMag">-->
    <!--     <spark xmlns="uri:oozie:spark-action:0.2">-->
    <!--         <master>yarn-cluster</master>-->
    <!--         <mode>cluster</mode>-->
    <!--         <name>Convert Mag to Dataset</name>-->
    <!--         <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>-->
    <!--         <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
    <!--         <spark-opts>-->
    <!--             --executor-memory=${sparkExecutorMemory}-->
    <!--             --executor-cores=${sparkExecutorCores}-->
    <!--             --driver-memory=${sparkDriverMemory}-->
    <!--             --conf spark.sql.shuffle.partitions=3840-->
    <!--             ${sparkExtraOPT}-->
    <!--         </spark-opts>-->
    <!--         <arg>--sourcePath</arg><arg>${sourcePath}</arg>-->
    <!--         <arg>--targetPath</arg><arg>${targetPath}</arg>-->
    <!--         <arg>--master</arg><arg>yarn-cluster</arg>-->
    <!--     </spark>-->
    <!--     <ok to="End"/>-->
    <!--     <error to="Kill"/>-->
    <!-- </action>-->

    <end name="End"/>
</workflow-app>
@@ -1,7 +1,11 @@
package eu.dnetlib.dhp.doiboost

import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test

class DoiBoostHostedByMapTest {
@@ -13,4 +17,46 @@ class DoiBoostHostedByMapTest {

  }

  @Test
  def testFilter(): Unit = {
    val conf: SparkConf = new SparkConf()
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master("local[*]").getOrCreate()

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)

    val pub = spark.read.load("/data/doiboost/doiboostPublicationFiltered").as[Publication]

    val mapper = new ObjectMapper()

    val map = DoiBoostMappingUtil.retrieveHostedByMap()

    println(pub.map(p => DoiBoostMappingUtil.fixPublication(p, map)).count())
  }

  @Test
  def idDSGeneration(): Unit = {
    val s = "doajarticles::0066-782X"

    println(DoiBoostMappingUtil.generateDSId(s))
  }

}
File diff suppressed because one or more lines are too long