package eu.dnetlib.dhp.bioschema

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}

import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source

/** Represents the data model of the input Datacite dataset.
  *
  * @param doi the DOI of the record
  * @param timestamp timestamp of the last update date
  * @param isActive true if the record is active, false if it was deleted
  * @param json the native JSON record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}

/*
  The following classes are utility classes used for the mapping from
  Datacite JSON to the OAF schema.
 */
case class RelatedIdentifierType(
  relationType: String,
  relatedIdentifier: String,
  relatedIdentifierType: String
) {}

case class NameIdentifiersType(
  nameIdentifierScheme: Option[String],
  schemeUri: Option[String],
  nameIdentifier: Option[String]
) {}

case class CreatorType(
  nameType: Option[String],
  nameIdentifiers: Option[List[NameIdentifiersType]],
  name: Option[String],
  familyName: Option[String],
  givenName: Option[String],
  affiliation: Option[List[String]]
) {}

case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}

case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}

case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}

case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
) {}

case class DateType(date: Option[String], dateType: Option[String]) {}

/** A relation term together with its inverse term and its OAF sub-relation type. */
case class OAFRelations(relation: String, inverse: String, relType: String)

class BioschemaModelConstants extends Serializable {}

object BioschemaModelConstants {

  val REL_TYPE_VALUE: String = "resultResult"
  val DATE_RELATION_KEY = "RelationDate"
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  val dataInfo: DataInfo = dataciteDataInfo("0.9")

  /** Mapping from a Datacite relation term to its inverse term and OAF sub-relation type.
    * FIX: the original Map literal listed DOCUMENTS and IS_DOCUMENTED_BY twice; the later
    * duplicate keys silently overrode the earlier identical entries and have been removed.
    */
  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    ModelConstants.REFERENCES -> OAFRelations(
      ModelConstants.REFERENCES,
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_REFERENCED_BY -> OAFRelations(
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.REFERENCES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.HAS_PART -> OAFRelations(
      ModelConstants.HAS_PART,
      ModelConstants.IS_PART_OF,
      ModelConstants.PART
    ),
    ModelConstants.IS_PART_OF -> OAFRelations(
      ModelConstants.IS_PART_OF,
      ModelConstants.HAS_PART,
      ModelConstants.PART
    ),
    ModelConstants.IS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_VERSION_OF,
      ModelConstants.HAS_VERSION,
      ModelConstants.VERSION
    ),
    ModelConstants.HAS_VERSION -> OAFRelations(
      ModelConstants.HAS_VERSION,
      ModelConstants.IS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_CONTINUED_BY -> OAFRelations(
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.CONTINUES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.CONTINUES -> OAFRelations(
      ModelConstants.CONTINUES,
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.DOCUMENTS -> OAFRelations(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SOURCE_OF -> OAFRelations(
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DERIVED_FROM -> OAFRelations(
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.CITES -> OAFRelations(
      ModelConstants.CITES,
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITATION
    ),
    ModelConstants.IS_CITED_BY -> OAFRelations(
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITES,
      ModelConstants.CITATION
    ),
    // NOTE(review): IS_VARIANT_FORM_OF maps its inverse to IS_DERIVED_FROM (asymmetric
    // on purpose in the upstream Datacite mapping) — confirm this is intended here too.
    ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
      ModelConstants.IS_VARIANT_FORM_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
      ModelConstants.IS_OBSOLETED_BY,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.REVIEWS -> OAFRelations(
      ModelConstants.REVIEWS,
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEW
    ),
    ModelConstants.IS_REVIEWED_BY -> OAFRelations(
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEWS,
      ModelConstants.REVIEW
    ),
    ModelConstants.COMPILES -> OAFRelations(
      ModelConstants.COMPILES,
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_COMPILED_BY -> OAFRelations(
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.COMPILES,
      ModelConstants.RELATIONSHIP
    )
  )

  /** Lines of the datacite_filter classpath resource; records containing any of these
    * strings are skipped by the transformation.
    */
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null)
    Source.fromInputStream(stream).getLines().toList
  }

  /** Builds the DataInfo carried by mapped entities, with the given trust level. */
  def dataciteDataInfo(trust: String): DataInfo =
    OafMapperUtils.dataInfo(
      false,
      null,
      false,
      false,
      ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
      trust
    )

  // Lenient multi-pattern date formatter for English-style dates (optional sections).
  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )

  // Fallback formatter for Italian-style day-first dates.
  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  // Recognizes EC H2020 / FP7 grant-agreement identifiers and maps them to the
  // corresponding OpenAIRE project id prefix.
  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  // Date-extraction patterns tried in order by extract_date.
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
package eu.dnetlib.dhp.bioschema

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}

/** Transforms Bioschema records (Datacite-shaped JSON) into OAF entities and relations. */
object BioschemaToOAFTransformation {

  val mapper = new ObjectMapper()

  val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    "0.9"
  )

  // Datasource(s) the mapped entities are collected from, keyed by short name.
  val collectedFromMap: Map[String, KeyValue] = {
    val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|ped_________::changeme",
      "PED"
    )
    PEDCollectedFrom.setDataInfo(DATA_INFO)

    Map(
      "ped" -> PEDCollectedFrom
    )
  }

  /** This method should skip the record if the JSON contains invalid text
    * defined in the file datacite_filter.
    *
    * @param json the native JSON record
    * @return true if the record should be skipped
    */
  def skip_record(json: String): Boolean = {
    datacite_filter.exists(f => json.contains(f))
  }

  /** Serializes an Oaf entity/relation into an action-set (class name, AtomicAction JSON) pair. */
  @deprecated("this method will be removed", "dhp")
  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: OafDataset =>
        val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
        a.setClazz(classOf[OafDataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case software: Software =>
        val a: AtomicAction[Software] = new AtomicAction[Software]
        a.setClazz(classOf[Software])
        a.setPayload(software)
        (software.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case orp: OtherResearchProduct =>
        val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
        a.setClazz(classOf[OtherResearchProduct])
        a.setPayload(orp)
        (orp.getClass.getCanonicalName, mapper.writeValueAsString(a))

      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }

  }

  /** @return true if the given embargo end date (yyyy-MM-dd) is in the past. */
  def embargo_end(embargo_end_date: String): Boolean = {
    val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
    val td = LocalDate.now()
    td.isAfter(dt)
  }

  /** Extracts the first recognizable date from free text using Date_regex, then tries to
    * normalize it with the English and Italian formatters. A bare year becomes 01-01-yyyy.
    *
    * @return the ISO local date string, or None when nothing parseable is found
    */
  def extract_date(input: String): Option[String] = {
    val d = Date_regex
      .map(pattern => {
        val matcher = pattern.matcher(input)
        if (matcher.find())
          matcher.group(0)
        else
          null
      })
      .find(s => s != null)

    if (d.isDefined) {
      val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
      try {
        return Some(LocalDate.parse(a_date, df_en).toString)
      } catch {
        case _: Throwable =>
          try {
            return Some(LocalDate.parse(a_date, df_it).toString)
          } catch {
            case _: Throwable =>
              return None
          }
      }
    }
    d
  }

  /** Converts a date expressed in the Thai Buddhist calendar to an ISO local date,
    * or returns the empty string when parsing fails.
    */
  def fix_thai_date(input: String, format: String): String = {
    try {
      val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
      val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
      LocalDate.from(d).toString
    } catch {
      case _: Throwable => ""
    }
  }

  /** Resolves (instance-type qualifier, result-typology qualifier) from the record types,
    * trying resourceType, then schemaOrg, then resourceTypeGeneral against the vocabularies.
    *
    * @return the qualifier pair, or null when no synonym is found
    */
  def getTypeQualifier(
    resourceType: String,
    resourceTypeGeneral: String,
    schemaOrg: String,
    vocabularies: VocabularyGroup
  ): (Qualifier, Qualifier) = {
    if (resourceType != null && resourceType.nonEmpty) {
      val typeQualifier =
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
      if (typeQualifier != null)
        return (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )
    }
    if (schemaOrg != null && schemaOrg.nonEmpty) {
      val typeQualifier =
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
      if (typeQualifier != null)
        return (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )

    }
    if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
      val typeQualifier = vocabularies.getSynonymAsQualifier(
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        resourceTypeGeneral
      )
      if (typeQualifier != null)
        return (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )

    }
    null
  }

  /** Instantiates the concrete Result subclass matching the resolved typology and attaches
    * a single Instance carrying the instance-type qualifier.
    *
    * @return the Result, or null when the type cannot be resolved
    */
  def getResult(
    resourceType: String,
    resourceTypeGeneral: String,
    schemaOrg: String,
    vocabularies: VocabularyGroup
  ): Result = {
    val typeQualifiers: (Qualifier, Qualifier) =
      getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (typeQualifiers == null)
      return null
    val i = new Instance
    i.setInstancetype(typeQualifiers._1)
    typeQualifiers._2.getClassname match {
      case "dataset" =>
        val r = new OafDataset
        r.setInstance(List(i).asJava)
        r
      case "publication" =>
        val r = new Publication
        r.setInstance(List(i).asJava)
        r
      case "software" =>
        val r = new Software
        r.setInstance(List(i).asJava)
        r
      case "other" =>
        val r = new OtherResearchProduct
        r.setInstance(List(i).asJava)
        r
      // FIX: the original match had no default case, so an unexpected typology classname
      // threw a MatchError; the trailing `null` after the match was unreachable. Callers
      // already handle a null Result, so return null instead.
      case _ =>
        null
    }
  }

  /** @return true when the record declares a date with dateType "available". */
  def available_date(input: String): Boolean = {

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val l: List[String] = for {
      JObject(dates) <- json \\ "dates"
      JField("dateType", JString(dateTypes)) <- dates
    } yield dateTypes

    l.exists(p => p.equalsIgnoreCase("available"))

  }

  /** Builds a D-Net identifier of the form prefix|pidtype_::md5(pid). */
  def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
    val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
    s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
  }

  def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
    OafMapperUtils.structuredProperty(dt, q, null)
  }

  /** Builds a result-project outcome relation between a result and a project. */
  def generateRelation(
    sourceId: String,
    targetId: String,
    relClass: String,
    cf: KeyValue,
    di: DataInfo
  ): Relation = {

    val r = new Relation
    r.setSource(sourceId)
    r.setTarget(targetId)
    r.setRelType(ModelConstants.RESULT_PROJECT)
    r.setRelClass(relClass)
    r.setSubRelType(ModelConstants.OUTCOME)
    r.setCollectedfrom(List(cf).asJava)
    r.setDataInfo(di)
    r

  }

  /** Derives an isProducedBy relation from an EC grant-agreement award URI, if it matches
    * one of the funder patterns; otherwise returns an empty list.
    */
  def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
    val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())

    if (match_pattern.isDefined) {
      val m = match_pattern.get._1
      val p = match_pattern.get._2
      val grantId = m.matcher(awardUri).replaceAll("$2")
      val targetId = s"$p${DHPUtils.md5(grantId)}"
      List(generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo))
    } else
      List()

  }

  /** Maps one native JSON record into a list of OAF objects (the Result plus, optionally,
    * its relations).
    *
    * @param input the native JSON record
    * @param ts timestamp of the record (currently unused in the mapping body)
    * @param dateOfCollection collection timestamp (currently unused in the mapping body)
    * @param vocabularies vocabulary group used to resolve types, dates, languages, rights
    * @param exportLinks when true, relatedIdentifiers are mapped to Relation objects
    * @return an empty list when the record is filtered out or its type cannot be resolved
    */
  def generateOAF(
    input: String,
    ts: Long,
    dateOfCollection: Long,
    vocabularies: VocabularyGroup,
    exportLinks: Boolean
  ): List[Oaf] = {
    if (skip_record(input))
      return List()

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
    val resourceTypeGeneral =
      (json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
    val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)

    //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
    val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (result == null)
      return List()

    val pid = (json \ "id").extract[String]

    result.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pid,
          "ped",
          "ped",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )
    // Provisional id; overwritten below by IdentifierFactory.createIdentifier(result).
    result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
    result.setOriginalId(List(pid).asJava)

    result.setDataInfo(dataInfo)

    val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())

    val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())

    result.setTitle(
      titles
        .filter(t => t.title.nonEmpty)
        .map(t => {
          if (t.titleType.isEmpty) {
            OafMapperUtils
              .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
          } else {
            OafMapperUtils.structuredProperty(
              t.title.get,
              t.titleType.get,
              t.titleType.get,
              ModelConstants.DNET_DATACITE_TITLE,
              ModelConstants.DNET_DATACITE_TITLE,
              null
            )
          }
        })
        .asJava
    )

    val dates = (json \\ "dates").extract[List[DateType]]
    val publication_year = (json \\ "publicationYear").extractOrElse[String](null)

    // "issued" date, if any, normalized via extract_date.
    val i_date = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined)
      .find(d => d.dateType.get.equalsIgnoreCase("issued"))
      .map(d => extract_date(d.date.get))
    // NOTE(review): the "available" date is mapped to embargoenddate below — confirm intended.
    val a_date: Option[String] = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
      .map(d => extract_date(d.date.get))
      .find(d => d != null && d.isDefined)
      .map(d => d.get)

    if (a_date.isDefined) {
      result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
    }
    if (i_date.isDefined && i_date.get.isDefined) {
      result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
      result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
    } else if (publication_year != null) {
      result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
      result
        .getInstance()
        .get(0)
        .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
    }

    result.setRelevantdate(
      dates
        .filter(d => d.date.isDefined && d.dateType.isDefined)
        .map(d => (extract_date(d.date.get), d.dateType.get))
        .filter(d => d._1.isDefined)
        .map(d =>
          (
            d._1.get,
            vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
          )
        )
        .filter(d => d._2 != null)
        .map(d => generateOAFDate(d._1, d._2))
        .asJava
    )

    result.setCollectedfrom(List(collectedFromMap("ped")).asJava)

    val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]

    result.setDescription(
      descriptions
        .filter(d => d.description.isDefined)
        .map(d => OafMapperUtils.field(d.description.get, null))
        .filter(s => s != null)
        .asJava
    )

    val publisher = (json \\ "publisher").extractOrElse[String](null)
    if (publisher != null)
      result.setPublisher(OafMapperUtils.field(publisher, null))

    val language: String = (json \\ "language").extractOrElse[String](null)

    if (language != null)
      result.setLanguage(
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
      )

    val instance = result.getInstance().get(0)

    val accessRights: List[String] = for {
      JObject(rightsList) <- json \\ "rightsList"
      JField("rightsUri", JString(rightsUri)) <- rightsList
    } yield rightsUri

    val aRights: Option[AccessRight] = accessRights
      .map(r => {
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
      })
      .find(q => q != null)
      .map(q => {
        val a = new AccessRight
        a.setClassid(q.getClassid)
        a.setClassname(q.getClassname)
        a.setSchemeid(q.getSchemeid)
        a.setSchemename(q.getSchemename)
        a
      })

    val access_rights_qualifier =
      if (aRights.isDefined) aRights.get
      else
        OafMapperUtils.accessRight(
          ModelConstants.UNKNOWN,
          ModelConstants.NOT_AVAILABLE,
          ModelConstants.DNET_ACCESS_MODES,
          ModelConstants.DNET_ACCESS_MODES
        )

    instance.setCollectedfrom(collectedFromMap("ped"))
    instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
    instance.setAccessright(access_rights_qualifier)
    instance.setPid(result.getPid)
    val license = accessRights
      .find(r =>
        r.startsWith("http") && r.matches(
          ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
        )
      )
    if (license.isDefined)
      instance.setLicense(OafMapperUtils.field(license.get, null))

    val awardUris: List[String] = for {
      JObject(fundingReferences) <- json \\ "fundingReferences"
      JField("awardUri", JString(awardUri)) <- fundingReferences
    } yield awardUri

    result.setId(IdentifierFactory.createIdentifier(result))
    var relations: List[Relation] =
      awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)

    if (result.getId == null)
      return List()

    if (exportLinks) {
      val rels: List[RelatedIdentifierType] = for {
        JObject(relIdentifier) <- json \\ "relatedIdentifiers"
        JField("relationType", JString(relationType)) <- relIdentifier
        JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
        JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
      } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)

      relations = relations ::: generateRelations(
        rels,
        result.getId,
        if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
      )
    }
    if (relations != null && relations.nonEmpty) {
      List(result) ::: relations
    } else
      List(result)
  }

  /** Maps relatedIdentifiers (doi/pmid/arxiv only, with a known relation type) to Relations
    * targeting unresolved identifiers.
    */
  private def generateRelations(
    rels: List[RelatedIdentifierType],
    id: String,
    date: String
  ): List[Relation] = {
    rels
      .filter(r =>
        subRelTypeMapping
          .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
          r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
          r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
      )
      .map(r => {
        val rel = new Relation
        rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
        rel.setDataInfo(dataInfo)

        val subRelType = subRelTypeMapping(r.relationType).relType
        rel.setRelType(REL_TYPE_VALUE)
        rel.setSubRelType(subRelType)
        rel.setRelClass(r.relationType)

        val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)

        rel.setProperties(List(dateProps).asJava)

        rel.setSource(id)
        rel.setTarget(
          DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
        )
        // FIX: removed a redundant second setCollectedfrom(...) call (identical to the one
        // above) and a discarded no-op expression that mapped collectedfrom values to a List.
        rel
      })
  }

  /** Builds a datasource id of the form 10|prefix::md5(suffix) from a prefix::suffix input. */
  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|$b::${DHPUtils.md5(a)}"
  }

}
"resourceType": "Protein", + "resourceTypeGeneral": "Dataset" + }, + "creators": [], + "identifiers": [ + { + "identifier": "https://proteinensemble.org/PED00001#P38634_A_1", + "identifierType": "URL" + } + ], + "relatedIdentifiers": [ + { + "relationType": "CitedBy", + "relatedIdentifier": "https://identifiers.org/pubmed:20399186" + }, + { + "relationType": "IsIdenticalTo", + "relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634" + } + ], + "alternateIdentifiers": [ + { + "alternateIdentifier": "https://identifiers.org/uniprot:P38634" + } + ], + "descriptions": [], + "titles": [ + { + "title": "Protein SIC1" + } + ], + "dates": [ + { + "date": "2021-12-09T21:10:30", + "dateType": "Collected" + } + ] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala new file mode 100644 index 000000000..f35749fa4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala @@ -0,0 +1,108 @@ +package eu.dnetlib.dhp.bioschema + +import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest +//import eu.dnetlib.dhp.bioschema.{BioschemaToOAFTransformation, GenerateDataciteDatasetSpark} +import eu.dnetlib.dhp.bioschema.BioschemaToOAFTransformation +import eu.dnetlib.dhp.schema.oaf.Oaf +import org.apache.commons.io.FileUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.{col, count} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.extension.ExtendWith +import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} +import org.mockito.junit.jupiter.MockitoExtension +import org.slf4j.{Logger, LoggerFactory} + +import 
java.nio.file.{Files, Path} +import java.text.SimpleDateFormat +import java.util.Locale +import scala.io.Source + +@ExtendWith(Array(classOf[MockitoExtension])) +class BioschemaDataciteToOAFTest extends AbstractVocabularyTest { + + private var workingDir: Path = null + val log: Logger = LoggerFactory.getLogger(getClass) + + @BeforeEach + def setUp(): Unit = { + + workingDir = Files.createTempDirectory(getClass.getSimpleName) + super.setUpVocabulary() + } + + @AfterEach + def tearDown(): Unit = { + FileUtils.deleteDirectory(workingDir.toFile) + } + + @Test + def testDateMapping: Unit = { + val inputDate = "2021-07-14T11:52:54+0000" + val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) + val dt = ISO8601FORMAT.parse(inputDate) + println(dt.getTime) + + } + +// @Test +// def testConvert(): Unit = { +// +// val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath +// +// val conf = new SparkConf() +// val spark: SparkSession = SparkSession +// .builder() +// .config(conf) +// .appName(getClass.getSimpleName) +// .master("local[*]") +// .getOrCreate() +// +// implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] +// val instance = new GenerateDataciteDatasetSpark(null, null, log) +// val targetPath = s"$workingDir/result" +// +// instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark) +// +// import spark.implicits._ +// +// val nativeSize = spark.read.load(path).count() +// +// assertEquals(100, nativeSize) +// +// val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf] +// +// result +// .map(s => s.getClass.getSimpleName) +// .groupBy(col("value").alias("class")) +// .agg(count("value").alias("Total")) +// .show(false) +// +// val t = spark.read.load(targetPath).count() +// +// assertTrue(t > 0) +// +// spark.stop() +// +// } + + @Test + def testMapping(): Unit = { + val record = Source + 
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json")) + .mkString + + val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) + val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true) + + res.foreach(r => { + println(mapper.writeValueAsString(r)) + println("----------------------------") + + }) + + } + +} diff --git a/dhp-workflows/dhp-bmuse/sitemap.txt b/dhp-workflows/dhp-bmuse/sitemap.txt new file mode 100644 index 000000000..d8ed5ebe1 --- /dev/null +++ b/dhp-workflows/dhp-bmuse/sitemap.txt @@ -0,0 +1,62 @@ +https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612 + +PED + + workingPath + /data/bioschema/ped/ + the working path + + + sitemapUrl + https://proteinensemble.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + + +DISPROT + + workingPath + /data/bioschema/disprot/ + the working path + + + sitemapUrl + https://disprot.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + + +MOBIDB + + workingPath + /data/bioschema/mobidb/ + the working path + + + sitemapUrl + https://mobidb.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + +