diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala
new file mode 100644
index 000000000..d44693f5a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala
@@ -0,0 +1,274 @@
+package eu.dnetlib.dhp.bioschema
+
+import eu.dnetlib.dhp.schema.common.ModelConstants
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
+import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
+
+import java.io.InputStream
+import java.time.format.DateTimeFormatter
+import java.util.Locale
+import java.util.regex.Pattern
+import scala.io.Source
+
+/** This class represents the data model of the input Datacite dataset
+ * @param doi the DOI of the record
+ * @param timestamp timestamp of the last update date
+ * @param isActive whether the record is active (false if deleted)
+ * @param json the native JSON record
+ */
+case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
+
+/*
+ The following classes are utility classes used for the mapping from
+ Datacite JSON to the OAF schema
+ */
+case class RelatedIdentifierType(
+ relationType: String,
+ relatedIdentifier: String,
+ relatedIdentifierType: String
+) {}
+
+case class NameIdentifiersType(
+ nameIdentifierScheme: Option[String],
+ schemeUri: Option[String],
+ nameIdentifier: Option[String]
+) {}
+
+case class CreatorType(
+ nameType: Option[String],
+ nameIdentifiers: Option[List[NameIdentifiersType]],
+ name: Option[String],
+ familyName: Option[String],
+ givenName: Option[String],
+ affiliation: Option[List[String]]
+) {}
+
+case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
+
+case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
+
+case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
+
+case class FundingReferenceType(
+ funderIdentifierType: Option[String],
+ awardTitle: Option[String],
+ awardUri: Option[String],
+ funderName: Option[String],
+ funderIdentifier: Option[String],
+ awardNumber: Option[String]
+) {}
+
+case class DateType(date: Option[String], dateType: Option[String]) {}
+
+case class OAFRelations(relation: String, inverse: String, relType: String)
+
+class BioschemaModelConstants extends Serializable {}
+
+object BioschemaModelConstants {
+
+ val REL_TYPE_VALUE: String = "resultResult"
+ val DATE_RELATION_KEY = "RelationDate"
+ val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
+ val DOI_CLASS = "doi"
+ val SUBJ_CLASS = "keywords"
+ val dataInfo: DataInfo = dataciteDataInfo("0.9")
+
+ val subRelTypeMapping: Map[String, OAFRelations] = Map(
+ ModelConstants.REFERENCES -> OAFRelations(
+ ModelConstants.REFERENCES,
+ ModelConstants.IS_REFERENCED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_REFERENCED_BY -> OAFRelations(
+ ModelConstants.IS_REFERENCED_BY,
+ ModelConstants.REFERENCES,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
+ ModelConstants.IS_SUPPLEMENTED_BY,
+ ModelConstants.IS_SUPPLEMENT_TO,
+ ModelConstants.SUPPLEMENT
+ ),
+ ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
+ ModelConstants.IS_SUPPLEMENT_TO,
+ ModelConstants.IS_SUPPLEMENTED_BY,
+ ModelConstants.SUPPLEMENT
+ ),
+ ModelConstants.HAS_PART -> OAFRelations(
+ ModelConstants.HAS_PART,
+ ModelConstants.IS_PART_OF,
+ ModelConstants.PART
+ ),
+ ModelConstants.IS_PART_OF -> OAFRelations(
+ ModelConstants.IS_PART_OF,
+ ModelConstants.HAS_PART,
+ ModelConstants.PART
+ ),
+ ModelConstants.IS_VERSION_OF -> OAFRelations(
+ ModelConstants.IS_VERSION_OF,
+ ModelConstants.HAS_VERSION,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.HAS_VERSION -> OAFRelations(
+ ModelConstants.HAS_VERSION,
+ ModelConstants.IS_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
+ ModelConstants.IS_IDENTICAL_TO,
+ ModelConstants.IS_IDENTICAL_TO,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_CONTINUED_BY -> OAFRelations(
+ ModelConstants.IS_CONTINUED_BY,
+ ModelConstants.CONTINUES,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.CONTINUES -> OAFRelations(
+ ModelConstants.CONTINUES,
+ ModelConstants.IS_CONTINUED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
+ ModelConstants.IS_NEW_VERSION_OF,
+ ModelConstants.IS_PREVIOUS_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
+ ModelConstants.IS_PREVIOUS_VERSION_OF,
+ ModelConstants.IS_NEW_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.DOCUMENTS,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.DOCUMENTS -> OAFRelations(
+ ModelConstants.DOCUMENTS,
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_SOURCE_OF -> OAFRelations(
+ ModelConstants.IS_SOURCE_OF,
+ ModelConstants.IS_DERIVED_FROM,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_DERIVED_FROM -> OAFRelations(
+ ModelConstants.IS_DERIVED_FROM,
+ ModelConstants.IS_SOURCE_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.CITES -> OAFRelations(
+ ModelConstants.CITES,
+ ModelConstants.IS_CITED_BY,
+ ModelConstants.CITATION
+ ),
+ ModelConstants.IS_CITED_BY -> OAFRelations(
+ ModelConstants.IS_CITED_BY,
+ ModelConstants.CITES,
+ ModelConstants.CITATION
+ ),
+ ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
+ ModelConstants.IS_VARIANT_FORM_OF,
+ ModelConstants.IS_DERIVED_FROM,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
+ ModelConstants.IS_OBSOLETED_BY,
+ ModelConstants.IS_NEW_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.REVIEWS -> OAFRelations(
+ ModelConstants.REVIEWS,
+ ModelConstants.IS_REVIEWED_BY,
+ ModelConstants.REVIEW
+ ),
+ ModelConstants.IS_REVIEWED_BY -> OAFRelations(
+ ModelConstants.IS_REVIEWED_BY,
+ ModelConstants.REVIEWS,
+ ModelConstants.REVIEW
+ ),
+ ModelConstants.DOCUMENTS -> OAFRelations(
+ ModelConstants.DOCUMENTS,
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.DOCUMENTS,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.COMPILES -> OAFRelations(
+ ModelConstants.COMPILES,
+ ModelConstants.IS_COMPILED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_COMPILED_BY -> OAFRelations(
+ ModelConstants.IS_COMPILED_BY,
+ ModelConstants.COMPILES,
+ ModelConstants.RELATIONSHIP
+ )
+ )
+
+ val datacite_filter: List[String] = {
+ val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
+ require(stream != null)
+ Source.fromInputStream(stream).getLines().toList
+ }
+
+ def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
+ false,
+ null,
+ false,
+ false,
+ ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
+ trust
+ )
+
+ val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
+ "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
+ Locale.ENGLISH
+ )
+
+ val df_it: DateTimeFormatter =
+ DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
+
+ val funder_regex: List[(Pattern, String)] = List(
+ (
+ Pattern.compile(
+ "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
+ Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
+ ),
+ "40|corda__h2020::"
+ ),
+ (
+ Pattern.compile(
+ "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
+ Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
+ ),
+ "40|corda_______::"
+ )
+ )
+
+ val Date_regex: List[Pattern] = List(
+ //Y-M-D
+ Pattern.compile(
+ "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
+ Pattern.MULTILINE
+ ),
+ //M-D-Y
+ Pattern.compile(
+ "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
+ Pattern.MULTILINE
+ ),
+ //D-M-Y
+ Pattern.compile(
+ "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
+ Pattern.MULTILINE
+ ),
+ //Y
+ Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
+ )
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala
new file mode 100644
index 000000000..3461b9198
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala
@@ -0,0 +1,529 @@
+package eu.dnetlib.dhp.bioschema
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
+import eu.dnetlib.dhp.schema.action.AtomicAction
+import eu.dnetlib.dhp.schema.common.ModelConstants
+import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
+import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
+import eu.dnetlib.dhp.utils.DHPUtils
+import org.apache.commons.lang3.StringUtils
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JObject, JString}
+import org.json4s.jackson.JsonMethods.parse
+
+import java.text.SimpleDateFormat
+import java.time.LocalDate
+import java.time.chrono.ThaiBuddhistDate
+import java.time.format.DateTimeFormatter
+import java.util.{Date, Locale}
+import scala.collection.JavaConverters._
+import scala.io.{Codec, Source}
+
+object BioschemaToOAFTransformation {
+
+ val mapper = new ObjectMapper()
+
+ val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
+ false,
+ null,
+ false,
+ false,
+ ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
+ "0.9"
+ )
+
+ val collectedFromMap: Map[String, KeyValue] = {
+ val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|ped_________::changeme",
+ "PED"
+ )
+ PEDCollectedFrom.setDataInfo(DATA_INFO)
+
+ Map(
+ "ped" -> PEDCollectedFrom
+ )
+ }
+
+  /** This method should skip the record if the json contains invalid text
+   * defined in the file datacite_filter
+   *
+   * @param json the native JSON record
+   * @return true if the record should be skipped
+   */
+ def skip_record(json: String): Boolean = {
+ datacite_filter.exists(f => json.contains(f))
+ }
+
+ @deprecated("this method will be removed", "dhp")
+ def toActionSet(item: Oaf): (String, String) = {
+ val mapper = new ObjectMapper()
+
+ item match {
+ case dataset: OafDataset =>
+ val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
+ a.setClazz(classOf[OafDataset])
+ a.setPayload(dataset)
+ (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
+ case publication: Publication =>
+ val a: AtomicAction[Publication] = new AtomicAction[Publication]
+ a.setClazz(classOf[Publication])
+ a.setPayload(publication)
+ (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
+ case software: Software =>
+ val a: AtomicAction[Software] = new AtomicAction[Software]
+ a.setClazz(classOf[Software])
+ a.setPayload(software)
+ (software.getClass.getCanonicalName, mapper.writeValueAsString(a))
+ case orp: OtherResearchProduct =>
+ val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
+ a.setClazz(classOf[OtherResearchProduct])
+ a.setPayload(orp)
+ (orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
+
+ case relation: Relation =>
+ val a: AtomicAction[Relation] = new AtomicAction[Relation]
+ a.setClazz(classOf[Relation])
+ a.setPayload(relation)
+ (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
+ case _ =>
+ null
+ }
+
+ }
+
+ def embargo_end(embargo_end_date: String): Boolean = {
+ val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
+ val td = LocalDate.now()
+ td.isAfter(dt)
+ }
+
+ def extract_date(input: String): Option[String] = {
+ val d = Date_regex
+ .map(pattern => {
+ val matcher = pattern.matcher(input)
+ if (matcher.find())
+ matcher.group(0)
+ else
+ null
+ })
+ .find(s => s != null)
+
+ if (d.isDefined) {
+ val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
+ try {
+ return Some(LocalDate.parse(a_date, df_en).toString)
+ } catch {
+ case _: Throwable =>
+ try {
+ return Some(LocalDate.parse(a_date, df_it).toString)
+ } catch {
+ case _: Throwable =>
+ return None
+ }
+ }
+ }
+ d
+ }
+
+ def fix_thai_date(input: String, format: String): String = {
+ try {
+ val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
+ val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
+ LocalDate.from(d).toString
+ } catch {
+ case _: Throwable => ""
+ }
+ }
+
+ def getTypeQualifier(
+ resourceType: String,
+ resourceTypeGeneral: String,
+ schemaOrg: String,
+ vocabularies: VocabularyGroup
+ ): (Qualifier, Qualifier) = {
+ if (resourceType != null && resourceType.nonEmpty) {
+ val typeQualifier =
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
+ if (typeQualifier != null)
+ return (
+ typeQualifier,
+ vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ typeQualifier.getClassid
+ )
+ )
+ }
+ if (schemaOrg != null && schemaOrg.nonEmpty) {
+ val typeQualifier =
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
+ if (typeQualifier != null)
+ return (
+ typeQualifier,
+ vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ typeQualifier.getClassid
+ )
+ )
+
+ }
+ if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
+ val typeQualifier = vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ resourceTypeGeneral
+ )
+ if (typeQualifier != null)
+ return (
+ typeQualifier,
+ vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ typeQualifier.getClassid
+ )
+ )
+
+ }
+ null
+ }
+
+ def getResult(
+ resourceType: String,
+ resourceTypeGeneral: String,
+ schemaOrg: String,
+ vocabularies: VocabularyGroup
+ ): Result = {
+ val typeQualifiers: (Qualifier, Qualifier) =
+ getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
+ if (typeQualifiers == null)
+ return null
+ val i = new Instance
+ i.setInstancetype(typeQualifiers._1)
+ typeQualifiers._2.getClassname match {
+ case "dataset" =>
+ val r = new OafDataset
+ r.setInstance(List(i).asJava)
+ return r
+ case "publication" =>
+ val r = new Publication
+ r.setInstance(List(i).asJava)
+ return r
+ case "software" =>
+ val r = new Software
+ r.setInstance(List(i).asJava)
+ return r
+ case "other" =>
+ val r = new OtherResearchProduct
+ r.setInstance(List(i).asJava)
+ return r
+ }
+ null
+ }
+
+ def available_date(input: String): Boolean = {
+
+ implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+ lazy val json: org.json4s.JValue = parse(input)
+ val l: List[String] = for {
+ JObject(dates) <- json \\ "dates"
+ JField("dateType", JString(dateTypes)) <- dates
+ } yield dateTypes
+
+ l.exists(p => p.equalsIgnoreCase("available"))
+
+ }
+
+ def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
+ val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
+ s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
+ }
+
+ def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
+ OafMapperUtils.structuredProperty(dt, q, null)
+ }
+
+ def generateRelation(
+ sourceId: String,
+ targetId: String,
+ relClass: String,
+ cf: KeyValue,
+ di: DataInfo
+ ): Relation = {
+
+ val r = new Relation
+ r.setSource(sourceId)
+ r.setTarget(targetId)
+ r.setRelType(ModelConstants.RESULT_PROJECT)
+ r.setRelClass(relClass)
+ r.setSubRelType(ModelConstants.OUTCOME)
+ r.setCollectedfrom(List(cf).asJava)
+ r.setDataInfo(di)
+ r
+
+ }
+
+ def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
+ val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
+
+ if (match_pattern.isDefined) {
+ val m = match_pattern.get._1
+ val p = match_pattern.get._2
+ val grantId = m.matcher(awardUri).replaceAll("$2")
+ val targetId = s"$p${DHPUtils.md5(grantId)}"
+ List(generateRelation(sourceId, targetId, "isProducedBy", collectedFromMap("ped"), dataInfo))
+ } else
+ List()
+
+ }
+
+ def generateOAF(
+ input: String,
+ ts: Long,
+ dateOfCollection: Long,
+ vocabularies: VocabularyGroup,
+ exportLinks: Boolean
+ ): List[Oaf] = {
+ if (skip_record(input))
+ return List()
+
+ implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+ lazy val json = parse(input)
+
+ val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
+ val resourceTypeGeneral =
+ (json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
+ val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)
+
+ //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
+ val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
+ if (result == null)
+ return List()
+
+ val pid = (json \ "id").extract[String]
+
+ result.setPid(
+ List(
+ OafMapperUtils.structuredProperty(
+ pid,
+ "ped",
+ "ped",
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ DATA_INFO
+ )
+ ).asJava
+ )
+ result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
+ result.setOriginalId(List(pid).asJava)
+
+ result.setDataInfo(dataInfo)
+
+ val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
+
+ val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
+
+ result.setTitle(
+ titles
+ .filter(t => t.title.nonEmpty)
+ .map(t => {
+ if (t.titleType.isEmpty) {
+ OafMapperUtils
+ .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
+ } else {
+ OafMapperUtils.structuredProperty(
+ t.title.get,
+ t.titleType.get,
+ t.titleType.get,
+ ModelConstants.DNET_DATACITE_TITLE,
+ ModelConstants.DNET_DATACITE_TITLE,
+ null
+ )
+ }
+ })
+ .asJava
+ )
+
+ val dates = (json \\ "dates").extract[List[DateType]]
+ val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
+
+ val i_date = dates
+ .filter(d => d.date.isDefined && d.dateType.isDefined)
+ .find(d => d.dateType.get.equalsIgnoreCase("issued"))
+ .map(d => extract_date(d.date.get))
+ val a_date: Option[String] = dates
+ .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
+ .map(d => extract_date(d.date.get))
+ .find(d => d != null && d.isDefined)
+ .map(d => d.get)
+
+ if (a_date.isDefined) {
+ result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
+ }
+ if (i_date.isDefined && i_date.get.isDefined) {
+ result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
+ result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
+ } else if (publication_year != null) {
+ result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+ result
+ .getInstance()
+ .get(0)
+ .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+ }
+
+ result.setRelevantdate(
+ dates
+ .filter(d => d.date.isDefined && d.dateType.isDefined)
+ .map(d => (extract_date(d.date.get), d.dateType.get))
+ .filter(d => d._1.isDefined)
+ .map(d =>
+ (
+ d._1.get,
+ vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
+ )
+ )
+ .filter(d => d._2 != null)
+ .map(d => generateOAFDate(d._1, d._2))
+ .asJava
+ )
+
+ result.setCollectedfrom(List(collectedFromMap("ped")).asJava)
+
+ val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
+
+ result.setDescription(
+ descriptions
+ .filter(d => d.description.isDefined)
+ .map(d => OafMapperUtils.field(d.description.get, null))
+ .filter(s => s != null)
+ .asJava
+ )
+
+ val publisher = (json \\ "publisher").extractOrElse[String](null)
+ if (publisher != null)
+ result.setPublisher(OafMapperUtils.field(publisher, null))
+
+ val language: String = (json \\ "language").extractOrElse[String](null)
+
+ if (language != null)
+ result.setLanguage(
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
+ )
+
+ val instance = result.getInstance().get(0)
+
+ val accessRights: List[String] = for {
+ JObject(rightsList) <- json \\ "rightsList"
+ JField("rightsUri", JString(rightsUri)) <- rightsList
+ } yield rightsUri
+
+ val aRights: Option[AccessRight] = accessRights
+ .map(r => {
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
+ })
+ .find(q => q != null)
+ .map(q => {
+ val a = new AccessRight
+ a.setClassid(q.getClassid)
+ a.setClassname(q.getClassname)
+ a.setSchemeid(q.getSchemeid)
+ a.setSchemename(q.getSchemename)
+ a
+ })
+
+ val access_rights_qualifier =
+ if (aRights.isDefined) aRights.get
+ else
+ OafMapperUtils.accessRight(
+ ModelConstants.UNKNOWN,
+ ModelConstants.NOT_AVAILABLE,
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
+
+ instance.setCollectedfrom(collectedFromMap("ped"))
+ instance.setUrl(List(s"https://proteinensemble.org/$pid").asJava)
+ instance.setAccessright(access_rights_qualifier)
+ instance.setPid(result.getPid)
+ val license = accessRights
+ .find(r =>
+ r.startsWith("http") && r.matches(
+ ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
+ )
+ )
+ if (license.isDefined)
+ instance.setLicense(OafMapperUtils.field(license.get, null))
+
+ val awardUris: List[String] = for {
+ JObject(fundingReferences) <- json \\ "fundingReferences"
+ JField("awardUri", JString(awardUri)) <- fundingReferences
+ } yield awardUri
+
+ result.setId(IdentifierFactory.createIdentifier(result))
+ var relations: List[Relation] =
+ awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
+
+ if (result.getId == null)
+ return List()
+
+ if (exportLinks) {
+ val rels: List[RelatedIdentifierType] = for {
+ JObject(relIdentifier) <- json \\ "relatedIdentifiers"
+ JField("relationType", JString(relationType)) <- relIdentifier
+ JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
+ JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
+ } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
+
+ relations = relations ::: generateRelations(
+ rels,
+ result.getId,
+ if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
+ )
+ }
+ if (relations != null && relations.nonEmpty) {
+ List(result) ::: relations
+ } else
+ List(result)
+ }
+
+ private def generateRelations(
+ rels: List[RelatedIdentifierType],
+ id: String,
+ date: String
+ ): List[Relation] = {
+ rels
+ .filter(r =>
+ subRelTypeMapping
+ .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
+ r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
+ r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
+ )
+ .map(r => {
+ val rel = new Relation
+ rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
+ rel.setDataInfo(dataInfo)
+
+ val subRelType = subRelTypeMapping(r.relationType).relType
+ rel.setRelType(REL_TYPE_VALUE)
+ rel.setSubRelType(subRelType)
+ rel.setRelClass(r.relationType)
+
+ val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
+
+ rel.setProperties(List(dateProps).asJava)
+
+ rel.setSource(id)
+ rel.setTarget(
+ DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
+ )
+ rel.setCollectedfrom(List(collectedFromMap("ped") ).asJava)
+ rel.getCollectedfrom.asScala.map(c => c.getValue).toList
+ rel
+ })
+ }
+
+ def generateDSId(input: String): String = {
+ val b = StringUtils.substringBefore(input, "::")
+ val a = StringUtils.substringAfter(input, "::")
+ s"10|$b::${DHPUtils.md5(a)}"
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json
new file mode 100644
index 000000000..cc8e5a714
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/bioschema/ped_record.json
@@ -0,0 +1,41 @@
+{
+ "id": "PED00001#P38634_A_1",
+ "types": {
+ "resourceType": "Protein",
+ "resourceTypeGeneral": "Dataset"
+ },
+ "creators": [],
+ "identifiers": [
+ {
+ "identifier": "https://proteinensemble.org/PED00001#P38634_A_1",
+ "identifierType": "URL"
+ }
+ ],
+ "relatedIdentifiers": [
+ {
+ "relationType": "CitedBy",
+ "relatedIdentifier": "https://identifiers.org/pubmed:20399186"
+ },
+ {
+ "relationType": "IsIdenticalTo",
+ "relatedIdentifier": "http://purl.uniprot.org/uniprot/P38634"
+ }
+ ],
+ "alternateIdentifiers": [
+ {
+ "alternateIdentifier": "https://identifiers.org/uniprot:P38634"
+ }
+ ],
+ "descriptions": [],
+ "titles": [
+ {
+ "title": "Protein SIC1"
+ }
+ ],
+ "dates": [
+ {
+ "date": "2021-12-09T21:10:30",
+ "dateType": "Collected"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala
new file mode 100644
index 000000000..f35749fa4
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/bioschema/BioschemaDataciteToOAFTest.scala
@@ -0,0 +1,108 @@
+package eu.dnetlib.dhp.bioschema
+
+import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
+import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
+//import eu.dnetlib.dhp.bioschema.{BioschemaToOAFTransformation, GenerateDataciteDatasetSpark}
+import eu.dnetlib.dhp.bioschema.BioschemaToOAFTransformation
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import org.apache.commons.io.FileUtils
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.functions.{col, count}
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.junit.jupiter.api.Assertions._
+import org.junit.jupiter.api.extension.ExtendWith
+import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
+import org.mockito.junit.jupiter.MockitoExtension
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.nio.file.{Files, Path}
+import java.text.SimpleDateFormat
+import java.util.Locale
+import scala.io.Source
+
+@ExtendWith(Array(classOf[MockitoExtension]))
+class BioschemaDataciteToOAFTest extends AbstractVocabularyTest {
+
+ private var workingDir: Path = null
+ val log: Logger = LoggerFactory.getLogger(getClass)
+
+ @BeforeEach
+ def setUp(): Unit = {
+
+ workingDir = Files.createTempDirectory(getClass.getSimpleName)
+ super.setUpVocabulary()
+ }
+
+ @AfterEach
+ def tearDown(): Unit = {
+ FileUtils.deleteDirectory(workingDir.toFile)
+ }
+
+ @Test
+ def testDateMapping: Unit = {
+ val inputDate = "2021-07-14T11:52:54+0000"
+ val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
+ val dt = ISO8601FORMAT.parse(inputDate)
+ println(dt.getTime)
+
+ }
+
+// @Test
+// def testConvert(): Unit = {
+//
+// val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
+//
+// val conf = new SparkConf()
+// val spark: SparkSession = SparkSession
+// .builder()
+// .config(conf)
+// .appName(getClass.getSimpleName)
+// .master("local[*]")
+// .getOrCreate()
+//
+// implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+// val instance = new GenerateDataciteDatasetSpark(null, null, log)
+// val targetPath = s"$workingDir/result"
+//
+// instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark)
+//
+// import spark.implicits._
+//
+// val nativeSize = spark.read.load(path).count()
+//
+// assertEquals(100, nativeSize)
+//
+// val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
+//
+// result
+// .map(s => s.getClass.getSimpleName)
+// .groupBy(col("value").alias("class"))
+// .agg(count("value").alias("Total"))
+// .show(false)
+//
+// val t = spark.read.load(targetPath).count()
+//
+// assertTrue(t > 0)
+//
+// spark.stop()
+//
+// }
+
+ @Test
+ def testMapping(): Unit = {
+ val record = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/bioschema/ped_record.json"))
+ .mkString
+
+ val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
+ val res: List[Oaf] = BioschemaToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
+
+ res.foreach(r => {
+ println(mapper.writeValueAsString(r))
+ println("----------------------------")
+
+ })
+
+ }
+
+}
diff --git a/dhp-workflows/dhp-bmuse/sitemap.txt b/dhp-workflows/dhp-bmuse/sitemap.txt
new file mode 100644
index 000000000..d8ed5ebe1
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/sitemap.txt
@@ -0,0 +1,62 @@
+https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612
+
+PED
+
+ workingPath
+ /data/bioschema/ped/
+ the working path
+
+
+ sitemapUrl
+ https://proteinensemble.org/sitemap2.xml.gz
+
+
+ sitemapURLKey
+ loc
+
+
+ dynamic
+ true
+            the dynamic boolean determines whether the scraper should use Selenium or JSoup to scrape the information (dynamic and static scraping, respectively)
+
+
+DISPROT
+
+ workingPath
+ /data/bioschema/disprot/
+ the working path
+
+
+ sitemapUrl
+ https://disprot.org/sitemap2.xml.gz
+
+
+ sitemapURLKey
+ loc
+
+
+ dynamic
+ true
+            the dynamic boolean determines whether the scraper should use Selenium or JSoup to scrape the information (dynamic and static scraping, respectively)
+
+
+MOBIDB
+
+ workingPath
+ /data/bioschema/mobidb/
+ the working path
+
+
+ sitemapUrl
+ https://mobidb.org/sitemap2.xml.gz
+
+
+ sitemapURLKey
+ loc
+
+
+ dynamic
+ true
+            the dynamic boolean determines whether the scraper should use Selenium or JSoup to scrape the information (dynamic and static scraping, respectively)
+
+