package eu.dnetlib.dhp.bioschema

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.bioschema.BioschemaModelConstants._
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

import java.time.LocalDate
import scala.collection.JavaConverters._

/** Maps a JSON record (with `types`, `id`, `titles`, `dates`, `descriptions`,
  * `identifiers`, `alternateIdentifiers` and `relatedIdentifiers` fields) into
  * OAF entities and relations collected from the "ped" datasource.
  *
  * Several names used here (`dataInfo`, `Date_regex`, `df_en`, `df_it`,
  * `subRelTypeMapping`, `REL_TYPE_VALUE`, `DATE_RELATION_KEY` and the `*Type`
  * case classes) are not defined in this file; they are presumably provided by
  * the `BioschemaModelConstants._` import.
  */
object BioschemaToOAFTransformation {

  /** Jackson mapper exposed to callers; not used inside this object. */
  val mapper = new ObjectMapper()

  /** Data-info attached to the PID and to the collected-from entry created here.
    * The literal "0.9" is presumably the trust level — confirm against
    * `OafMapperUtils.dataInfo`.
    */
  val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    "0.9"
  )

  /** Identifier prefix (the part before `:`) -> base URL used to build relation targets. */
  val resolvedURL: Map[String, String] = Map(
    "uniprot" -> "https://www.uniprot.org/uniprot/",
    "pubmed"  -> "https://pubmed.ncbi.nlm.nih.gov/"
  )

  /** Datasource key -> collected-from entry; only "ped" is registered. */
  val collectedFromMap: Map[String, KeyValue] = {
    val PEDCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      //TODO create pedDatasourceId and update this value
      "10|ped_________::pedDatasourceId",
      "Protein Ensemble Database"
    )
    PEDCollectedFrom.setDataInfo(DATA_INFO)

    Map(
      "ped" -> PEDCollectedFrom
    )
  }

  /** Extracts the first date recognised by `Date_regex` from `input` and
    * normalises it to the `LocalDate.toString` (ISO) form.
    *
    * A 4-character match is treated as a bare year and expanded to `01-01-<year>`.
    * Parsing is attempted with `df_en` first and `df_it` second.
    *
    * @param input free-text date string
    * @return the normalised date, or `None` when nothing matches or nothing parses
    */
  def extract_date(input: String): Option[String] = {
    val d = Date_regex
      .map(pattern => {
        val matcher = pattern.matcher(input)
        if (matcher.find())
          matcher.group(0)
        else
          null
      })
      .find(s => s != null)

    d.flatMap { matched =>
      val a_date = if (matched.length == 4) s"01-01-$matched" else matched
      // Try only traps non-fatal errors, so OutOfMemoryError and friends still propagate.
      scala.util
        .Try(LocalDate.parse(a_date, df_en).toString)
        .orElse(scala.util.Try(LocalDate.parse(a_date, df_it).toString))
        .toOption
    }
  }

  /** Resolves the instance-type qualifier and the matching result-typology qualifier.
    *
    * Candidates are tried in the order `resourceType`, `schemaOrg`,
    * `resourceTypeGeneral`; null or empty candidates are skipped and the first one
    * with a synonym in `dnet:publication_resource` wins.
    *
    * @return `(instance type, result typology)`, or `null` when no candidate resolves.
    *         The second element is whatever the typology lookup returns and may be null.
    */
  def getTypeQualifier(
    resourceType: String,
    resourceTypeGeneral: String,
    schemaOrg: String,
    vocabularies: VocabularyGroup
  ): (Qualifier, Qualifier) = {
    // The iterator keeps the lookup lazy: later candidates are not queried once one resolves.
    List(resourceType, schemaOrg, resourceTypeGeneral).iterator
      .filter(candidate => candidate != null && candidate.nonEmpty)
      .map(candidate =>
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, candidate)
      )
      .find(typeQualifier => typeQualifier != null)
      .map(typeQualifier =>
        (
          typeQualifier,
          vocabularies.getSynonymAsQualifier(
            ModelConstants.DNET_RESULT_TYPOLOGIES,
            typeQualifier.getClassid
          )
        )
      )
      .orNull
  }

  /** Creates an empty OAF result of the class selected by the resolved typology
    * ("dataset", "publication", "software" or "other"), carrying a single instance
    * whose type is fixed to "0046" / "Bioentity".
    *
    * @return the new result, or `null` when the type cannot be resolved, the typology
    *         qualifier is missing, or its class name is not one of the four above
    */
  def getResult(
    resourceType: String,
    resourceTypeGeneral: String,
    schemaOrg: String,
    vocabularies: VocabularyGroup
  ): Result = {
    val typeQualifiers: (Qualifier, Qualifier) =
      getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)

    if (typeQualifiers == null || typeQualifiers._2 == null) {
      null
    } else {
      val result: Result = typeQualifiers._2.getClassname match {
        case "dataset"     => new OafDataset
        case "publication" => new Publication
        case "software"    => new Software
        case "other"       => new OtherResearchProduct
        // Unknown (or null) typology: report "no result" instead of throwing MatchError.
        case _ => null
      }

      if (result != null) {
        val i = new Instance
        // The instance type is deliberately fixed rather than taken from the vocabulary:
        // i.setInstancetype(typeQualifiers._1)
        i.setInstancetype(
          OafMapperUtils.qualifier(
            "0046",
            "Bioentity",
            ModelConstants.DNET_PUBLICATION_RESOURCE,
            ModelConstants.DNET_PUBLICATION_RESOURCE
          )
        )
        result.setInstance(List(i).asJava)
      }
      result
    }
  }

  /** Tells whether the JSON record contains, under any `dates` object, a `dateType`
    * string equal (case-insensitively) to "available".
    *
    * @param input the JSON record as a string
    */
  def available_date(input: String): Boolean = {

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val l: List[String] = for {
      JObject(dates)                         <- json \\ "dates"
      JField("dateType", JString(dateTypes)) <- dates
    } yield dateTypes

    l.exists(p => p.equalsIgnoreCase("available"))
  }

  /** Builds an identifier of the form `<idPrefix>|<pidtype>` right-padded with `_`
    * to 15 characters, followed by `::` and the MD5 of the lower-cased `pid`.
    */
  def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
    val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
    s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
  }

  /** Wraps a date string and its qualifier into a structured property without data-info. */
  def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
    OafMapperUtils.structuredProperty(dt, q, null)
  }

  /** Creates a relation between `sourceId` and `targetId` with the given class.
    * The relation type and sub-type are fixed to `RESULT_PROJECT` and `OUTCOME`.
    */
  def generateRelation(
    sourceId: String,
    targetId: String,
    relClass: String,
    cf: KeyValue,
    di: DataInfo
  ): Relation = {
    val r = new Relation
    r.setSource(sourceId)
    r.setTarget(targetId)
    r.setRelType(ModelConstants.RESULT_PROJECT)
    r.setRelClass(relClass)
    r.setSubRelType(ModelConstants.OUTCOME)
    r.setCollectedfrom(List(cf).asJava)
    r.setDataInfo(di)
    r
  }

  /** Finds the first key of [[resolvedURL]] that occurs in `identifier` followed by `:`. */
  private def findResolvablePrefix(identifier: String): Option[String] =
    resolvedURL.collectFirst {
      case (prefix, _) if identifier.contains(s"$prefix:") => prefix
    }

  /** Transforms one JSON record into OAF objects.
    *
    * @param input            the JSON record
    * @param ts               not used by this implementation
    * @param dateOfCollection not used by this implementation
    * @param vocabularies     vocabularies used to resolve types, dates and language
    * @param exportLinks      when true, relations built from `relatedIdentifiers` are appended
    * @return the result followed by its relations; an empty list when the record type
    *         cannot be mapped or no identifier can be created for the result
    */
  def generateOAF(
    input: String,
    ts: Long,
    dateOfCollection: Long,
    vocabularies: VocabularyGroup,
    exportLinks: Boolean
  ): List[Oaf] = {

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val resourceType = (json \ "types" \ "resourceType").extractOrElse[String](null)
    val resourceTypeGeneral = (json \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
    val schemaOrg = (json \ "types" \ "schemaOrg").extractOrElse[String](null)

    //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
    val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (result == null)
      return List()

    val pid = (json \ "id").extract[String]

    result.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pid,
          "ped",
          "ped",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )
    // Provisional id; it is replaced by IdentifierFactory.createIdentifier further down.
    result.setId(OafMapperUtils.createOpenaireId(50, s"ped_________::$pid", true))
    result.setOriginalId(List(pid).asJava)

    result.setDataInfo(dataInfo)

    val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())

    result.setTitle(
      titles
        .filter(t => t.title.nonEmpty)
        .map(t => {
          OafMapperUtils
            .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)
        })
        .asJava
    )

    val dates = (json \\ "dates").extract[List[DateType]]
    val publication_year = (json \\ "publicationYear").extractOrElse[String](null)

    // Only the first "issued" entry is considered, even if its date cannot be parsed.
    val i_date: Option[String] = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined)
      .find(d => d.dateType.get.equalsIgnoreCase("issued"))
      .flatMap(d => extract_date(d.date.get))

    // First "available" entry whose date can be parsed.
    val a_date: Option[String] = dates
      .filter(d =>
        d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available")
      )
      .map(d => extract_date(d.date.get))
      .collectFirst { case Some(date) => date }

    a_date.foreach(date => result.setEmbargoenddate(OafMapperUtils.field(date, null)))

    i_date match {
      case Some(date) =>
        result.setDateofacceptance(OafMapperUtils.field(date, null))
        result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(date, null))
      case None if publication_year != null =>
        // Fallback: first of January of the publication year.
        result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
        result
          .getInstance()
          .get(0)
          .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
      case _ =>
    }

    result.setRelevantdate(
      dates
        .filter(d => d.date.isDefined && d.dateType.isDefined)
        .map(d => (extract_date(d.date.get), d.dateType.get))
        .filter(d => d._1.isDefined)
        .map(d =>
          (
            d._1.get,
            vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
          )
        )
        .filter(d => d._2 != null)
        .map(d => generateOAFDate(d._1, d._2))
        .asJava
    )

    result.setCollectedfrom(List(collectedFromMap("ped")).asJava)

    val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]

    result.setDescription(
      descriptions
        .filter(d => d.description.isDefined)
        .map(d => OafMapperUtils.field(d.description.get, null))
        .filter(s => s != null)
        .asJava
    )

    val publisher = (json \\ "publisher").extractOrElse[String](null)
    if (publisher != null)
      result.setPublisher(OafMapperUtils.field(publisher, null))

    val language: String = (json \\ "language").extractOrElse[String](null)
    if (language != null)
      result.setLanguage(
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
      )

    val instance = result.getInstance().get(0)

    val ids: List[IdentifierType] = for {
      JObject(identifiers)                              <- json \\ "identifiers"
      JField("identifier", JString(identifier))         <- identifiers
      JField("identifierType", JString(identifierType)) <- identifiers
    } yield IdentifierType(identifier, identifierType)

    // Every identifier value is published as an instance URL.
    instance.setUrl(ids.map(id => id.identifier).asJava)
    instance.setCollectedfrom(collectedFromMap("ped"))
    instance.setPid(result.getPid)

    result.setId(IdentifierFactory.createIdentifier(result))
    if (result.getId == null)
      return List()

    // Keep only alternate identifiers carrying a known prefix and strip that prefix.
    // Identifiers without a known prefix are skipped instead of failing the whole record.
    val alternateIdentifierValues: List[String] = for {
      JObject(alternateIdentifiers)                               <- json \\ "alternateIdentifiers"
      JField("alternateIdentifier", JString(alternateIdentifier)) <- alternateIdentifiers
      prefix                                                      <- findResolvablePrefix(alternateIdentifier).toList
    } yield StringUtils.substringAfter(alternateIdentifier, s"$prefix:")

    // NOTE(review): the pid type is always "uniprot", even for "pubmed:"-prefixed values — confirm.
    val alternateIdentifierProps: List[StructuredProperty] = alternateIdentifierValues.map(id =>
      OafMapperUtils.structuredProperty(
        id,
        "uniprot",
        "uniprot",
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES,
        dataInfo
      )
    )
    // Set once with the full list so that earlier identifiers are not overwritten.
    if (alternateIdentifierProps.nonEmpty)
      instance.setAlternateIdentifier(alternateIdentifierProps.asJava)

    val relations: List[Relation] =
      if (exportLinks) {
        val rels: List[RelatedIdentifierType] = for {
          JObject(relIdentifier)                                          <- json \\ "relatedIdentifiers"
          JField("relationType", JString(relationType))                   <- relIdentifier
          JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
          JField("relatedIdentifier", JString(relatedIdentifier))         <- relIdentifier
        } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)

        generateRelations(rels, result.getId, i_date.orNull, pid)
      } else List()

    List(result) ::: relations
  }

  /** Builds one relation per related identifier, with `id` as source.
    *
    * The target is `<base URL><local id>` when the related identifier carries a prefix
    * known to [[resolvedURL]], otherwise an unresolved identifier produced by
    * `DHPUtils.generateUnresolvedIdentifier`.
    *
    * @param rels related identifiers read from the record
    * @param id   identifier of the source result
    * @param date value stored under `DATE_RELATION_KEY`; may be null
    * @param pid  not used by this implementation
    */
  private def generateRelations(
    rels: List[RelatedIdentifierType],
    id: String,
    date: String,
    pid: String
  ): List[Relation] = {
    rels
      .map(r => {
        val rel = new Relation
        rel.setCollectedfrom(List(collectedFromMap("ped")).asJava)
        rel.setDataInfo(dataInfo)

        // NOTE(review): presumably fails for relation types missing from subRelTypeMapping — confirm.
        val subRelType = subRelTypeMapping(r.relationType).relType
        rel.setRelType(REL_TYPE_VALUE)
        rel.setSubRelType(subRelType)
        rel.setRelClass(r.relationType)

        val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
        rel.setProperties(List(dateProps).asJava)

        findResolvablePrefix(r.relatedIdentifier) match {
          case Some(prefix) =>
            val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"$prefix:")
            rel.setTarget(s"${resolvedURL(prefix)}${relatedId}")
          case None =>
            rel.setTarget(
              DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
            )
        }

        rel.setSource(id)
        rel
      })
  }

  /** Builds a datasource identifier `10|<prefix>::<md5(suffix)>`, where prefix and
    * suffix are the parts of `input` before and after the first `::`.
    */
  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|$b::${DHPUtils.md5(a)}"
  }

}