package eu.dnetlib.dhp.sx.bio

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._

/** Converters that turn records coming from several bio databases
  * (UniProt, PDB, EBI links, Crossref/Scholix links) into OAF `Dataset`
  * and `Relation` objects.
  */
object BioDBToOAF {

  /** A publication id paired with the compact JSON serialization of its links. */
  case class EBILinkItem(id: Long, links: String) {}

  /** One link extracted by [[parse_ebi_links]]. */
  case class EBILinks(
    relType: String,
    date: String,
    title: String,
    pmid: String,
    targetPid: String,
    targetPidType: String,
    targetUrl: String
  ) {}

  /** A date found in a UniProt record together with its free-text description. */
  case class UniprotDate(date: String, date_info: String) {}

  /** Input of [[scholixResolvedToOAF]]. The field name `tilte` is kept as is because it is part of the public shape. */
  case class ScholixResolved(
    pid: String,
    pidType: String,
    typology: String,
    tilte: List[String],
    datasource: List[String],
    date: List[String],
    authors: List[String]
  ) {}

  /** DataInfo attached to every entity, relation and property created in this object. */
  val DATA_INFO: DataInfo =
    OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")

  val SUBJ_CLASS = "Keywords"

  /** Key of the relation property that carries the relation date (see [[createRelation]]). */
  val DATE_RELATION_KEY = "RelationDate"

  /** URL prefixes, keyed by lower-case pid type, to which a pid is appended to obtain a landing URL.
    *
    * NOTE(review): the key "onim" maps to an omim.org URL and looks like a typo for "omim";
    * left untouched because the pid types present in the incoming data are not visible here.
    */
  val resolvedURL: Map[String, String] = Map(
    "genbank"            -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-n"             -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-wgs"           -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-p"             -> "https://www.ncbi.nlm.nih.gov/protein/",
    "ena"                -> "https://www.ebi.ac.uk/ena/browser/view/",
    "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
    "onim"               -> "https://omim.org/entry/",
    "refseq"             -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "geo"                -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
  )

  /** Known "collected from" datasources, each one tagged with [[DATA_INFO]].
    *
    * The keys mix short codes ("uniprot", "pdb", "elsevier", "ebi") and full datasource names;
    * [[scholixResolvedToOAF]] looks entries up by the datasource name found in its input.
    */
  val collectedFromMap: Map[String, KeyValue] = {
    val PDBCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
    val enaCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
    val ncbiCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
    val UNIPROTCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
    val ElsevierCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
    val springerNatureCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
    // NOTE(review): the hash part of this identifier is 33 characters long while all the
    // others are 32 — confirm the id against the datasource registry.
    val EBICollectedFrom: KeyValue =
      OafMapperUtils.keyValue(
        "10|opendoar____::83e60e09c222f206c725385f53d7e567c",
        "EMBL-EBIs Protein Data Bank in Europe (PDBe)"
      )
    val pubmedCollectedFrom: KeyValue =
      OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")

    List(
      UNIPROTCollectedFrom,
      PDBCollectedFrom,
      ElsevierCollectedFrom,
      EBICollectedFrom,
      pubmedCollectedFrom,
      enaCollectedFrom,
      ncbiCollectedFrom,
      springerNatureCollectedFrom
    ).foreach(_.setDataInfo(DATA_INFO))

    Map(
      "uniprot"                     -> UNIPROTCollectedFrom,
      "pdb"                         -> PDBCollectedFrom,
      "elsevier"                    -> ElsevierCollectedFrom,
      "ebi"                         -> EBICollectedFrom,
      "Springer Nature"             -> springerNatureCollectedFrom,
      "NCBI Nucleotide"             -> ncbiCollectedFrom,
      "European Nucleotide Archive" -> enaCollectedFrom,
      "Europe PMC"                  -> pubmedCollectedFrom
    )
  }

  /** Converts a JSON link record (Source / Target / RelationshipType / LinkedPublicationDate)
    * into a relation collected from "elsevier".
    *
    * The relation source is the unresolved id of the record's Source identifier and the
    * relation target is the unresolved id of the record's Target identifier.
    *
    * @param input the JSON record; the extracted fields are mandatory (json4s `extract` fails otherwise)
    * @return the generated [[Relation]]
    */
  def crossrefLinksToOaf(input: String): Oaf = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val source = json \ "Source" \ "Identifier"
    val target = json \ "Target" \ "Identifier"

    val source_pid = (source \ "ID").extract[String].toLowerCase
    val source_pid_type = (source \ "IDScheme").extract[String].toLowerCase
    val target_pid = (target \ "ID").extract[String].toLowerCase
    val target_pid_type = (target \ "IDScheme").extract[String].toLowerCase

    val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
    val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])

    createRelation(
      target_pid,
      target_pid_type,
      generate_unresolved_id(source_pid, source_pid_type),
      collectedFromMap("elsevier"),
      "relationship",
      relation_semantic,
      date
    )
  }

  /** Builds a `Dataset` out of a resolved Scholix item.
    *
    * @param input the resolved item
    * @return the dataset, or `null` when `input.datasource` is null or empty
    * @throws NoSuchElementException when the first datasource name is not a key of [[collectedFromMap]]
    */
  def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
    if (input.datasource == null || input.datasource.isEmpty) null
    else {
      val collectedFrom = collectedFromMap(input.datasource.head)
      val pid = input.pid.toLowerCase
      val pidType = input.pidType.toLowerCase

      val d = new Dataset
      d.setPid(
        List(
          OafMapperUtils.structuredProperty(
            pid,
            pidType,
            pidType,
            ModelConstants.DNET_PID_TYPES,
            ModelConstants.DNET_PID_TYPES,
            DATA_INFO
          )
        ).asJava
      )
      d.setDataInfo(DATA_INFO)

      val nsPrefix = pidType.padTo(12, '_')
      d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::$pid", true))

      Option(input.tilte)
        .flatMap(_.headOption)
        .foreach(t =>
          d.setTitle(
            List(OafMapperUtils.structuredProperty(t, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava
          )
        )
      d.setOriginalId(List(input.pid).asJava)

      val i = new Instance
      i.setPid(d.getPid)

      // resolvedURL keys are lower case: look the type up lower-cased, consistently with the
      // case-insensitive handling of pidType everywhere else in this method.
      resolvedURL
        .get(pidType)
        .foreach(prefix => i.setUrl(List(s"$prefix${input.pid}").asJava))

      val instanceType =
        if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
          OafMapperUtils.qualifier(
            "0037",
            "Clinical Trial",
            ModelConstants.DNET_PUBLICATION_RESOURCE,
            ModelConstants.DNET_PUBLICATION_RESOURCE
          )
        else
          OafMapperUtils.qualifier(
            "0046",
            "Bioentity",
            ModelConstants.DNET_PUBLICATION_RESOURCE,
            ModelConstants.DNET_PUBLICATION_RESOURCE
          )
      i.setInstancetype(instanceType)

      d.setCollectedfrom(List(collectedFrom).asJava)
      i.setCollectedfrom(collectedFrom)
      d.setInstance(List(i).asJava)

      if (input.authors != null && input.authors.nonEmpty) {
        val authors = input.authors.map { fullname =>
          val author = new Author
          author.setFullname(fullname)
          author
        }
        d.setAuthor(authors.asJava)
      }

      Option(input.date).flatMap(_.headOption).foreach { dt =>
        val cleaned = GraphCleaningFunctions.cleanDate(dt)
        i.setDateofacceptance(OafMapperUtils.field(cleaned, DATA_INFO))
        d.setDateofacceptance(OafMapperUtils.field(cleaned, DATA_INFO))
      }

      d
    }
  }

  /** Converts a UniProt JSON record into a `Dataset` plus, when the record lists references,
    * one relation towards the first PubMed reference (or, failing that, the first DOI reference).
    *
    * The date whose `date_info` contains "entry version" becomes the date of acceptance and the
    * relation date; every other date is stored as a relevant date.
    *
    * @param input the JSON record; `pid` is mandatory
    * @return `List(dataset)` or `List(dataset, relation)`
    */
  def uniprotToOAF(input: String): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pid = (json \ "pid").extract[String]

    val d = new Dataset
    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pid,
          "uniprot",
          "uniprot",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )
    d.setDataInfo(DATA_INFO)
    d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
    d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)

    val title: String = (json \ "title").extractOrElse[String](null)
    if (title != null)
      d.setTitle(
        List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava
      )
    d.setOriginalId(List(pid).asJava)

    val i = new Instance
    i.setPid(d.getPid)
    i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
    i.setInstancetype(
      OafMapperUtils.qualifier(
        "0046",
        "Bioentity",
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )
    i.setCollectedfrom(collectedFromMap("uniprot"))
    d.setInstance(List(i).asJava)

    val dates: List[UniprotDate] = for {
      JObject(dateOBJ)                        <- json \ "dates"
      JField("date", JString(date))           <- dateOBJ
      JField("date_info", JString(date_info)) <- dateOBJ
    } yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)

    val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
    if (subjects != null) {
      d.setSubject(
        subjects
          .map(s =>
            OafMapperUtils.structuredProperty(
              s,
              SUBJ_CLASS,
              SUBJ_CLASS,
              ModelConstants.DNET_SUBJECT_TYPOLOGIES,
              ModelConstants.DNET_SUBJECT_TYPOLOGIES,
              null
            )
          )
          .asJava
      )
    }

    // The date of acceptance is set only when an "entry version" date exists. The previous
    // code called `.get` on this Option unconditionally and failed with NoSuchElementException
    // for records that carry dates but no "entry version" one.
    val entryVersionDate: Option[UniprotDate] = dates.find(_.date_info.contains("entry version"))
    entryVersionDate.foreach { ev =>
      i.setDateofacceptance(OafMapperUtils.field(ev.date, DATA_INFO))
      d.setDateofacceptance(OafMapperUtils.field(ev.date, DATA_INFO))
    }

    val relevant_dates: List[StructuredProperty] = dates
      .filterNot(_.date_info.contains("entry version"))
      .map(date =>
        OafMapperUtils.structuredProperty(
          date.date,
          ModelConstants.UNKNOWN,
          ModelConstants.UNKNOWN,
          ModelConstants.DNET_DATACITE_DATE,
          ModelConstants.DNET_DATACITE_DATE,
          DATA_INFO
        )
      )
    if (relevant_dates.nonEmpty)
      d.setRelevantdate(relevant_dates.asJava)

    val references_pmid: List[String] = for {
      JObject(reference)                 <- json \ "references"
      JField("PubMed", JString(refPid)) <- reference
    } yield refPid

    // NOTE(review): the field name " DOI" starts with a blank; presumably this mirrors how the
    // upstream records are produced — confirm before normalizing it.
    val references_doi: List[String] = for {
      JObject(reference)               <- json \ "references"
      JField(" DOI", JString(refPid)) <- reference
    } yield refPid

    val relationDate: String = entryVersionDate.map(_.date).orNull

    // PubMed references take precedence over DOI ones; only the first reference is used.
    val firstReference: Option[(String, String)] =
      references_pmid.headOption
        .map(p => (p, "pmid"))
        .orElse(references_doi.headOption.map(p => (p, "doi")))

    firstReference match {
      case Some((refPid, refType)) =>
        val rel = createRelation(
          refPid,
          refType,
          d.getId,
          collectedFromMap("uniprot"),
          ModelConstants.RELATIONSHIP,
          ModelConstants.IS_RELATED_TO,
          relationDate
        )
        List(d, rel)
      case None => List(d)
    }
  }

  /** Builds the placeholder identifier used for entities that still have to be resolved. */
  def generate_unresolved_id(pid: String, pidType: String): String = {
    s"unresolved::$pid::$pidType"
  }

  /** Creates a result-to-result relation from `sourceId` towards the unresolved id of `pid`.
    *
    * @param pid           target persistent identifier
    * @param pidType       type of the target persistent identifier
    * @param sourceId      identifier of the relation source
    * @param collectedFrom datasource the relation is collected from
    * @param subRelType    relation sub type
    * @param relClass      relation class
    * @param date          value stored under [[DATE_RELATION_KEY]] in the relation properties (may be null)
    * @return the new relation
    */
  def createRelation(
    pid: String,
    pidType: String,
    sourceId: String,
    collectedFrom: KeyValue,
    subRelType: String,
    relClass: String,
    date: String
  ): Relation = {
    val rel = new Relation
    rel.setCollectedfrom(List(collectedFrom).asJava)
    rel.setDataInfo(DATA_INFO)

    rel.setRelType(ModelConstants.RESULT_RESULT)
    rel.setSubRelType(subRelType)
    rel.setRelClass(relClass)

    rel.setSource(sourceId)
    rel.setTarget(generate_unresolved_id(pid, pidType))

    val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
    rel.setProperties(List(dateProps).asJava)

    rel
  }

  /** Shortcut for [[createRelation]] with the SUPPLEMENT / IS_SUPPLEMENT_TO semantics. */
  def createSupplementaryRelation(
    pid: String,
    pidType: String,
    sourceId: String,
    collectedFrom: KeyValue,
    date: String
  ): Relation = {
    createRelation(
      pid,
      pidType,
      sourceId,
      collectedFrom,
      ModelConstants.SUPPLEMENT,
      ModelConstants.IS_SUPPLEMENT_TO,
      date
    )
  }

  /** Converts a PDB JSON record into a `Dataset` plus, when a `pmid` is present,
    * a supplementary relation towards that publication.
    *
    * @param input the JSON record; `pdb` is mandatory
    * @return an empty list when `pdb` is empty or `title` is missing, otherwise
    *         `List(dataset)` or `List(dataset, relation)`
    */
  def pdbTOOaf(input: String): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pdb = (json \ "pdb").extract[String].toLowerCase
    val title = (json \ "title").extractOrElse[String](null)

    if (pdb.isEmpty || title == null) List()
    else {
      val d = new Dataset
      d.setPid(
        List(
          OafMapperUtils.structuredProperty(
            pdb,
            "pdb",
            "Protein Data Bank Identifier",
            ModelConstants.DNET_PID_TYPES,
            ModelConstants.DNET_PID_TYPES,
            DATA_INFO
          )
        ).asJava
      )
      d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
      d.setDataInfo(DATA_INFO)
      d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
      d.setOriginalId(List(pdb).asJava)
      d.setTitle(
        List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava
      )

      val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
      if (authors != null) {
        // Rank is the 1-based position of the author in the record.
        val convertedAuthors = authors.zipWithIndex.map { case (fullname, idx) =>
          val res = new Author
          res.setFullname(fullname)
          res.setRank(idx + 1)
          res
        }
        d.setAuthor(convertedAuthors.asJava)
      }

      val i = new Instance
      i.setPid(d.getPid)
      i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
      i.setInstancetype(
        OafMapperUtils.qualifier(
          "0046",
          "Bioentity",
          ModelConstants.DNET_PUBLICATION_RESOURCE,
          ModelConstants.DNET_PUBLICATION_RESOURCE
        )
      )
      i.setCollectedfrom(collectedFromMap("pdb"))
      d.setInstance(List(i).asJava)

      val pmid = (json \ "pmid").extractOrElse[String](null)
      if (pmid != null)
        List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
      else
        List(d)
    }
  }

  /** Extracts the publication pmid and the `links` object from a dump record.
    *
    * @param input the JSON record
    * @return the pmid (as Long) with the compact JSON rendering of `links`
    * @throws NumberFormatException when the pmid is not a valid long
    */
  def extractEBILinksFromDump(input: String): EBILinkItem = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val pmid = (json \ "publication" \ "pmid").extract[String]
    val links = (json \ "links").extract[JObject]
    EBILinkItem(pmid.toLong, compact(render(links)))
  }

  /** Keeps only the links whose target pid type is ena, pdb or uniprot (case-insensitive). */
  def EBITargetLinksFilter(input: EBILinks): Boolean = {
    List("ena", "pdb", "uniprot").exists(t => input.targetPidType.equalsIgnoreCase(t))
  }

  /** Parses the `Link` objects found anywhere in the given JSON into [[EBILinks]] items.
    *
    * A link is emitted only when all the matched fields (Target, RelationshipType.Name,
    * PublicationDate, Target.Title, Target.Identifier.{IDScheme, IDURL, ID}) are present.
    *
    * @param input the JSON document; `request.id` is mandatory and is used as pmid of every link
    * @return the extracted links, with the publication date passed through `cleanDate`
    */
  def parse_ebi_links(input: String): List[EBILinks] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pmid = (json \ "request" \ "id").extract[String]
    for {
      JObject(link)                                       <- json \\ "Link"
      JField("Target", JObject(target))                   <- link
      JField("RelationshipType", JObject(relType))        <- link
      JField("Name", JString(relation))                   <- relType
      JField("PublicationDate", JString(publicationDate)) <- link
      JField("Title", JString(title))                     <- target
      JField("Identifier", JObject(identifier))           <- target
      JField("IDScheme", JString(idScheme))               <- identifier
      JField("IDURL", JString(idUrl))                     <- identifier
      JField("ID", JString(id))                           <- identifier
    } yield EBILinks(
      relation,
      GraphCleaningFunctions.cleanDate(publicationDate),
      title,
      pmid,
      id,
      idScheme,
      idUrl
    )
  }

  /** Converts an EBI link into the target `Dataset` and a relation from that dataset
    * towards the publication identified by `input.pmid`.
    *
    * @param input the link to convert
    * @return `List(dataset, relation)`
    */
  def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
    val targetPid = input.targetPid.toLowerCase
    val targetPidType = input.targetPidType.toLowerCase
    // Assumes cleanDate is deterministic, so it is evaluated once and reused.
    val cleanedDate = GraphCleaningFunctions.cleanDate(input.date)

    val d = new Dataset
    d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
    d.setDataInfo(DATA_INFO)
    d.setTitle(
      List(
        OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
      ).asJava
    )

    val nsPrefix = targetPidType.padTo(12, '_')
    d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::$targetPid", true))
    d.setOriginalId(List(targetPid).asJava)

    // NOTE(review): the class name is always "Protein Data Bank Identifier", even for
    // ena/uniprot targets — confirm whether it should follow the pid type.
    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          targetPid,
          targetPidType,
          "Protein Data Bank Identifier",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )

    val i = new Instance
    i.setPid(d.getPid)
    i.setUrl(List(input.targetUrl).asJava)
    i.setInstancetype(
      OafMapperUtils.qualifier(
        "0046",
        "Bioentity",
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )
    i.setCollectedfrom(collectedFromMap("ebi"))
    d.setInstance(List(i).asJava)

    i.setDateofacceptance(OafMapperUtils.field(cleanedDate, DATA_INFO))
    d.setDateofacceptance(OafMapperUtils.field(cleanedDate, DATA_INFO))

    List(
      d,
      createRelation(
        input.pmid,
        "pmid",
        d.getId,
        collectedFromMap("ebi"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        cleanedDate
      )
    )
  }
}