dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala

396 lines
16 KiB
Scala

package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._
object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {}
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
case class UniprotDate(date: String, date_info: String) {}
case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate"
val resolvedURL: Map[String, String] = Map(
"genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
"clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
"onim" -> "https://omim.org/entry/",
"refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
val collectedFromMap: Map[String, KeyValue] = {
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO)
ElsevierCollectedFrom.setDataInfo(DATA_INFO)
EBICollectedFrom.setDataInfo(DATA_INFO)
pubmedCollectedFrom.setDataInfo(DATA_INFO)
enaCollectedFrom.setDataInfo(DATA_INFO)
ncbiCollectedFrom.setDataInfo(DATA_INFO)
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
Map(
"uniprot" -> UNIPROTCollectedFrom,
"pdb" -> PDBCollectedFrom,
"elsevier" -> ElsevierCollectedFrom,
"ebi" -> EBICollectedFrom,
"Springer Nature" -> springerNatureCollectedFrom,
"NCBI Nucleotide" -> ncbiCollectedFrom,
"European Nucleotide Archive" -> enaCollectedFrom,
"Europe PMC" -> pubmedCollectedFrom
)
}
def crossrefLinksToOaf(input: String): Oaf = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
}
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setDataInfo(DATA_INFO)
val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
if (input.tilte != null && input.tilte.nonEmpty)
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setOriginalId(List(input.pid).asJava)
val i = new Instance
i.setPid(d.getPid)
if (resolvedURL.contains(input.pidType)) {
i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
}
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
else
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
if (input.datasource == null || input.datasource.isEmpty)
return null
val ds = input.datasource.head
d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
i.setCollectedfrom(collectedFromMap(ds))
d.setInstance(List(i).asJava)
if (input.authors != null && input.authors.nonEmpty) {
val authors = input.authors.map(a => {
val authorOAF = new Author
authorOAF.setFullname(a)
authorOAF
})
d.setAuthor(authors.asJava)
}
if (input.date != null && input.date.nonEmpty) {
val dt = input.date.head
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
}
d
}
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pid = (json \ "pid").extract[String]
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setDataInfo(DATA_INFO)
d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
val title: String = (json \ "title").extractOrElse[String](null)
if (title != null)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setOriginalId(List(pid).asJava)
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(collectedFromMap("uniprot"))
d.setInstance(List(i).asJava)
val dates: List[UniprotDate] = for {
JObject(dateOBJ) <- json \ "dates"
JField("date", JString(date)) <- dateOBJ
JField("date_info", JString(date_info)) <- dateOBJ
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
if (subjects != null) {
d.setSubject(
subjects.map(s =>
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
}
var i_date: Option[UniprotDate] = None
if (dates.nonEmpty) {
i_date = dates.find(d => d.date_info.contains("entry version"))
if (i_date.isDefined) {
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
.map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
if (relevant_dates != null && relevant_dates.nonEmpty)
d.setRelevantdate(relevant_dates.asJava)
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val references_pmid: List[String] = for {
JObject(reference) <- json \ "references"
JField("PubMed", JString(pid)) <- reference
} yield pid
val references_doi: List[String] = for {
JObject(reference) <- json \ "references"
JField(" DOI", JString(pid)) <- reference
} yield pid
if (references_pmid != null && references_pmid.nonEmpty) {
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
rel.getCollectedfrom
List(d, rel)
}
else if (references_doi != null && references_doi.nonEmpty) {
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
List(d, rel)
}
else
List(d)
}
def generate_unresolved_id(pid: String, pidType: String): String = {
s"unresolved::$pid::$pidType"
}
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
rel.setDataInfo(DATA_INFO)
rel.setRelType(ModelConstants.RESULT_RESULT)
rel.setSubRelType(subRelType)
rel.setRelClass(relClass)
rel.setSource(sourceId)
rel.setTarget(s"unresolved::$pid::$pidType")
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.getTarget.startsWith("unresolved")
rel.setCollectedfrom(List(collectedFrom).asJava)
rel
}
def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
}
def pdbTOOaf(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pdb = (json \ "pdb").extract[String].toLowerCase
if (pdb.isEmpty)
return List()
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
d.setDataInfo(DATA_INFO)
d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
d.setOriginalId(List(pdb).asJava)
val title = (json \ "title").extractOrElse[String](null)
if (title == null)
return List()
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
if (authors != null) {
val convertedAuthors = authors.zipWithIndex.map { a =>
val res = new Author
res.setFullname(a._1)
res.setRank(a._2 + 1)
res
}
d.setAuthor(convertedAuthors.asJava)
}
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(collectedFromMap("pdb"))
d.setInstance(List(i).asJava)
val pmid = (json \ "pmid").extractOrElse[String](null)
if (pmid != null)
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
else
List(d)
}
def extractEBILinksFromDump(input: String): EBILinkItem = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pmid = (json \ "publication" \ "pmid").extract[String]
val links = (json \ "links").extract[JObject]
EBILinkItem(pmid.toLong, compact(render(links)))
}
def EBITargetLinksFilter(input: EBILinks): Boolean = {
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
}
def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pmid = (json \ "request" \ "id").extract[String]
for {
JObject(link) <- json \\ "Link"
JField("Target", JObject(target)) <- link
JField("RelationshipType", JObject(relType)) <- link
JField("Name", JString(relation)) <- relType
JField("PublicationDate", JString(publicationDate)) <- link
JField("Title", JString(title)) <- target
JField("Identifier", JObject(identifier)) <- target
JField("IDScheme", JString(idScheme)) <- identifier
JField("IDURL", JString(idUrl)) <- identifier
JField("ID", JString(id)) <- identifier
} yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
}
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
val d = new Dataset
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
d.setDataInfo(DATA_INFO)
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
d.setPid(
List(
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(input.targetUrl).asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(collectedFromMap("ebi"))
d.setInstance(List(i).asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
}
}