2021-10-15 15:00:15 +02:00
|
|
|
package eu.dnetlib.dhp.sx.bio
|
2021-06-24 17:20:00 +02:00
|
|
|
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
2021-06-29 12:02:03 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
2021-10-12 08:11:53 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf._
|
2021-06-24 17:20:00 +02:00
|
|
|
import org.json4s.DefaultFormats
|
|
|
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
|
|
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
2021-10-12 08:11:53 +02:00
|
|
|
import collection.JavaConverters._
|
2021-12-06 11:26:36 +01:00
|
|
|
|
2021-06-24 17:20:00 +02:00
|
|
|
object BioDBToOAF {
|
|
|
|
|
|
|
|
/** One record of the EBI links dump, as produced by `extractEBILinksFromDump`.
  *
  * @param id    the publication `pmid`, converted to a number
  * @param links compact JSON serialization of the record's `links` object
  */
case class EBILinkItem(id: Long, links: String)
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** A single link between a publication and a target entity, as yielded by `parse_ebi_links`.
  *
  * @param relType       name of the relationship type declared by the link
  * @param date          publication date of the link
  * @param title         title of the target entity
  * @param pmid          identifier of the publication the link was requested for
  * @param targetPid     identifier of the target entity
  * @param targetPidType identifier scheme of the target entity
  * @param targetUrl     URL of the target identifier
  */
case class EBILinks(
  relType: String,
  date: String,
  title: String,
  pmid: String,
  targetPid: String,
  targetPidType: String,
  targetUrl: String
)
|
2021-06-24 17:20:00 +02:00
|
|
|
|
2021-06-28 22:04:22 +02:00
|
|
|
/** A date attached to a UniProt record together with the free-text note describing it.
  *
  * @param date      the date value (`uniprotToOAF` stores it already cleaned)
  * @param date_info description of what the date refers to; `uniprotToOAF` looks for
  *                  the text "entry version" in it
  */
case class UniprotDate(date: String, date_info: String)
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** A resolved Scholix entity, the input of `scholixResolvedToOAF`.
  *
  * @param pid        persistent identifier of the entity
  * @param pidType    scheme of the persistent identifier
  * @param typology   typology of the entity (not read by `scholixResolvedToOAF`)
  * @param tilte      titles of the entity; the misspelling is part of the public field name
  *                   and is kept so existing callers and decoders keep working
  * @param datasource names of the providers; the first one is looked up in `collectedFromMap`
  * @param date       dates of the entity; only the first one is used
  * @param authors    full names of the authors
  */
case class ScholixResolved(
  pid: String,
  pidType: String,
  typology: String,
  tilte: List[String],
  datasource: List[String],
  date: List[String],
  authors: List[String]
)
|
|
|
|
|
|
|
|
/** Provenance information attached to every entity, relation, property and collected-from
  * entry built by this object. The same instance is shared by all of them.
  *
  * NOTE(review): the meaning of the positional arguments is defined by
  * `OafMapperUtils.dataInfo`, which is not visible from this file - confirm against it
  * before changing any of them.
  */
val DATA_INFO: DataInfo = {
  val provenanceAction = ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER
  val trust = "0.9" // presumably the trust level, expressed as a string
  OafMapperUtils.dataInfo(false, null, false, false, provenanceAction, trust)
}
|
2021-06-24 17:20:00 +02:00
|
|
|
/** Class id and class name given to every subject created by `uniprotToOAF`. */
val SUBJ_CLASS: String = "Keywords"

/** Key of the relation property in which `createRelation` stores the date of the link. */
val DATE_RELATION_KEY: String = "RelationDate"
|
|
|
|
|
2021-10-12 08:11:53 +02:00
|
|
|
/** Base URL of the landing page for each supported pid type.
  *
  * `scholixResolvedToOAF` appends the pid to the base URL to build the instance URL;
  * pid types without an entry get no URL. All keys are lower case.
  */
val resolvedURL: Map[String, String] = Map(
  "genbank"            -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "ncbi-n"             -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "ncbi-wgs"           -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "ncbi-p"             -> "https://www.ncbi.nlm.nih.gov/protein/",
  "ena"                -> "https://www.ebi.ac.uk/ena/browser/view/",
  "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
  // The resolver points to omim.org, so the pid type is registered under its proper name.
  "omim"               -> "https://omim.org/entry/",
  // Historical misspelling of "omim", kept so that any input already using it still resolves.
  "onim"               -> "https://omim.org/entry/",
  "refseq"             -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "geo"                -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
|
|
|
|
|
|
|
|
/** Providers known to this mapper, indexed by the key used to look them up.
  *
  * Each value is a `KeyValue` holding the datasource identifier and its name, with
  * `DATA_INFO` attached. Looking up a key that is not listed throws
  * `NoSuchElementException` when done through `collectedFromMap(key)`.
  */
val collectedFromMap: Map[String, KeyValue] = {
  // (lookup key, datasource identifier, datasource name)
  val providers: List[(String, String, String)] = List(
    ("uniprot", "10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot"),
    ("pdb", "10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank"),
    ("elsevier", "10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier"),
    (
      "ebi",
      "10|opendoar____::3e60e09c222f206c725385f53d7e567c",
      "EMBL-EBIs Protein Data Bank in Europe (PDBe)"
    ),
    ("Springer Nature", "10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature"),
    ("NCBI Nucleotide", "10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide"),
    (
      "European Nucleotide Archive",
      "10|re3data_____::c2a591f440598b63d854556beaf01591",
      "European Nucleotide Archive"
    ),
    ("Europe PMC", ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
  )

  providers.map { case (lookupKey, datasourceId, datasourceName) =>
    val provider: KeyValue = OafMapperUtils.keyValue(datasourceId, datasourceName)
    provider.setDataInfo(DATA_INFO)
    lookupKey -> provider
  }.toMap
}
|
2021-06-24 17:20:00 +02:00
|
|
|
|
2021-10-12 08:11:53 +02:00
|
|
|
/** Converts a JSON link (source, target, relationship type, publication date) into a relation.
  *
  * The relation goes from the unresolved id of the link's `Source` to the unresolved id of
  * its `Target`, is collected from "elsevier" and carries the cleaned `LinkedPublicationDate`.
  * Source and target identifiers and schemes are lower-cased.
  *
  * @param input the JSON serialization of the link
  * @return the relation built by `createRelation`
  * @note every field is read with `extract`, so a missing field makes json4s throw
  */
def crossrefLinksToOaf(input: String): Oaf = {
  implicit val formats: DefaultFormats.type = DefaultFormats
  val link = parse(input)

  // Reads one lower-cased field of the `Identifier` object of the given side of the link.
  def identifierField(side: String, field: String): String =
    (link \ side \ "Identifier" \ field).extract[String].toLowerCase

  val sourcePid = identifierField("Source", "ID")
  val sourcePidType = identifierField("Source", "IDScheme")
  val targetPid = identifierField("Target", "ID")
  val targetPidType = identifierField("Target", "IDScheme")

  val relationName = (link \ "RelationshipType" \ "Name").extract[String]
  val linkDate =
    GraphCleaningFunctions.cleanDate((link \ "LinkedPublicationDate").extract[String])

  createRelation(
    pid = targetPid,
    pidType = targetPidType,
    sourceId = generate_unresolved_id(sourcePid, sourcePidType),
    collectedFrom = collectedFromMap("elsevier"),
    subRelType = "relationship",
    relClass = relationName,
    date = linkDate
  )
}
|
|
|
|
|
2021-10-12 08:11:53 +02:00
|
|
|
/** Converts a resolved Scholix entity into an OAF `Dataset`.
  *
  * The dataset gets a lower-cased pid, an OpenAIRE id built from the pid type (padded to
  * 12 characters with `_`) and the pid, the first title, the authors, the first date as
  * date of acceptance, and one instance. The instance is typed "Clinical Trial" for
  * `clinicaltrials.gov` pids and "Bioentity" otherwise, and gets a URL when the pid type
  * has an entry in `resolvedURL`.
  *
  * @param input the resolved entity
  * @return the dataset, or `null` when the entity has no datasource
  * @throws NoSuchElementException when the first datasource is not a key of `collectedFromMap`
  */
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
  // Without a datasource the record cannot be attributed to a provider: nothing is built.
  if (input.datasource == null || input.datasource.isEmpty) null
  else {
    val pid = input.pid.toLowerCase
    val pidType = input.pidType.toLowerCase
    val collectedFrom = collectedFromMap(input.datasource.head)

    val d = new Dataset
    d.setDataInfo(DATA_INFO)
    d.setCollectedfrom(List(collectedFrom).asJava)
    d.setOriginalId(List(input.pid).asJava)
    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pid,
          pidType,
          pidType,
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )

    val nsPrefix = pidType.padTo(12, '_')
    d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::$pid", true))

    // Only the first title is kept.
    Option(input.tilte).flatMap(_.headOption).foreach { mainTitle =>
      d.setTitle(
        List(
          OafMapperUtils.structuredProperty(
            mainTitle,
            ModelConstants.MAIN_TITLE_QUALIFIER,
            DATA_INFO
          )
        ).asJava
      )
    }

    val i = new Instance
    i.setPid(d.getPid)
    i.setCollectedfrom(collectedFrom)

    // resolvedURL keys are lower case, so the lookup uses the lower-cased pid type: a pid
    // type such as "ENA" resolves like "ena", matching how the pid type is treated elsewhere
    // in this method. The pid keeps its original case in the URL.
    resolvedURL.get(pidType).foreach { baseUrl =>
      i.setUrl(List(s"$baseUrl${input.pid}").asJava)
    }

    // (instance type id, instance type name, original type recorded in the type mapping)
    val (typeId, typeName, originalType) =
      if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
        ("0037", "Clinical Trial", input.pidType)
      else
        ("0046", "Bioentity", "Bioentity")

    i.setInstancetype(
      OafMapperUtils.qualifier(
        typeId,
        typeName,
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )
    val itm = new InstanceTypeMapping
    itm.setOriginalType(originalType)
    itm.setVocabularyName(ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1)
    i.setInstanceTypeMapping(List(itm).asJava)

    d.setInstance(List(i).asJava)

    Option(input.authors).filter(_.nonEmpty).foreach { names =>
      val authors = names.map { fullname =>
        val author = new Author
        author.setFullname(fullname)
        author
      }
      d.setAuthor(authors.asJava)
    }

    // Only the first date is used; it is cleaned once and set on both instance and dataset.
    Option(input.date).flatMap(_.headOption).foreach { rawDate =>
      val acceptanceDate = GraphCleaningFunctions.cleanDate(rawDate)
      i.setDateofacceptance(OafMapperUtils.field(acceptanceDate, DATA_INFO))
      d.setDateofacceptance(OafMapperUtils.field(acceptanceDate, DATA_INFO))
    }

    d
  }
}
|
|
|
|
|
2021-06-24 17:20:00 +02:00
|
|
|
/** Converts the JSON serialization of a UniProt record into OAF entities.
  *
  * The result always starts with a `Dataset` describing the record (pid, id, optional
  * title, subjects, dates and a "Bioentity" instance pointing to uniprot.org). When the
  * record lists references, one relation to the first PubMed reference is appended, or,
  * failing that, to the first DOI reference.
  *
  * Dates whose `date_info` contains "entry version" provide the date of acceptance (the
  * first one wins); all the other dates are stored as relevant dates.
  *
  * @param input the JSON serialization of the record; `pid` is mandatory
  * @return `List(dataset)` or `List(dataset, relation)`
  */
def uniprotToOAF(input: String): List[Oaf] = {
  implicit val formats: DefaultFormats.type = DefaultFormats
  val json = parse(input)
  val pid = (json \ "pid").extract[String]

  val d = new Dataset
  d.setPid(
    List(
      OafMapperUtils.structuredProperty(
        pid,
        "uniprot",
        "uniprot",
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES,
        DATA_INFO
      )
    ).asJava
  )
  d.setDataInfo(DATA_INFO)
  d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
  d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
  d.setOriginalId(List(pid).asJava)

  Option((json \ "title").extractOrElse[String](null)).foreach { title =>
    d.setTitle(
      List(
        OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
      ).asJava
    )
  }

  val i = new Instance
  i.setPid(d.getPid)
  i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
  i.setInstancetype(
    OafMapperUtils.qualifier(
      "0046",
      "Bioentity",
      ModelConstants.DNET_PUBLICATION_RESOURCE,
      ModelConstants.DNET_PUBLICATION_RESOURCE
    )
  )
  val itm = new InstanceTypeMapping
  itm.setOriginalType("Bioentity")
  itm.setVocabularyName(ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1)
  i.setInstanceTypeMapping(List(itm).asJava)
  i.setCollectedfrom(collectedFromMap("uniprot"))
  d.setInstance(List(i).asJava)

  val dates: List[UniprotDate] = for {
    JObject(dateOBJ)                        <- json \ "dates"
    JField("date", JString(date))           <- dateOBJ
    JField("date_info", JString(date_info)) <- dateOBJ
  } yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)

  Option((json \\ "subjects").extractOrElse[List[String]](null)).foreach { subjects =>
    d.setSubject(
      subjects
        .map(s =>
          OafMapperUtils.subject(
            s,
            SUBJ_CLASS,
            SUBJ_CLASS,
            ModelConstants.DNET_SUBJECT_TYPOLOGIES,
            ModelConstants.DNET_SUBJECT_TYPOLOGIES,
            null
          )
        )
        .asJava
    )
  }

  // The date of acceptance is set only when an "entry version" date exists: a record may
  // carry dates without any such entry, and reading the missing value would throw.
  val entryDate: Option[UniprotDate] = dates.find(_.date_info.contains("entry version"))
  entryDate.foreach { entry =>
    i.setDateofacceptance(OafMapperUtils.field(entry.date, DATA_INFO))
    d.setDateofacceptance(OafMapperUtils.field(entry.date, DATA_INFO))
  }

  val relevantDates: List[StructuredProperty] = dates
    .filterNot(_.date_info.contains("entry version"))
    .map(other =>
      OafMapperUtils.structuredProperty(
        other.date,
        ModelConstants.UNKNOWN,
        ModelConstants.UNKNOWN,
        ModelConstants.DNET_DATACITE_DATE,
        ModelConstants.DNET_DATACITE_DATE,
        DATA_INFO
      )
    )
  if (relevantDates.nonEmpty)
    d.setRelevantdate(relevantDates.asJava)

  val referencesPmid: List[String] = for {
    JObject(reference)                   <- json \ "references"
    JField("PubMed", JString(reference)) <- reference
  } yield reference

  // NOTE(review): the key literally starts with a space (" DOI"); it is kept unchanged
  // because the format of the dump is not visible here - confirm it matches the input.
  val referencesDoi: List[String] = for {
    JObject(reference)                 <- json \ "references"
    JField(" DOI", JString(reference)) <- reference
  } yield reference

  // PubMed references take precedence over DOI references; only the first one is linked.
  val linkedReference: Option[(String, String)] =
    referencesPmid.headOption
      .map(_ -> "pmid")
      .orElse(referencesDoi.headOption.map(_ -> "doi"))

  linkedReference match {
    case Some((referencePid, referencePidType)) =>
      val rel = createRelation(
        referencePid,
        referencePidType,
        d.getId,
        collectedFromMap("uniprot"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        entryDate.map(_.date).orNull
      )
      List(d, rel)
    case None =>
      List(d)
  }
}
|
|
|
|
|
2021-10-12 08:11:53 +02:00
|
|
|
/** Builds the placeholder identifier of an entity known only by its pid.
  *
  * @param pid     the persistent identifier
  * @param pidType the scheme of the persistent identifier
  * @return `unresolved::<pid>::<pidType>`
  */
def generate_unresolved_id(pid: String, pidType: String): String =
  Seq("unresolved", pid, pidType).mkString("::")
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Creates a result-to-result relation from `sourceId` to the unresolved id of a pid.
  *
  * @param pid           persistent identifier of the target
  * @param pidType       scheme of the target identifier
  * @param sourceId      identifier of the source of the relation
  * @param collectedFrom provider the relation is attributed to
  * @param subRelType    sub relation type
  * @param relClass      relation class
  * @param date          date stored in the `DATE_RELATION_KEY` property of the relation
  * @return the relation, with `DATA_INFO` as data info
  */
def createRelation(
  pid: String,
  pidType: String,
  sourceId: String,
  collectedFrom: KeyValue,
  subRelType: String,
  relClass: String,
  date: String
): Relation = {
  val rel = new Relation
  rel.setDataInfo(DATA_INFO)
  rel.setCollectedfrom(List(collectedFrom).asJava)

  rel.setRelType(ModelConstants.RESULT_RESULT)
  rel.setSubRelType(subRelType)
  rel.setRelClass(relClass)

  rel.setSource(sourceId)
  // The target is not resolved yet: it is identified by its pid through the shared
  // unresolved-id format.
  rel.setTarget(generate_unresolved_id(pid, pidType))

  // NOTE(review): some callers pass a null date (e.g. pdbTOOaf); the property is created
  // anyway, with a null value - confirm whether consumers expect it.
  val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
  rel.setProperties(List(dateProps).asJava)

  rel
}
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Creates a relation stating that `sourceId` is a supplement to the entity identified by `pid`.
  *
  * Delegates to `createRelation` with sub relation type `ModelConstants.SUPPLEMENT` and
  * relation class `ModelConstants.IS_SUPPLEMENT_TO`.
  *
  * @param pid           persistent identifier of the target
  * @param pidType       scheme of the target identifier
  * @param sourceId      identifier of the source of the relation
  * @param collectedFrom provider the relation is attributed to
  * @param date          date of the relation
  * @return the relation
  */
def createSupplementaryRelation(
  pid: String,
  pidType: String,
  sourceId: String,
  collectedFrom: KeyValue,
  date: String
): Relation =
  createRelation(
    pid = pid,
    pidType = pidType,
    sourceId = sourceId,
    collectedFrom = collectedFrom,
    subRelType = ModelConstants.SUPPLEMENT,
    relClass = ModelConstants.IS_SUPPLEMENT_TO,
    date = date
  )
|
|
|
|
|
|
|
|
/** Converts the JSON serialization of a Protein Data Bank record into OAF entities.
  *
  * @param input the JSON serialization of the record; `pdb` is mandatory, `title` is
  *              required to produce an output, `authors` and `pmid` are optional
  * @return an empty list when the `pdb` identifier is empty or the title is missing;
  *         otherwise the dataset, followed by a supplement relation to the publication
  *         when the record has a `pmid`
  */
def pdbTOOaf(input: String): List[Oaf] = {
  implicit val formats: DefaultFormats.type = DefaultFormats
  val record = parse(input)
  val pdbId = (record \ "pdb").extract[String].toLowerCase

  if (pdbId.isEmpty) List()
  else {
    val provider = "pdb"

    val d = new Dataset
    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pdbId,
          "pdb",
          "Protein Data Bank Identifier",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )
    d.setCollectedfrom(List(collectedFromMap(provider)).asJava)
    d.setDataInfo(DATA_INFO)
    d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdbId", true))
    d.setOriginalId(List(pdbId).asJava)

    Option((record \ "title").extractOrElse[String](null)) match {
      // Records without a title are discarded.
      case None => List()

      case Some(mainTitle) =>
        d.setTitle(
          List(
            OafMapperUtils.structuredProperty(
              mainTitle,
              ModelConstants.MAIN_TITLE_QUALIFIER,
              DATA_INFO
            )
          ).asJava
        )

        val authorNames: List[String] =
          (record \ "authors").extractOrElse[List[String]](null)
        if (authorNames != null) {
          // Authors are ranked by their position in the record, starting from 1.
          val rankedAuthors = authorNames.zipWithIndex.map { case (fullname, position) =>
            val author = new Author
            author.setFullname(fullname)
            author.setRank(position + 1)
            author
          }
          d.setAuthor(rankedAuthors.asJava)
        }

        val i = new Instance
        i.setPid(d.getPid)
        i.setUrl(List(s"https://www.rcsb.org/structure/$pdbId").asJava)
        i.setInstancetype(
          OafMapperUtils.qualifier(
            "0046",
            "Bioentity",
            ModelConstants.DNET_PUBLICATION_RESOURCE,
            ModelConstants.DNET_PUBLICATION_RESOURCE
          )
        )
        val itm = new InstanceTypeMapping
        itm.setOriginalType("Bioentity")
        itm.setVocabularyName(ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1)
        i.setInstanceTypeMapping(List(itm).asJava)
        i.setCollectedfrom(collectedFromMap(provider))
        d.setInstance(List(i).asJava)

        Option((record \ "pmid").extractOrElse[String](null)) match {
          case Some(publicationId) =>
            List(
              d,
              createSupplementaryRelation(
                publicationId,
                "pmid",
                d.getId,
                collectedFromMap(provider),
                null
              )
            )
          case None =>
            List(d)
        }
    }
  }
}
|
|
|
|
|
|
|
|
/** Extracts the publication id and the raw links from one record of the EBI links dump.
  *
  * @param input the JSON serialization of the record; it must contain `publication.pmid`
  *              and a `links` object
  * @return the numeric pmid together with the compact JSON rendering of `links`
  * @throws NumberFormatException when the pmid is not a valid long
  */
def extractEBILinksFromDump(input: String): EBILinkItem = {
  implicit val formats: DefaultFormats.type = DefaultFormats
  val record = parse(input)

  val publicationId: String = (record \ "publication" \ "pmid").extract[String]
  val rawLinks: JObject = (record \ "links").extract[JObject]

  EBILinkItem(id = publicationId.toLong, links = compact(render(rawLinks)))
}
|
|
|
|
|
|
|
|
/** Tells whether a link points to one of the target types handled by this mapper.
  *
  * @param input the link to check
  * @return `true` when the target pid type is "ena", "pdb" or "uniprot", ignoring case
  */
def EBITargetLinksFilter(input: EBILinks): Boolean = {
  val acceptedTargetTypes = Seq("ena", "pdb", "uniprot")
  acceptedTargetTypes.exists(accepted => input.targetPidType.equalsIgnoreCase(accepted))
}
|
|
|
|
|
|
|
|
/** Parses the JSON payload of the links of a publication into a list of `EBILinks`.
  *
  * One item is produced for every `Link` object that provides all of: a `Target` with
  * `Title` and `Identifier` (`IDScheme`, `IDURL`, `ID`), a `RelationshipType` with `Name`,
  * and a `PublicationDate`, each as a JSON string. Links missing any of them are skipped.
  * The publication date is cleaned with `GraphCleaningFunctions.cleanDate`.
  *
  * @param input the JSON payload; `request.id` is mandatory and becomes the `pmid` of every item
  * @return the links found in the payload
  */
def parse_ebi_links(input: String): List[EBILinks] = {
  implicit val formats: DefaultFormats.type = DefaultFormats
  val payload = parse(input)
  val requestedPmid = (payload \ "request" \ "id").extract[String]

  for {
    JObject(linkFields)                                 <- payload \\ "Link"
    JField("Target", JObject(targetFields))             <- linkFields
    JField("RelationshipType", JObject(relTypeFields))  <- linkFields
    JField("Name", JString(relationName))               <- relTypeFields
    JField("PublicationDate", JString(publishedOn))     <- linkFields
    JField("Title", JString(targetTitle))               <- targetFields
    JField("Identifier", JObject(identifierFields))     <- targetFields
    JField("IDScheme", JString(scheme))                 <- identifierFields
    JField("IDURL", JString(url))                       <- identifierFields
    JField("ID", JString(targetId))                     <- identifierFields
  } yield EBILinks(
    relType = relationName,
    date = GraphCleaningFunctions.cleanDate(publishedOn),
    title = targetTitle,
    pmid = requestedPmid,
    targetPid = targetId,
    targetPidType = scheme,
    targetUrl = url
  )
}
|
|
|
|
|
|
|
|
/** Converts an EBI link into the target dataset plus the relation to the publication.
  *
  * The dataset is identified by the lower-cased target pid; its OpenAIRE id is built from
  * the lower-cased target pid type (padded to 12 characters with `_`) and the pid. It has
  * one "Bioentity" instance pointing to the target URL, and the cleaned link date as date
  * of acceptance. The relation links the dataset to the unresolved id of the publication
  * pmid and carries the same date.
  *
  * @param input the link to convert
  * @return `List(dataset, relation)`
  */
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
  val targetPid = input.targetPid.toLowerCase
  val targetPidType = input.targetPidType.toLowerCase
  val acceptanceDate = GraphCleaningFunctions.cleanDate(input.date)
  val collectedFrom = collectedFromMap("ebi")

  // EBITargetLinksFilter admits "ena" and "uniprot" targets as well as "pdb", so the
  // descriptive PDB label is used for pdb pids only; the other types are labelled with
  // the pid type itself, as the other converters of this object do.
  val pidClassName =
    if (targetPidType == "pdb") "Protein Data Bank Identifier" else targetPidType

  val d = new Dataset
  d.setCollectedfrom(List(collectedFrom).asJava)
  d.setDataInfo(DATA_INFO)
  d.setTitle(
    List(
      OafMapperUtils.structuredProperty(
        input.title,
        ModelConstants.MAIN_TITLE_QUALIFIER,
        DATA_INFO
      )
    ).asJava
  )

  val nsPrefix = targetPidType.padTo(12, '_')
  d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::$targetPid", true))
  d.setOriginalId(List(targetPid).asJava)
  d.setPid(
    List(
      OafMapperUtils.structuredProperty(
        targetPid,
        targetPidType,
        pidClassName,
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES,
        DATA_INFO
      )
    ).asJava
  )
  d.setDateofacceptance(OafMapperUtils.field(acceptanceDate, DATA_INFO))

  val i = new Instance
  i.setPid(d.getPid)
  i.setUrl(List(input.targetUrl).asJava)
  i.setInstancetype(
    OafMapperUtils.qualifier(
      "0046",
      "Bioentity",
      ModelConstants.DNET_PUBLICATION_RESOURCE,
      ModelConstants.DNET_PUBLICATION_RESOURCE
    )
  )
  val itm = new InstanceTypeMapping
  itm.setOriginalType("Bioentity")
  itm.setVocabularyName(ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1)
  i.setInstanceTypeMapping(List(itm).asJava)
  i.setCollectedfrom(collectedFrom)
  i.setDateofacceptance(OafMapperUtils.field(acceptanceDate, DATA_INFO))
  d.setInstance(List(i).asJava)

  List(
    d,
    createRelation(
      input.pmid,
      "pmid",
      d.getId,
      collectedFrom,
      ModelConstants.RELATIONSHIP,
      ModelConstants.IS_RELATED_TO,
      acceptanceDate
    )
  )
}
|
|
|
|
}
|