implemented mapping from EBI and Scholix Resolved to OAF

This commit is contained in:
Sandro La Bruzzo 2021-06-28 22:04:22 +02:00
parent ad50415167
commit 511ec14c63
6 changed files with 244 additions and 56 deletions

View File

@ -15,19 +15,115 @@ object BioDBToOAF {
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val PDB_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
val UNIPROT_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
val SUBJ_CLASS = "Keywords"
UNIPROT_COLLECTED_FROM.setDataInfo(dataInfo)
PDB_COLLECTED_FROM.setDataInfo(dataInfo)
val EBI_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
case class UniprotDate(date: String, date_info: String) {}
/**
 * One resolved Scholix record, consumed by scholixResolvedToOAF.
 *
 * NOTE: the field is spelled `tilte` (sic). The sample input records use the same key,
 * and the record is populated by field name, so renaming it here alone would presumably
 * break deserialization of existing data -- confirm before changing.
 */
case class ScholixResolved(pid:String, pidType:String, typology:String, tilte:List[String], datasource:List[String], date:List[String], authors:List[String]){}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val SUBJ_CLASS = "Keywords"
/**
 * Base URL for each known PID type (keys are lower case).
 * The identifier is appended to the base URL to obtain the record URL.
 */
val resolvedURL:Map[String,String] = Map(
  "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
  "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
  "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
  // "onim" is the historical (misspelled) key and is kept so existing lookups still work;
  // "omim" is the spelling that matches the target site.
  "onim" -> "https://omim.org/entry/",
  "omim" -> "https://omim.org/entry/",
  "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
  "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
/**
 * Lookup table from a datasource label to the KeyValue used as `collectedfrom`.
 * Every KeyValue in the table carries DATA_INFO.
 */
val collectedFromMap: Map[String, KeyValue] = {

  // Creates one datasource reference and attaches the shared DATA_INFO to it.
  def source(id: String, name: String): KeyValue = {
    val kv: KeyValue = OafMapperUtils.keyValue(id, name)
    kv.setDataInfo(DATA_INFO)
    kv
  }

  Map(
    "uniprot" -> source("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot"),
    "pdb" -> source("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank"),
    "elsevier" -> source("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier"),
    // NOTE(review): the hash part of this id is 33 characters long, while every other id
    // in this table has 32 -- confirm the identifier against the datasource registry.
    "ebi" -> source("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)"),
    "Springer Nature" -> source("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature"),
    "NCBI Nucleotide" -> source("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide"),
    "European Nucleotide Archive" -> source("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive"),
    "Europe PMC" -> source(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
  )
}
/**
 * Maps a resolved Scholix record onto an OAF Dataset.
 *
 * The collectedFrom is resolved first from the head of `input.datasource`. When the record
 * has no datasource, or that datasource has no entry in collectedFromMap, null is returned
 * (instead of failing with NoSuchElementException on the unknown key).
 *
 * @param input the resolved Scholix record to convert
 * @return the populated Dataset, or null when no collectedFrom can be determined
 */
def scholixResolvedToOAF(input:ScholixResolved):Oaf = {
  val collectedFrom: Option[KeyValue] =
    Option(input.datasource).flatMap(_.headOption).flatMap(collectedFromMap.get)

  collectedFrom match {
    case None => null
    case Some(cf) =>
      val pid = input.pid.toLowerCase
      val pidType = input.pidType.toLowerCase

      val d = new Dataset
      d.setPid(
        List(
          OafMapperUtils.structuredProperty(pid, pidType, pidType, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
        ).asJava
      )
      d.setDataInfo(DATA_INFO)

      // The namespace prefix is the PID type padded with '_' to 12 characters.
      val nsPrefix = pidType.padTo(12, '_')
      d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::$pid", true))

      if (input.tilte != null && input.tilte.nonEmpty)
        d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)

      d.setOriginalId(List(input.pid).asJava)

      val i = new Instance
      i.setPid(d.getPid)

      // resolvedURL keys are lower case: look up with the lower-cased PID type so that
      // mixed-case inputs are resolved consistently with the id/pid built above.
      resolvedURL.get(pidType).foreach { baseUrl =>
        i.setUrl(List(s"$baseUrl${input.pid}").asJava)
      }

      if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
        i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
      else
        i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))

      d.setCollectedfrom(List(cf).asJava)
      i.setCollectedfrom(cf)
      d.setInstance(List(i).asJava)

      if (input.authors != null && input.authors.nonEmpty) {
        val authors = input.authors.map { fullName =>
          val authorOAF = new Author
          authorOAF.setFullname(fullName)
          authorOAF
        }
        d.setAuthor(authors.asJava)
      }

      // Only the first date is used, for both the instance and the dataset.
      if (input.date != null && input.date.nonEmpty) {
        val dt = input.date.head
        i.setDateofacceptance(OafMapperUtils.field(dt, DATA_INFO))
        d.setDateofacceptance(OafMapperUtils.field(dt, DATA_INFO))
      }

      d
  }
}
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@ -37,18 +133,18 @@ object BioDBToOAF {
d.setPid(
List(
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setDataInfo(dataInfo)
d.setDataInfo(DATA_INFO)
d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
d.setCollectedfrom(List(UNIPROT_COLLECTED_FROM).asJava)
d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
val title: String = (json \ "title").extractOrElse[String](null)
if (title != null)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setOriginalId(List(pid).asJava)
val i = new Instance
@ -57,7 +153,7 @@ object BioDBToOAF {
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(UNIPROT_COLLECTED_FROM)
i.setCollectedfrom(collectedFromMap("uniprot"))
d.setInstance(List(i).asJava)
val dates: List[UniprotDate] = for {
@ -80,14 +176,14 @@ object BioDBToOAF {
if (dates.nonEmpty) {
val i_date = dates.find(d => d.date_info.contains("entry version"))
if (i_date.isDefined) {
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, dataInfo))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, dataInfo))
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
.map(date => OafMapperUtils.structuredProperty(date.date, "UNKNOWN", "UNKNOWN", ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, dataInfo))
.map(date => OafMapperUtils.structuredProperty(date.date, "UNKNOWN", "UNKNOWN", ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
if (relevant_dates != null && relevant_dates.nonEmpty)
d.setRelevantdate(relevant_dates.asJava)
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, dataInfo))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
@ -103,25 +199,30 @@ object BioDBToOAF {
if (references_pmid != null && references_pmid.nonEmpty) {
val rel = createRelation(references_pmid.head, "pmid", d.getId, UNIPROT_COLLECTED_FROM, "relationship", "isRelatedTo")
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), "relationship", "isRelatedTo")
rel.getCollectedfrom
List(d, rel)
}
else if (references_doi != null && references_doi.nonEmpty) {
val rel = createRelation(references_doi.head, "doi", d.getId, UNIPROT_COLLECTED_FROM, "relationship", "isRelatedTo")
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), "relationship", "isRelatedTo")
List(d, rel)
}
else
List(d)
}
/** Placeholder conversion: the input is ignored and null is always returned. */
def crossrefLinkToRelation(input:String):Oaf = null
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType:String, relClass:String):Relation = {
val rel = new Relation
rel.setCollectedfrom(List(PDB_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
rel.setDataInfo(DATA_INFO)
rel.setRelType("resultResult")
rel.setSubRelType(subRelType)
@ -154,12 +255,12 @@ object BioDBToOAF {
d.setPid(
List(
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
d.setCollectedfrom(List(PDB_COLLECTED_FROM).asJava)
d.setDataInfo(dataInfo)
d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
d.setDataInfo(DATA_INFO)
d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
d.setOriginalId(List(pdb).asJava)
@ -167,7 +268,7 @@ object BioDBToOAF {
if (title == null)
return List()
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
@ -189,13 +290,13 @@ object BioDBToOAF {
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(PDB_COLLECTED_FROM)
i.setCollectedfrom(collectedFromMap("pdb"))
d.setInstance(List(i).asJava)
val pmid = (json \ "pmid").extractOrElse[String](null)
if (pmid != null)
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, PDB_COLLECTED_FROM))
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb")))
else
List(d)
}
@ -240,9 +341,9 @@ object BioDBToOAF {
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
val d = new Dataset
d.setCollectedfrom(List(EBI_COLLECTED_FROM).asJava)
d.setDataInfo(dataInfo)
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
d.setDataInfo(DATA_INFO)
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
@ -252,7 +353,7 @@ object BioDBToOAF {
d.setPid(
List(
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
).asJava
)
@ -262,11 +363,11 @@ object BioDBToOAF {
i.setUrl(List(input.targetUrl).asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setCollectedfrom(EBI_COLLECTED_FROM)
i.setCollectedfrom(collectedFromMap("ebi"))
d.setInstance(List(i).asJava)
i.setDateofacceptance(OafMapperUtils.field(input.date, dataInfo))
d.setDateofacceptance(OafMapperUtils.field(input.date, dataInfo))
i.setDateofacceptance(OafMapperUtils.field(input.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(input.date, DATA_INFO))
List(d, createRelation(input.pmid, "pmid", d.getId, EBI_COLLECTED_FROM,"relationship", "isRelatedTo"))
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"),"relationship", "isRelatedTo"))
}
}

View File

@ -2,6 +2,7 @@ package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
@ -31,13 +32,15 @@ object SparkTransformBioDatabaseToOAF {
val sc = spark.sparkContext
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._
database.toUpperCase() match {
case "UNIPROT" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath)
case "PDB"=>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
case "SCHOLIX" =>
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -1,4 +1,4 @@
<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>PDBPath</name>
@ -15,6 +15,11 @@
<description>the EBI Links Dataset Path</description>
</property>
<property>
<name>ScholixResolvedDBPath</name>
<description>the Scholix Resolved Dataset Path</description>
</property>
<property>
<name>targetPath</name>
<description>the Target Working dir path</description>
@ -102,10 +107,36 @@
<arg>--sourcePath</arg><arg>${EBIDataset}</arg>
<arg>--targetPath</arg><arg>${targetPath}/ebi_OAF</arg>
</spark>
<ok to="End"/>
<ok to="ConvertScholixResolved"/>
<error to="Kill"/>
</action>
<action name="ConvertScholixResolved">
<!-- Runs SparkTransformBioDatabaseToOAF with database=SCHOLIX: input is read from ${ScholixResolvedDBPath}
     and the output is written under ${targetPath}/scholix_resolved_OAF. On success the workflow ends. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Convert Scholix to OAF Dataset</name>
<class>eu.dnetlib.dhp.sx.bio.SparkTransformBioDatabaseToOAF</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=2000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--dbPath</arg><arg>${ScholixResolvedDBPath}</arg>
<arg>--database</arg><arg>SCHOLIX</arg>
<arg>--targetPath</arg><arg>${targetPath}/scholix_resolved_OAF</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.sx.bio.pubmed
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.ebi.SparkEBILinksToOaf
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith
@ -22,6 +23,9 @@ import org.json4s.jackson.JsonMethods.parse
class BioScholixTest extends AbstractVocabularyTest{
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
@BeforeEach
def setUp() :Unit = {
@ -46,8 +50,6 @@ class BioScholixTest extends AbstractVocabularyTest{
@Test
def testEBIData() = {
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
@ -58,9 +60,6 @@ class BioScholixTest extends AbstractVocabularyTest{
def testPubmedToOaf(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
val records:String =Source.fromInputStream(getClass.getResourceAsStream("pubmed_dump")).mkString
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size)
@ -74,9 +73,6 @@ class BioScholixTest extends AbstractVocabularyTest{
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/pdb_dump")).mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
@ -99,8 +95,6 @@ class BioScholixTest extends AbstractVocabularyTest{
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/uniprot_dump")).mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
@ -142,12 +136,41 @@ class BioScholixTest extends AbstractVocabularyTest{
val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi_links.gz"), "UTF-8")
val data = iterator.next()
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf)
print(res.length)
println(mapper.writeValueAsString(res.head))
}
/**
 * Reads the scholix_resolved fixture (one JSON record per line), parses every line into a
 * ScholixResolved and checks that converting the records yields a non-empty result list.
 */
@Test
def scholixResolvedToOAF():Unit ={
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

  val records:String = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/scholix_resolved")).mkString

  // The fixture must not contain blank lines.
  for (line <- records.lines) assertTrue(line.nonEmpty)

  val resolved:List[ScholixResolved] = records.lines.map(line => parse(line).extract[ScholixResolved]).toList

  val result:List[Oaf] = resolved.map(record => BioDBToOAF.scholixResolvedToOAF(record))
  assertTrue(result.nonEmpty)
}
}

View File

@ -0,0 +1,10 @@
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "P29964", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=P29964[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0141-8130", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0141-8130"}, "Name": "International Journal of Biological Macromolecules"}, "Identifier": {"ID": "10.1016/j.ijbiomac.2017.09.060", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ijbiomac.2017.09.060"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2018", "Title": "Molecular structure of cyclomaltodextrinase derived from amylolytic lactic acid bacterium "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "P56942", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=P56942[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0091-3022", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0091-3022"}, "Name": "Frontiers in Neuroendocrinology"}, "Identifier": {"ID": "10.1016/j.yfrne.2010.09.001", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.yfrne.2010.09.001"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2011", "Title": "New aspects of melanocortin signaling: A role for PRCP in \u03b1-MSH degradation"}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "Q9R0R4", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=Q9R0R4[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0091-3022", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0091-3022"}, "Name": "Frontiers in Neuroendocrinology"}, "Identifier": {"ID": "10.1016/j.yfrne.2010.09.001", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.yfrne.2010.09.001"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2011", "Title": "New aspects of melanocortin signaling: A role for PRCP in \u03b1-MSH degradation"}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "Q9EQX0", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=Q9EQX0[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0091-3022", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0091-3022"}, "Name": "Frontiers in Neuroendocrinology"}, "Identifier": {"ID": "10.1016/j.yfrne.2010.09.001", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.yfrne.2010.09.001"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2011", "Title": "New aspects of melanocortin signaling: A role for PRCP in \u03b1-MSH degradation"}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "NP_060522.3", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=NP_060522.3[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0002-9297", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0002-9297"}, "Name": "The American Journal of Human Genetics"}, "Identifier": {"ID": "10.1016/j.ajhg.2016.05.008", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ajhg.2016.05.008"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2016", "Title": "Biallelic Mutations of "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "NP_666328.2", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=NP_666328.2[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0002-9297", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0002-9297"}, "Name": "The American Journal of Human Genetics"}, "Identifier": {"ID": "10.1016/j.ajhg.2016.05.008", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ajhg.2016.05.008"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2016", "Title": "Biallelic Mutations of "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "NP_001025735.1", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=NP_001025735.1[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0002-9297", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0002-9297"}, "Name": "The American Journal of Human Genetics"}, "Identifier": {"ID": "10.1016/j.ajhg.2016.05.008", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ajhg.2016.05.008"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2016", "Title": "Biallelic Mutations of "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "XP_008119452.1", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=XP_008119452.1[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0002-9297", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0002-9297"}, "Name": "The American Journal of Human Genetics"}, "Identifier": {"ID": "10.1016/j.ajhg.2016.05.008", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ajhg.2016.05.008"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2016", "Title": "Biallelic Mutations of "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "XP_003964762.1", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=XP_003964762.1[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0002-9297", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0002-9297"}, "Name": "The American Journal of Human Genetics"}, "Identifier": {"ID": "10.1016/j.ajhg.2016.05.008", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ajhg.2016.05.008"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2016", "Title": "Biallelic Mutations of "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}
{"LinkProvider": {"Name": "Elsevier"}, "Target": {"Identifier": {"ID": "XP_004208946.2", "IDScheme": "ncbi-p", "IDURL": "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=protein&doptcmdl=genbank&term=XP_004208946.2[accn]"}, "Type": {"Name": "dataset"}}, "Source": {"Publisher": {"Identifier": {"ID": "0002-9297", "IDScheme": "ISNI", "IDURL": "http://www.isni.org/isni/0002-9297"}, "Name": "The American Journal of Human Genetics"}, "Identifier": {"ID": "10.1016/j.ajhg.2016.05.008", "IDScheme": "DOI", "IDURL": "https://doi.org/10.1016/j.ajhg.2016.05.008"}, "Type": {"SubType": "journal article", "Name": "literature"}, "PublicationDate": "2016", "Title": "Biallelic Mutations of "}, "LinkedPublicationDate": "2018", "LicenseURL": "https://creativecommons.org/licenses/by/4.0/", "RelationshipType": {"Name": "IsReferencedBy"}}

View File

@ -0,0 +1,20 @@
{"pid":"nm_012611","pidType":"genbank","typology":"dataset","tilte":["Dynamic localization and functional implications of C-peptide might for suppression of iNOS in high glucose-stimulated rat mesangial cells"],"datasource":["Springer Nature"],"date":["12-FEB-2000"],"authors":["Li Y","Zhao M","Li B","Qi J"]}
{"pid":"nm_008696","pidType":"genbank","typology":"dataset","tilte":["Map4k4 suppresses Srebp-1 and adipocyte lipogenesis independent of JNK signaling"],"datasource":["Springer Nature"],"date":["04-JAN-2000"],"authors":["Danai LV","Guilherme A","Guntur KV","Straubhaar J","Nicoloro SM","Czech MP"]}
{"pid":"nm_010676","pidType":"genbank","typology":"dataset","tilte":["A high-resolution anatomical atlas of the transcriptome in the mouse embryo"],"datasource":["Springer Nature"],"date":["25-JAN-2000"],"authors":["Diez-Roux G","Banfi S","Sultan M","Geffers L","Anand S","Rozado D","Magen A","Canidio E","Pagani M","Peluso I","Lin-Marq N","Koch M","Bilio M","Cantiello I","Verde R","De Masi C","Bianchi SA","Cicchini J","Perroud E","Mehmeti S","Dagand E","Schrinner S","Nurnberger A","Schmidt K","Metz K","Zwingmann C","Brieske N","Springer C","Hernandez AM","Herzog S","Grabbe F","Sieverding C","Fischer B","Schrader K","Brockmeyer M","Dettmer S","Helbig C","Alunni V","Battaini MA","Mura C","Henrichsen CN","Garcia-Lopez R","Echevarria D","Puelles E","Garcia-Calero E","Kruse S","Uhr M","Kauck C","Feng G","Milyaev N","Ong CK","Kumar L","Lam M","Semple CA","Gyenesei A","Mundlos S","Radelof U","Lehrach H","Sarmientos P","Reymond A","Davidson DR","Dolle P","Antonarakis SE","Yaspo ML","Martinez S","Baldock RA","Eichele G","Ballabio A"]}
{"pid":"nm_001511","pidType":"genbank","typology":"dataset","tilte":["CXCL1 inhibits airway smooth muscle cell migration through the decoy receptor Duffy antigen receptor for chemokines"],"datasource":["Springer Nature"],"date":["19-MAR-1999"],"authors":["Al-Alwan LA","Chang Y","Rousseau S","Martin JG","Eidelman DH","Hamid Q"]}
{"pid":"nm_024810","pidType":"genbank","typology":"dataset","tilte":["Homo sapiens cDNA: FLJ23018 fis, clone LNG00903"],"datasource":["Springer Nature"],"date":["18-MAR-2001"],"authors":["Sugano,S.","Suzuki,Y.","Ota,T.","Obayashi,M.","Nishi,T.","Isogai,T.","Shibahara,T.","Tanaka,T.","Nakamura,Y."]}
{"pid":"nm_001864","pidType":"genbank","typology":"dataset","tilte":["Muscle inflammatory signaling in response to 9 days of physical inactivity in young men with low compared with normal birth weight"],"datasource":["Springer Nature"],"date":["19-MAR-1999"],"authors":["Friedrichsen M","Ribel-Madsen R","Mortensen B","Hansen CN","Alibegovic AC","Hojbjerre L","Sonne MP","Wojtaszewski JF","Stallknecht B","Dela F","Vaag A"]}
{"pid":"nm_019578","pidType":"genbank","typology":"dataset","tilte":["A conditional knockout resource for the genome-wide study of mouse gene function"],"datasource":["Springer Nature"],"date":["31-JUL-2000"],"authors":["Skarnes WC","Rosen B","West AP","Koutsourakis M","Bushell W","Iyer V","Mujica AO","Thomas M","Harrow J","Cox T","Jackson D","Severin J","Biggs P","Fu J","Nefedov M","de Jong PJ","Stewart AF","Bradley A"]}
{"pid":"ay187231","pidType":"genbank","typology":"dataset","tilte":["Fkbp8: novel isoforms, genomic organization, and characterization of a forebrain promoter in transgenic mice"],"datasource":["Springer Nature","European Nucleotide Archive"],"date":["02-FEB-2003","2003-02-03"],"authors":["Nielsen,J.V.","Mitchelmore,C.","Pedersen,K.M.","Kjaerulff,K.M.","Finsen,B.","Jensen,N.A."]}
{"pid":"p50213","pidType":"genbank","typology":"dataset","tilte":["Characterization of a cDNA clone for human NAD(+)-specific isocitrate dehydrogenase alpha-subunit and structural comparison with its isoenzymes from different species"],"datasource":["Springer Nature"],"date":["01-OCT-1996"],"authors":["Kim,Y.O.","Oh,I.U.","Park,H.S.","Jeng,J.","Song,B.J.","Huh,T.L."]}
{"pid":"af187814","pidType":"genbank","typology":"dataset","tilte":["Rattus norvegicus putative N-acetyltransferase CML3 (Cml3) mRNA, partial cds."],"datasource":["Springer Nature","European Nucleotide Archive"],"date":["2000-01-02","01-JAN-2000"],"authors":["Popsueva,A.E.","Luchinskaya,N.N.","Ludwig,A.V.","Zinovjeva,O.Y.","Poteryaev,D.A.","Feigelman,M.M.","Ponomarev,M.B.","Berekelya,L.","Belyavsky,A.V."]}
{"pid":"p02787","pidType":"genbank","typology":"dataset","tilte":["Human transferrin: cDNA characterization and chromosomal localization"],"datasource":["Springer Nature"],"date":["21-JUL-1986"],"authors":["Yang,F.","Lum,J.B.","McGill,J.R.","Moore,C.M.","Naylor,S.L.","van Bragt,P.H.","Baldwin,W.D.","Bowman,B.H."]}
{"pid":"cp000653","pidType":"genbank","typology":"dataset","tilte":["Complete sequence of chromosome of Enterobacter sp. 638"],"datasource":["Springer Nature"],"date":["18-APR-2007"],"authors":["Copeland,A.","Lucas,S.","Lapidus,A.","Barry,K.","Glavina del Rio,T.","Dalin,E.","Tice,H.","Pitluck,S.","Kiss,H.","Brettin,T.","Bruce,D.","Detter,J.C.","Han,C.","Tapia,R.","Gilna,P.","Schmutz,J.","Larimer,F.","Land,M.","Hauser,L.","Kyrpides,N.","Kim,E.","Taghavi,S.","Newman,L.","Vangronsveld,J.","van der Lelie,D.","Richardson,P."]}
{"pid":"nm_011448","pidType":"nuccore","typology":"dataset","tilte":["Sry HMG box protein 9-positive (Sox9+) epithelial cell adhesion molecule-negative (EpCAM-) biphenotypic cells derived from hepatocytes are involved in mouse liver regeneration"],"datasource":["Springer Nature"],"date":["19-AUG-2002"],"authors":["Tanimizu N","Nishikawa Y","Ichinohe N","Akiyama H","Mitaka T"]}
{"pid":"nm_013127","pidType":"genbank","typology":"dataset","tilte":["Attenuation of glucose-induced insulin secretion by intermittent hypoxia via down-regulation of CD38"],"datasource":["Springer Nature"],"date":["12-FEB-2000"],"authors":["Ota H","Tamaki S","Itaya-Hironaka A","Yamauchi A","Sakuramoto-Tsuchida S","Morioka T","Takasawa S","Kimura H"]}
{"pid":"o57380","pidType":"genbank","typology":"dataset","tilte":["Structural and enzymatic properties of a gastric NADP(H)- dependent and retinal-active alcohol dehydrogenase"],"datasource":["Springer Nature"],"date":["30-MAY-2000"],"authors":["Peralba,J.M.","Cederlund,E.","Crosas,B.","Moreno,A.","Julia,P.","Martinez,S.E.","Persson,B.","Farr s,J.","Pares,X.","Jornvall,H."]}
{"pid":"nm_003299","pidType":"genbank","typology":"dataset","tilte":["The significance of HSP90AA1, HSP90AB1 and HSP90B1 gene polymorphisms in a Turkish population with non-small cell lung cancer"],"datasource":["Springer Nature"],"date":["19-MAR-1999"],"authors":["Coskunpinar E","Akkaya N","Yildiz P","Oltulu YM","Aynaci E","Isbir T","Yaylim I"]}
{"pid":"x58196","pidType":"genbank","typology":"dataset","tilte":["M.musculus H19 mRNA"],"datasource":["Springer Nature","European Nucleotide Archive"],"date":["1992-02-11","11-FEB-1992"],"authors":["Poirier,F.","Chan,C.T.","Timmons,P.M.","Robertson,E.J.","Evans,M.J.","Rigby,P.W."]}
{"pid":"nm_000389","pidType":"genbank","typology":"dataset","tilte":["Resveratrol mediated cell death in cigarette smoke transformed breast epithelial cells is through induction of p21Waf1/Cip1 and inhibition of long patch base excision repair pathway"],"datasource":["Springer Nature"],"date":["28-NOV-2000"],"authors":["Mohapatra P","Satapathy SR","Das D","Siddharth S","Choudhuri T","Kundu CN"]}
{"pid":"x13274","pidType":"genbank","typology":"dataset","tilte":["Expression of human immune interferon cDNA in E. coli and monkey cells"],"datasource":["Springer Nature","European Nucleotide Archive"],"date":["20-OCT-1988","1988-10-20"],"authors":["Gray,P.W.","Leung,D.W.","Pennica,D.","Yelverton,E.","Najarian,R.","Simonsen,C.C.","Derynck,R.","Sherwood,P.J.","Wallace,D.M.","Berger,S.L.","Levinson,A.D.","Goeddel,D.V."]}
{"pid":"nm_000201","pidType":"genbank","typology":"dataset","tilte":["Intercellular adhesion molecular-1, Fas, and Fas ligand as diagnostic biomarkers for acute allograft rejection of pancreaticoduodenal transplantation in pigs"],"datasource":["Springer Nature"],"date":["01-APR-1999"],"authors":["Dong GH","Li XF","Li JZ","Zhang ZD","Hu WM","Luo YH","Li ZD","Tian BL","He MX","Zhu XW"]}