dnet-hadoop/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala

203 lines
6.4 KiB
Scala
Raw Normal View History

package eu.dnetlib.dhp.sx.bio
2020-07-10 14:44:50 +02:00
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
2021-10-12 08:11:53 +02:00
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
2021-06-21 16:23:59 +02:00
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
2021-06-29 10:33:09 +02:00
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
2021-06-29 10:33:09 +02:00
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
import scala.collection.JavaConverters._
import scala.io.Source
import scala.xml.pull.XMLEventReader
2021-06-03 10:52:09 +02:00
@ExtendWith(Array(classOf[MockitoExtension]))
2022-01-11 16:57:48 +01:00
class BioScholixTest extends AbstractVocabularyTest {
2021-05-04 14:54:12 +02:00
/** Jackson mapper shared by the tests: indents serialized output and does not
  * fail on unknown properties when deserializing.
  */
val mapper = new ObjectMapper()
  .enable(SerializationFeature.INDENT_OUTPUT)
  .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
/** Runs before every test and delegates to the parent's `setUpVocabulary()`. */
@BeforeEach
def setUp(): Unit = super.setUpVocabulary()
2020-07-10 14:44:50 +02:00
/** Iterates over the lines of a [[java.io.BufferedReader]].
  *
  * End of input is detected by `readLine()` returning `null`, not by
  * `reader.ready`: `ready` only reports whether a read would block, so it may
  * be `true` at end of stream (yielding a `null` line) or `false` while lines
  * are still pending.
  *
  * @param reader source of the lines; it is read lazily and is NOT closed here
  */
class BufferedReaderIterator(reader: BufferedReader) extends Iterator[String] {

  // Look-ahead delegate: pulls one line at a time and stops at the first null.
  private val pending: Iterator[String] =
    Iterator.continually(reader.readLine()).takeWhile(_ != null)

  /** @return `true` while at least one more line can be read */
  override def hasNext(): Boolean = pending.hasNext

  /** @return the next line, without its line terminator
    * @throws java.util.NoSuchElementException when the reader is exhausted
    */
  override def next(): String = pending.next()
}
/** Factory for line iterators over gzip-compressed text streams. */
object GzFileIterator {

  /** Layers gzip decompression and character decoding over `is` and exposes the
    * result line by line.
    *
    * @param is       gzip-compressed input stream
    * @param encoding charset name used to decode the decompressed bytes
    */
  def apply(is: InputStream, encoding: String) = {
    val decompressed = new GZIPInputStream(is)
    val decoded = new InputStreamReader(decompressed, encoding)
    new BufferedReaderIterator(new BufferedReader(decoded))
  }
}
2021-05-04 14:54:12 +02:00
/** Parses the sample `pubmed.xml` resource with `PMParser` and prints every
  * parsed item as JSON.
  */
@Test
def testEBIData(): Unit = {
  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
  // Fail with a clear assertion instead of a late NullPointerException.
  assertNotNull(stream)

  val source = Source.fromInputStream(stream)
  val inputXML =
    try source.mkString
    finally source.close()

  // Feed the decoded text straight to the XML reader: encoding it back to bytes
  // and decoding again with the platform charset can corrupt non-ASCII text.
  val xml = new XMLEventReader(Source.fromString(inputXML))
  new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
/** Converts every line of the `pubmed_dump` resource into an OAF object and
  * checks that 10 objects are produced and that at least one instance type has
  * class id "0037".
  */
@Test
def testPubmedToOaf(): Unit = {
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")
  assertNotNull(stream)
  val source = Source.fromInputStream(stream)
  // Source.getLines avoids String#lines, which resolves to the Java method
  // (returning a java.util.stream.Stream) when compiling on JDK 11+.
  val records: List[String] =
    try source.getLines().toList
    finally source.close()

  val r: List[Oaf] = records
    .map(s => mapper.readValue(s, classOf[PMArticle]))
    .map(a => PubMedToOaf.convert(a, vocabularies))
  assertEquals(10, r.size)

  assertTrue(
    r.map(p => p.asInstanceOf[Result])
      .flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
      .exists(p => "0037".equalsIgnoreCase(p))
  )
  println(mapper.writeValueAsString(r.head))
}
/** Runs `BioDBToOAF.pdbTOOaf` over every line of the `pdb_dump` resource and
  * checks that it yields a non-empty list with no null elements.
  */
@Test
def testPDBToOAF(): Unit = {
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")
  assertNotNull(stream)
  val source = Source.fromInputStream(stream)
  // Source.getLines avoids String#lines, which resolves to the Java method
  // (returning a java.util.stream.Stream) when compiling on JDK 11+.
  val records: List[String] =
    try source.getLines().toList
    finally source.close()
  records.foreach(s => assertTrue(s.nonEmpty))

  val result: List[Oaf] = records.flatMap(o => BioDBToOAF.pdbTOOaf(o))

  assertTrue(result.nonEmpty)
  result.foreach(r => assertNotNull(r))

  println(result.count(o => o.isInstanceOf[Relation]))
  println(mapper.writeValueAsString(result.head))
}
/** Runs `BioDBToOAF.uniprotToOAF` over every line of the `uniprot_dump`
  * resource and checks that it yields a non-empty list with no null elements.
  */
@Test
def testUNIprotToOAF(): Unit = {
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")
  assertNotNull(stream)
  val source = Source.fromInputStream(stream)
  // Source.getLines avoids String#lines, which resolves to the Java method
  // (returning a java.util.stream.Stream) when compiling on JDK 11+.
  val records: List[String] =
    try source.getLines().toList
    finally source.close()
  records.foreach(s => assertTrue(s.nonEmpty))

  val result: List[Oaf] = records.flatMap(o => BioDBToOAF.uniprotToOAF(o))
  assertTrue(result.nonEmpty)
  result.foreach(r => assertNotNull(r))

  println(result.count(o => o.isInstanceOf[Relation]))
  println(mapper.writeValueAsString(result.head))
}
2022-01-11 16:57:48 +01:00
/** One link extracted from an EBI links JSON payload by `parse_ebi_links`.
  *
  * @param relType       value of `RelationshipType.Name`
  * @param date          value of `PublicationDate`
  * @param title         value of `Target.Title`
  * @param pmid          value of `publication.pmid` of the enclosing payload
  * @param targetPid     value of `Target.Identifier.ID`
  * @param targetPidType value of `Target.Identifier.IDScheme`
  */
case class EBILinks(
  relType: String,
  date: String,
  title: String,
  pmid: String,
  targetPid: String,
  targetPidType: String
)
2022-01-11 16:57:48 +01:00
/** Extracts the links contained in an EBI links JSON document.
  *
  * The `publication.pmid` value is read once and attached to every link found
  * under a `Link` field; a link is emitted only when all the fields matched by
  * the generators below are present with the expected JSON type.
  *
  * @param input JSON text of the EBI links payload
  * @return one [[EBILinks]] per matched combination of fields
  */
def parse_ebi_links(input: String): List[EBILinks] = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val document = parse(input)
  val pmid = (document \ "publication" \ "pmid").extract[String]

  // Generator order is significant: it fixes both the nesting and the result order.
  for {
    JObject(linkFields)                                  <- document \\ "Link"
    JField("Target", JObject(targetFields))              <- linkFields
    JField("RelationshipType", JObject(relTypeFields))   <- linkFields
    JField("Name", JString(relationName))                <- relTypeFields
    JField("PublicationDate", JString(publishedOn))      <- linkFields
    JField("Title", JString(targetTitle))                <- targetFields
    JField("Identifier", JObject(identifierFields))      <- targetFields
    JField("IDScheme", JString(scheme))                  <- identifierFields
    JField("ID", JString(targetId))                      <- identifierFields
  } yield EBILinks(relationName, publishedOn, targetTitle, pmid, targetId, scheme)
}
/** Runs `BioDBToOAF.crossrefLinksToOaf` over every line of the
  * `crossref_links` resource and checks that the result is non-empty.
  */
@Test
def testCrossrefLinksToOAF(): Unit = {
  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")
  assertNotNull(stream)
  val source = Source.fromInputStream(stream)
  // Source.getLines avoids String#lines, which resolves to the Java method
  // (returning a java.util.stream.Stream) when compiling on JDK 11+.
  val records: List[String] =
    try source.getLines().toList
    finally source.close()
  records.foreach(s => assertTrue(s.nonEmpty))

  val result: List[Oaf] = records.map(s => BioDBToOAF.crossrefLinksToOaf(s))
  assertNotNull(result)
  assertTrue(result.nonEmpty)
  println(mapper.writeValueAsString(result.head))
}
/** Reads the first line of the gzip-compressed `ebi_links.gz` resource, converts
  * its links to OAF through `BioDBToOAF`, and prints the number of results and
  * the first one as JSON.
  */
@Test
def testEBILinksToOAF(): Unit = {
  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz")
  // Fail with a clear assertion instead of a NullPointerException in the gzip layer.
  assertNotNull(stream)

  try {
    val iterator = GzFileIterator(stream, "UTF-8")
    val data = iterator.next()
    assertNotNull(data)

    val res = BioDBToOAF
      .parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
      .filter(BioDBToOAF.EBITargetLinksFilter)
      .flatMap(BioDBToOAF.convertEBILinksToOaf)

    // Guard the `head` access below so an empty conversion fails as an assertion.
    assertTrue(res.nonEmpty)
    println(res.length)
    println(mapper.writeValueAsString(res.head))
  } finally {
    // GzFileIterator does not expose its reader, so release the underlying stream.
    stream.close()
  }
}
/** Deserializes every line of the `scholix_resolved` resource into a
  * `ScholixResolved`, converts each through `BioDBToOAF.scholixResolvedToOAF`,
  * and checks that the result is non-empty.
  */
@Test
def scholixResolvedToOAF(): Unit = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats

  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
  assertNotNull(stream)
  val source = Source.fromInputStream(stream)
  // Source.getLines avoids String#lines, which resolves to the Java method
  // (returning a java.util.stream.Stream) when compiling on JDK 11+.
  val records: List[String] =
    try source.getLines().toList
    finally source.close()
  records.foreach(s => assertTrue(s.nonEmpty))

  val l: List[ScholixResolved] = records.map(input => parse(input).extract[ScholixResolved])

  val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
  assertTrue(result.nonEmpty)
}
2020-07-10 14:44:50 +02:00
}