2021-10-15 15:00:15 +02:00
|
|
|
package eu.dnetlib.dhp.sx.bio
|
2020-07-10 14:44:50 +02:00
|
|
|
|
2021-06-16 14:56:24 +02:00
|
|
|
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
|
2021-10-12 08:11:53 +02:00
|
|
|
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
2022-07-13 15:27:17 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
2021-10-15 15:00:15 +02:00
|
|
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
2022-07-14 10:19:59 +02:00
|
|
|
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
|
2021-06-29 10:33:09 +02:00
|
|
|
import org.json4s.DefaultFormats
|
|
|
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
|
|
|
import org.json4s.jackson.JsonMethods.parse
|
2021-06-16 14:56:24 +02:00
|
|
|
import org.junit.jupiter.api.Assertions._
|
|
|
|
import org.junit.jupiter.api.extension.ExtendWith
|
|
|
|
import org.junit.jupiter.api.{BeforeEach, Test}
|
|
|
|
import org.mockito.junit.jupiter.MockitoExtension
|
|
|
|
|
2021-06-29 10:33:09 +02:00
|
|
|
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
2021-06-24 17:20:00 +02:00
|
|
|
import java.util.zip.GZIPInputStream
|
2021-06-16 14:56:24 +02:00
|
|
|
import scala.collection.JavaConverters._
|
2022-07-13 15:27:17 +02:00
|
|
|
import scala.collection.mutable.ListBuffer
|
2021-06-16 14:56:24 +02:00
|
|
|
import scala.io.Source
|
|
|
|
import scala.xml.pull.XMLEventReader
|
2021-06-03 10:52:09 +02:00
|
|
|
|
2021-06-16 14:56:24 +02:00
|
|
|
@ExtendWith(Array(classOf[MockitoExtension]))
|
2022-01-11 16:57:48 +01:00
|
|
|
class BioScholixTest extends AbstractVocabularyTest {
|
2021-05-04 14:54:12 +02:00
|
|
|
|
2021-06-28 22:04:22 +02:00
|
|
|
// Shared Jackson mapper for the whole suite: pretty-prints on serialization and
// tolerates unknown JSON properties when deserializing test fixtures.
val mapper = new ObjectMapper()
  .enable(SerializationFeature.INDENT_OUTPUT)
  .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
2021-06-28 22:04:22 +02:00
|
|
|
|
2021-06-16 14:56:24 +02:00
|
|
|
// Runs before every test: loads the vocabulary fixtures that the
// PubMed/bio conversions rely on.
@BeforeEach
def setUp(): Unit = {
  super.setUpVocabulary()
}
|
2020-07-10 14:44:50 +02:00
|
|
|
|
2021-06-24 17:20:00 +02:00
|
|
|
/** Exposes a BufferedReader as an Iterator over its text lines.
  *
  * Fix: the previous implementation used `reader.ready` as `hasNext`, but
  * `ready()` only reports whether a read would block — not whether more lines
  * exist. On an empty or exhausted reader it can still return true, in which
  * case `next()` returned null; on a slow stream it could stop early. A
  * one-line look-ahead gives a correct end-of-stream signal.
  */
class BufferedReaderIterator(reader: BufferedReader) extends Iterator[String] {

  // Look-ahead buffer; null once the underlying reader is exhausted.
  private var lookahead: String = reader.readLine()

  override def hasNext() = lookahead != null

  override def next() = {
    val current = lookahead
    lookahead = reader.readLine()
    current
  }
}
|
|
|
|
|
|
|
|
/** Factory that turns a gzip-compressed byte stream into an iterator of text lines. */
object GzFileIterator {

  /** @param is       gzip-compressed input stream
    * @param encoding character encoding of the decompressed text
    */
  def apply(is: InputStream, encoding: String): BufferedReaderIterator = {
    val gunzipped = new GZIPInputStream(is)
    val reader = new BufferedReader(new InputStreamReader(gunzipped, encoding))
    new BufferedReaderIterator(reader)
  }
}
|
|
|
|
|
2021-05-04 14:54:12 +02:00
|
|
|
// Smoke test for the PubMed XML parser over the bundled sample file.
// Fix: the original body only printed the parsed records and therefore
// passed even when the parser produced nothing; we now materialise the
// result and assert that at least one article was parsed.
@Test
def testEBIData() = {
  val inputXML = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    .mkString
  val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
  val articles = new PMParser(xml).toList
  assertTrue(articles.nonEmpty)
  articles.foreach(s => println(mapper.writeValueAsString(s)))
}
|
|
|
|
|
|
|
|
// Converts a newline-delimited dump of serialized PMArticle records to OAF
// entities and checks the expected record count and instance type.
@Test
def testPubmedToOaf(): Unit = {
  // The conversion requires an initialised vocabulary service
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

  val dump: String = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
    .mkString

  // One serialized PMArticle per line -> one OAF entity each
  val converted: List[Oaf] = dump.lines.toList.map { line =>
    val article = mapper.readValue(line, classOf[PMArticle])
    PubMedToOaf.convert(article, vocabularies)
  }
  assertEquals(10, converted.size)

  // At least one instance must carry the "0037" resource-type classid
  val classids = converted
    .map(_.asInstanceOf[Result])
    .flatMap(_.getInstance().asScala)
    .map(_.getInstancetype.getClassid)
  assertTrue(classids.exists(c => "0037".equalsIgnoreCase(c)))

  println(mapper.writeValueAsString(converted.head))
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Asserts the minimal structural invariants of a parsed PMArticle:
  * pmid, title and author list present, each author non-null with a full name.
  */
private def checkPMArticle(article: PMArticle): Unit = {
  assertNotNull(article.getPmid)
  assertNotNull(article.getTitle)
  assertNotNull(article.getAuthors)
  for (author <- article.getAuthors.asScala) {
    assertNotNull(author)
    assertNotNull(author.getFullName)
  }
}
|
|
|
|
|
|
|
|
// Parses the sample PubMed XML and validates every emitted article.
@Test
def testParsingPubmedXML(): Unit = {
  val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
  val events = new XMLEventReader(Source.fromInputStream(stream))
  new PMParser(events).foreach(checkPMArticle)
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Asserts structural invariants of a Publication produced from PubMed data:
  * id/title/authors populated, each instance complete, original ids present,
  * and — when a PMC pid exists — the legacy OpenAIRE original id retained.
  */
private def checkPubmedPublication(o: Oaf): Unit = {
  assertTrue(o.isInstanceOf[Publication])
  val publication = o.asInstanceOf[Publication]

  assertNotNull(publication.getId)
  assertNotNull(publication.getTitle)
  publication.getTitle.asScala.foreach(t => assertNotNull(t.getValue))
  publication.getAuthor.asScala.foreach(a => assertNotNull(a.getFullname))

  assertNotNull(publication.getInstance())
  publication.getInstance().asScala.foreach { instance =>
    assertNotNull(instance.getCollectedfrom)
    assertNotNull(instance.getPid)
    assertNotNull(instance.getInstancetype)
  }

  assertNotNull(publication.getOriginalId)
  publication.getOriginalId.asScala.foreach(originalId => assertNotNull(originalId))

  // Records carrying a PMC pid must also keep the legacy OpenAIRE identifier
  val hasPMC = publication
    .getInstance()
    .asScala
    .exists(i => i.getPid.asScala.exists(pid => pid.getQualifier.getClassid.equalsIgnoreCase(PidType.pmc.toString)))

  if (hasPMC) {
    assertTrue(publication.getOriginalId.asScala.exists(oId => oId.startsWith("od_______267::")))
  }
}
|
2022-07-13 15:27:17 +02:00
|
|
|
|
|
|
|
|
2022-07-14 10:19:59 +02:00
|
|
|
// Verifies that the generated publication id is derived from the pmid only,
// and that setting a PMC id neither changes it nor loses the legacy
// OpenAIRE original identifier.
@Test
def testPubmedOriginalID(): Unit = {
  // Minimal article: pmid + title + one publication type
  val article: PMArticle = new PMArticle
  article.setPmid("1234")
  article.setTitle("a Title")
  article.getPublicationTypes.add(new PMSubject("article", null, null))

  // Conversion must succeed and the id must come from the pmid
  var publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication]
  assertNotNull(publication)
  assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId)

  // Adding a PMC identifier must not change the generated id
  article.setPmcId("PMC1517292")
  publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication]
  assertNotNull(publication)
  assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId)

  // ...but the legacy id generated the old way from the PMC id must survive
  val oldOpenaireID = "od_______267::0000072375bc0e68fa09d4e6b7658248"
  val hasOldOpenAIREID = publication.getOriginalId.asScala.exists(o => o.equalsIgnoreCase(oldOpenaireID))
  assertTrue(hasOldOpenAIREID)
}
|
2022-07-14 10:19:59 +02:00
|
|
|
|
|
|
|
|
2022-07-13 15:27:17 +02:00
|
|
|
// End-to-end mapping test: parse the sample XML, convert every article to a
// Publication, and validate each result structurally.
@Test
def testPubmedMapping(): Unit = {
  val xml = new XMLEventReader(
    Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
  )
  val results = ListBuffer[Oaf]()
  new PMParser(xml).foreach(article => results += PubMedToOaf.convert(article, vocabularies))
  results.foreach(checkPubmedPublication)
}
|
|
|
|
|
|
|
|
// Converts a line-oriented PDB dump to OAF entities and relations.
@Test
def testPDBToOAF(): Unit = {
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

  // One JSON record per line
  val records: String = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
    .mkString
  records.lines.foreach(line => assertTrue(line.nonEmpty))

  val result: List[Oaf] = records.lines.toList.flatMap(line => BioDBToOAF.pdbTOOaf(line))
  assertTrue(result.nonEmpty)
  result.foreach(r => assertNotNull(r))

  println(result.count(_.isInstanceOf[Relation]))
  println(mapper.writeValueAsString(result.head))
}
|
|
|
|
|
2021-06-24 17:20:00 +02:00
|
|
|
// Converts a line-oriented UniProt dump to OAF entities and relations.
@Test
def testUNIprotToOAF(): Unit = {
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

  // One JSON record per line
  val records: String = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
    .mkString
  records.lines.foreach(line => assertTrue(line.nonEmpty))

  val result: List[Oaf] = records.lines.toList.flatMap(line => BioDBToOAF.uniprotToOAF(line))
  assertTrue(result.nonEmpty)
  result.foreach(r => assertNotNull(r))

  println(result.count(_.isInstanceOf[Relation]))
  println(mapper.writeValueAsString(result.head))
}
|
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Flat projection of a single EBI link record as consumed by `parse_ebi_links`:
  * relation name, publication date, target title, source pmid, and the target
  * pid with its scheme.
  */
case class EBILinks(
  relType: String,
  date: String,
  title: String,
  pmid: String,
  targetPid: String,
  targetPidType: String
)
|
2021-06-24 17:20:00 +02:00
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Extracts all EBI links from a JSON payload.
  *
  * The source pmid is read once from `publication.pmid`; each `Link` object is
  * then deconstructed via json4s pattern extraction — only links carrying all
  * expected fields (target, relationship name, date, title, identifier) yield
  * an [[EBILinks]].
  */
def parse_ebi_links(input: String): List[EBILinks] = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json = parse(input)
  // pmid of the source publication, shared by every produced link
  val pmid = (json \ "publication" \ "pmid").extract[String]
  for {
    JObject(linkObj)                              <- json \\ "Link"
    JField("Target", JObject(targetObj))          <- linkObj
    JField("RelationshipType", JObject(relTypeObj)) <- linkObj
    JField("Name", JString(relName))              <- relTypeObj
    JField("PublicationDate", JString(pubDate))   <- linkObj
    JField("Title", JString(targetTitle))         <- targetObj
    JField("Identifier", JObject(identifierObj))  <- targetObj
    JField("IDScheme", JString(scheme))           <- identifierObj
    JField("ID", JString(targetId))               <- identifierObj
  } yield EBILinks(relName, pubDate, targetTitle, pmid, targetId, scheme)
}
|
|
|
|
|
2021-06-29 10:21:23 +02:00
|
|
|
// Converts Crossref link records (one JSON per line) to OAF.
@Test
def testCrossrefLinksToOAF(): Unit = {
  val records: String = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
    .mkString
  records.lines.foreach(line => assertTrue(line.nonEmpty))

  val result: List[Oaf] = records.lines.map(line => BioDBToOAF.crossrefLinksToOaf(line)).toList
  assertNotNull(result)
  assertTrue(result.nonEmpty)

  println(mapper.writeValueAsString(result.head))
}
|
|
|
|
|
2021-06-24 17:20:00 +02:00
|
|
|
// Reads one gzipped EBI-links dump record and converts its links to OAF.
@Test
def testEBILinksToOAF(): Unit = {
  val iterator = GzFileIterator(
    getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"),
    "UTF-8"
  )
  // First line of the dump is a complete record
  val data = iterator.next()

  val res = BioDBToOAF
    .parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
    .filter(BioDBToOAF.EBITargetLinksFilter)
    .flatMap(BioDBToOAF.convertEBILinksToOaf)
  print(res.length)

  println(mapper.writeValueAsString(res.head))
}
|
|
|
|
|
|
|
|
// Deserializes resolved Scholix records (one JSON per line) and converts
// them to OAF entities.
@Test
def scholixResolvedToOAF(): Unit = {
  val records: String = Source
    .fromInputStream(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
    )
    .mkString
  records.lines.foreach(line => assertTrue(line.nonEmpty))

  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

  val l: List[ScholixResolved] = records.lines.map { input =>
    lazy val json = parse(input)
    json.extract[ScholixResolved]
  }.toList

  val result: List[Oaf] = l.map(record => BioDBToOAF.scholixResolvedToOAF(record))
  assertTrue(result.nonEmpty)
}
|
|
|
|
|
2020-07-10 14:44:50 +02:00
|
|
|
}
|