Implemented a new version of the PubMed parser

This commit is contained in:
Sandro La Bruzzo 2023-07-12 10:32:25 +02:00
parent acf947442a
commit f1ae28fe42
2 changed files with 72 additions and 17 deletions

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.sx.bio.pubmed
import javax.xml.stream.{ XMLInputFactory, XMLEventReader, XMLStreamConstants }
import javax.xml.stream.{XMLEventReader, XMLInputFactory, XMLStreamConstants}
import scala.language.postfixOps
import scala.xml.MetaData
//import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
@ -8,6 +9,13 @@ import scala.xml.MetaData
*/
class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
/** StAX event reader built over the constructor-supplied `stream`.
  *
  * NOTE(review): presumably this has to be initialised before `currentArticle`,
  * whose initialiser calls generateNextArticle() -- confirm that method reads `reader`.
  */
private val reader: XMLEventReader = {
  println("INSTANTIATE READER")
  XMLInputFactory.newInstance().createXMLEventReader(stream)
}
// Article currently held by the iterator; populated at construction time by generateNextArticle().
var currentArticle: PMArticle = generateNextArticle()
// More elements are available as long as the held article is not null.
override def hasNext: Boolean = currentArticle != null
@ -18,11 +26,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
tmp
}
/** StAX event reader built over the constructor-supplied `stream`. */
private val reader: XMLEventReader =
  XMLInputFactory.newInstance().createXMLEventReader(stream)
def extractAttributes(attrs: MetaData, key: String): String = {
@ -56,13 +60,63 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
var currentMonth = "01"
var currentDay = "01"
var currentArticleType: String = null
while (reader.hasNext) {
var sb = new StringBuilder()
var insideChar = false
var complete = false
while (reader.hasNext && !complete) {
val next = reader.nextEvent()
if (next.isStartElement) {
if(insideChar) {
if (sb.nonEmpty)
println(s"got data ${sb.toString.trim}")
insideChar = false
}
val name = next.asStartElement().getName.getLocalPart
println(s"Start Element $name")
next.asStartElement().getAttributes.forEachRemaining(e => print(e.toString))
} else if (next.isEndElement) {
if (insideChar) {
if (sb.nonEmpty)
println(s"got data ${sb.toString.trim}")
insideChar = false
}
val name = next.asEndElement().getName.getLocalPart
println(s"End Element $name")
if (name.equalsIgnoreCase("PubmedArticle")) {
complete = true
println("Condizione di uscita")
}
} else if (next.isCharacters) {
if (!insideChar) {
insideChar = true
sb.clear()
}
val d = next.asCharacters().getData
if (d.trim.nonEmpty)
sb.append(d.trim)
}
// next match {
// case _ if (next.isStartElement) =>
// val name = next.asStartElement().getName.getLocalPart
// println(s"Start Element $name")
// case _ if (next.isEndElement) =>
// val name = next.asEndElement().getName.getLocalPart
// println(s"End Element $name")
// case _ if (next.isCharacters) =>
// val c = next.asCharacters()
// val data = c.getData
// println(s"Text value $data")
//
// }
//
//
// reader.next match {
//

View File

@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.utils.PidType
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMSubject, PubMedToOaf}
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
@ -14,7 +14,7 @@ import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.io.{BufferedReader, FileInputStream, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
import scala.collection.JavaConverters._
import scala.io.Source
@ -46,14 +46,15 @@ class BioScholixTest extends AbstractVocabularyTest {
}
}
// @Test
// def testEBIData() = {
// val inputXML = Source
// .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
// .mkString
// val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
// new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
// }
/** Smoke test: instantiates [[PMParser]] over the PubMed sample bundled with the test resources.
  *
  * The sample is read from the classpath rather than from a developer-local file,
  * so the test can run on any machine. The stream is closed once the parser has
  * been constructed.
  */
@Test
def testEBIData() = {
  val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
  // getResourceAsStream returns null when the resource is missing: fail with a clear message
  // instead of a NullPointerException raised from inside the XML reader.
  org.junit.jupiter.api.Assertions.assertNotNull(inputXML, "pubmed.xml test resource not found")
  try {
    new PMParser(inputXML)
  } finally {
    inputXML.close()
  }
  print("DONE")
}
@Test
def testPubmedToOaf(): Unit = {