Implemented a new version of the PubMed parser

This commit is contained in:
Sandro La Bruzzo 2023-07-12 10:32:25 +02:00
parent acf947442a
commit f1ae28fe42
2 changed files with 72 additions and 17 deletions

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.sx.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import javax.xml.stream.{ XMLInputFactory, XMLEventReader, XMLStreamConstants } import javax.xml.stream.{XMLEventReader, XMLInputFactory, XMLStreamConstants}
import scala.language.postfixOps
import scala.xml.MetaData import scala.xml.MetaData
//import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} //import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
@ -8,6 +9,13 @@ import scala.xml.MetaData
*/ */
class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] { class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
private val reader: XMLEventReader = {
println("INSTANTIATE READER")
val factory = XMLInputFactory.newInstance()
factory.createXMLEventReader(stream)
}
var currentArticle: PMArticle = generateNextArticle() var currentArticle: PMArticle = generateNextArticle()
override def hasNext: Boolean = currentArticle != null override def hasNext: Boolean = currentArticle != null
@ -18,11 +26,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
tmp tmp
} }
private val reader: XMLEventReader = {
val factory = XMLInputFactory.newInstance()
factory.createXMLEventReader(stream)
}
def extractAttributes(attrs: MetaData, key: String): String = { def extractAttributes(attrs: MetaData, key: String): String = {
@ -56,13 +60,63 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
var currentMonth = "01" var currentMonth = "01"
var currentDay = "01" var currentDay = "01"
var currentArticleType: String = null var currentArticleType: String = null
var sb = new StringBuilder()
while (reader.hasNext) { var insideChar = false
var complete = false
while (reader.hasNext && !complete) {
val next = reader.nextEvent() val next = reader.nextEvent()
if (next.isStartElement) {
if(insideChar) {
if (sb.nonEmpty)
println(s"got data ${sb.toString.trim}")
insideChar = false
}
val name = next.asStartElement().getName.getLocalPart
println(s"Start Element $name")
next.asStartElement().getAttributes.forEachRemaining(e => print(e.toString))
} else if (next.isEndElement) {
if (insideChar) {
if (sb.nonEmpty)
println(s"got data ${sb.toString.trim}")
insideChar = false
}
val name = next.asEndElement().getName.getLocalPart
println(s"End Element $name")
if (name.equalsIgnoreCase("PubmedArticle")) {
complete = true
println("Condizione di uscita")
}
} else if (next.isCharacters) {
if (!insideChar) {
insideChar = true
sb.clear()
}
val d = next.asCharacters().getData
if (d.trim.nonEmpty)
sb.append(d.trim)
}
// next match {
// case _ if (next.isStartElement) =>
// val name = next.asStartElement().getName.getLocalPart
// println(s"Start Element $name")
// case _ if (next.isEndElement) =>
// val name = next.asStartElement().getName.getLocalPart
// println(s"End Element $name")
// case _ if (next.isCharacters) =>
// val c = next.asCharacters()
// val data = c.getData
// println(s"Text value $data")
// //
// }
//
// //
// reader.next match { // reader.next match {
// //

View File

@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.utils.PidType import eu.dnetlib.dhp.schema.oaf.utils.PidType
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMSubject, PubMedToOaf} import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -14,7 +14,7 @@ import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, FileInputStream, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
@ -46,14 +46,15 @@ class BioScholixTest extends AbstractVocabularyTest {
} }
} }
// @Test @Test
// def testEBIData() = { def testEBIData() = {
// val inputXML = Source val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
// .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
// .mkString
// val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) // new PubmedParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
// new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) new PMParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz")))
// } print("DONE")
}
@Test @Test
def testPubmedToOaf(): Unit = { def testPubmedToOaf(): Unit = {