From f1ae28fe426d25059adc69a972e4e3b2bd949a1a Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 12 Jul 2023 10:32:25 +0200 Subject: [PATCH] implemented new version of pubmed parser --- .../dnetlib/dhp/sx/bio/pubmed/PMParser.scala | 68 +++++++++++++++++-- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 21 +++--- 2 files changed, 72 insertions(+), 17 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index a92aa0486..c97b8ead2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.sx.bio.pubmed -import javax.xml.stream.{ XMLInputFactory, XMLEventReader, XMLStreamConstants } +import javax.xml.stream.{XMLEventReader, XMLInputFactory, XMLStreamConstants} +import scala.language.postfixOps import scala.xml.MetaData //import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} @@ -8,6 +9,13 @@ import scala.xml.MetaData */ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] { + private val reader: XMLEventReader = { + println("INSTANTIATE READER") + val factory = XMLInputFactory.newInstance() + factory.createXMLEventReader(stream) + + } + var currentArticle: PMArticle = generateNextArticle() override def hasNext: Boolean = currentArticle != null @@ -18,11 +26,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] { tmp } - private val reader: XMLEventReader = { - val factory = XMLInputFactory.newInstance() - factory.createXMLEventReader(stream) - } def extractAttributes(attrs: MetaData, key: String): String = { @@ -56,13 +60,63 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] { var currentMonth = "01" var currentDay = "01" var currentArticleType: String = null - - while (reader.hasNext) { + var sb = new StringBuilder() + var insideChar = false + var complete = false + while (reader.hasNext && !complete) { val next = reader.nextEvent() + if (next.isStartElement) { + if(insideChar) { + if (sb.nonEmpty) + println(s"got data ${sb.toString.trim}") + insideChar = false + } + val name = next.asStartElement().getName.getLocalPart + println(s"Start Element $name") + next.asStartElement().getAttributes.forEachRemaining(e => print(e.toString)) + + } else if (next.isEndElement) { + if (insideChar) { + if (sb.nonEmpty) + println(s"got data ${sb.toString.trim}") + insideChar = false + } + val name = next.asEndElement().getName.getLocalPart + println(s"End Element $name") + if (name.equalsIgnoreCase("PubmedArticle")) { + complete = true + println("Condizione di uscita") + } + } else if (next.isCharacters) { + if (!insideChar) { + insideChar = true + sb.clear() + } + val d = next.asCharacters().getData + if (d.trim.nonEmpty) + sb.append(d.trim) + } + + + +// next match { +// case _ if (next.isStartElement) => +// val name = next.asStartElement().getName.getLocalPart +// println(s"Start Element $name") +// case _ if (next.isEndElement) => +// val name = next.asStartElement().getName.getLocalPart +// println(s"End Element $name") +// case _ if (next.isCharacters) => +// val c = next.asCharacters() +// val data = c.getData +// println(s"Text value $data") // +// } + + // // // reader.next match { // diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 5ca97de19..b537ba797 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.schema.oaf.utils.PidType import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMSubject, PubMedToOaf} +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser} import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse @@ -14,7 +14,7 @@ import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.{BeforeEach, Test} import org.mockito.junit.jupiter.MockitoExtension -import java.io.{BufferedReader, InputStream, InputStreamReader} +import java.io.{BufferedReader, FileInputStream, InputStream, InputStreamReader} import java.util.zip.GZIPInputStream import scala.collection.JavaConverters._ import scala.io.Source @@ -46,14 +46,15 @@ class BioScholixTest extends AbstractVocabularyTest { } } -// @Test -// def testEBIData() = { -// val inputXML = Source -// .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) -// .mkString -// val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) -// new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) -// } + @Test + def testEBIData() = { + val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml") + + + // new PubmedParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz"))) + new PMParser(new GZIPInputStream(new FileInputStream("/Users/sandro/Downloads/pubmed23n1078.xml.gz"))) + print("DONE") + } @Test def testPubmedToOaf(): Unit = {