forked from D-Net/dnet-hadoop
implemented new version of pubmed parser
This commit is contained in:
parent
acf947442a
commit
f1ae28fe42
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import javax.xml.stream.{ XMLInputFactory, XMLEventReader, XMLStreamConstants }
|
||||
import javax.xml.stream.{XMLEventReader, XMLInputFactory, XMLStreamConstants}
|
||||
import scala.language.postfixOps
|
||||
import scala.xml.MetaData
|
||||
//import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||
|
||||
|
@ -8,6 +9,13 @@ import scala.xml.MetaData
|
|||
*/
|
||||
class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||
|
||||
/** StAX event reader built over the constructor's `stream`.
  *
  * A marker line is printed to stdout when the reader is created.
  * NOTE(review): presumably this must stay declared before `currentArticle`,
  * whose initializer runs during construction and appears to pull events
  * from this reader — confirm before reordering members.
  */
private val reader: XMLEventReader = {
  println("INSTANTIATE READER")
  XMLInputFactory.newInstance().createXMLEventReader(stream)
}
|
||||
|
||||
// Look-ahead slot: filled eagerly at construction time by generateNextArticle();
// hasNext reports true for as long as this is non-null.
var currentArticle: PMArticle = generateNextArticle()
|
||||
|
||||
/** True while a pre-fetched article is waiting in `currentArticle` (i.e. it is not null). */
override def hasNext: Boolean = Option(currentArticle).nonEmpty
|
||||
|
@ -18,11 +26,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
|||
tmp
|
||||
}
|
||||
|
||||
// StAX event reader created from the constructor's `stream`.
// NOTE(review): a `reader` with the same name and type is also declared near the top of
// the class (that one additionally prints a marker line). This copy looks like the
// pre-change side of the diff — confirm that only one declaration remains in the real file.
private val reader: XMLEventReader = {
|
||||
val factory = XMLInputFactory.newInstance()
|
||||
factory.createXMLEventReader(stream)
|
||||
|
||||
}
|
||||
|
||||
def extractAttributes(attrs: MetaData, key: String): String = {
|
||||
|
||||
|
@ -56,13 +60,63 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
|||
var currentMonth = "01"
|
||||
var currentDay = "01"
|
||||
var currentArticleType: String = null
|
||||
|
||||
while (reader.hasNext) {
|
||||
var sb = new StringBuilder()
|
||||
var insideChar = false
|
||||
var complete = false
|
||||
while (reader.hasNext && !complete) {
|
||||
|
||||
val next = reader.nextEvent()
|
||||
|
||||
if (next.isStartElement) {
|
||||
if(insideChar) {
|
||||
if (sb.nonEmpty)
|
||||
println(s"got data ${sb.toString.trim}")
|
||||
insideChar = false
|
||||
}
|
||||
val name = next.asStartElement().getName.getLocalPart
|
||||
println(s"Start Element $name")
|
||||
next.asStartElement().getAttributes.forEachRemaining(e => print(e.toString))
|
||||
|
||||
} else if (next.isEndElement) {
|
||||
if (insideChar) {
|
||||
if (sb.nonEmpty)
|
||||
println(s"got data ${sb.toString.trim}")
|
||||
insideChar = false
|
||||
}
|
||||
val name = next.asEndElement().getName.getLocalPart
|
||||
println(s"End Element $name")
|
||||
if (name.equalsIgnoreCase("PubmedArticle")) {
|
||||
complete = true
|
||||
println("Condizione di uscita")
|
||||
}
|
||||
|
||||
} else if (next.isCharacters) {
|
||||
if (!insideChar) {
|
||||
insideChar = true
|
||||
sb.clear()
|
||||
}
|
||||
val d = next.asCharacters().getData
|
||||
if (d.trim.nonEmpty)
|
||||
sb.append(d.trim)
|
||||
}
|
||||
|
||||
|
||||
|
||||
// next match {
|
||||
// case _ if (next.isStartElement) =>
|
||||
// val name = next.asStartElement().getName.getLocalPart
|
||||
// println(s"Start Element $name")
|
||||
// case _ if (next.isEndElement) =>
|
||||
// val name = next.asStartElement().getName.getLocalPart
|
||||
// println(s"End Element $name")
|
||||
// case _ if (next.isCharacters) =>
|
||||
// val c = next.asCharacters()
|
||||
// val data = c.getData
|
||||
// println(s"Text value $data")
|
||||
//
|
||||
// }
|
||||
|
||||
//
|
||||
//
|
||||
// reader.next match {
|
||||
//
|
||||
|
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
|||
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMSubject, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
@ -14,7 +14,7 @@ import org.junit.jupiter.api.extension.ExtendWith
|
|||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
|
||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
||||
import java.io.{BufferedReader, FileInputStream, InputStream, InputStreamReader}
|
||||
import java.util.zip.GZIPInputStream
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
@ -46,14 +46,15 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
}
|
||||
}
|
||||
|
||||
// @Test
|
||||
// def testEBIData() = {
|
||||
// val inputXML = Source
|
||||
// .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
||||
// .mkString
|
||||
// val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
||||
// new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
|
||||
// }
|
||||
/** Smoke test: builds a [[PMParser]] over the PubMed sample bundled on the test classpath.
  *
  * The previous version read a gzip file from an absolute path on one developer's machine,
  * which made the test fail everywhere else, and left the classpath resource unused and unclosed.
  * The test now runs against the bundled resource and always releases the stream.
  */
@Test
def testEBIData(): Unit = {
  val inputXML = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
  // getResourceAsStream returns null when the resource is missing: fail with a clear message
  // instead of a NullPointerException inside the XML factory.
  org.junit.jupiter.api.Assertions.assertNotNull(
    inputXML,
    "test resource /eu/dnetlib/dhp/sx/graph/bio/pubmed.xml not found on the classpath"
  )
  try {
    // Constructing the parser already reads the first article (currentArticle is initialised eagerly).
    new PMParser(inputXML)
    print("DONE")
  } finally {
    inputXML.close()
  }
}
|
||||
|
||||
@Test
|
||||
def testPubmedToOaf(): Unit = {
|
||||
|
|
Loading…
Reference in New Issue