forked from D-Net/dnet-hadoop
implemented new version of pubmed parser
This commit is contained in:
parent
acf947442a
commit
f1ae28fe42
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.pubmed
|
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||||
|
|
||||||
import javax.xml.stream.{ XMLInputFactory, XMLEventReader, XMLStreamConstants }
|
import javax.xml.stream.{XMLEventReader, XMLInputFactory, XMLStreamConstants}
|
||||||
|
import scala.language.postfixOps
|
||||||
import scala.xml.MetaData
|
import scala.xml.MetaData
|
||||||
//import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
//import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||||
|
|
||||||
|
@ -8,6 +9,13 @@ import scala.xml.MetaData
|
||||||
*/
|
*/
|
||||||
class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||||
|
|
||||||
|
/** StAX event reader over the XML `stream` handed to the constructor.
  *
  * Declared above `currentArticle` on purpose: Scala initializes class members
  * in declaration order, and `currentArticle` is initialized by calling
  * `generateNextArticle()`, which presumably pulls events from this reader
  * (NOTE(review): confirm against the full method body).
  */
private val reader: XMLEventReader = {
  // No debug print here: constructing the parser must not write to stdout.
  val factory = XMLInputFactory.newInstance()
  factory.createXMLEventReader(stream)
}
|
||||||
|
|
||||||
var currentArticle: PMArticle = generateNextArticle()
|
var currentArticle: PMArticle = generateNextArticle()
|
||||||
|
|
||||||
override def hasNext: Boolean = currentArticle != null
|
override def hasNext: Boolean = currentArticle != null
|
||||||
|
@ -18,11 +26,7 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||||
tmp
|
tmp
|
||||||
}
|
}
|
||||||
|
|
||||||
private val reader: XMLEventReader = {
|
|
||||||
val factory = XMLInputFactory.newInstance()
|
|
||||||
factory.createXMLEventReader(stream)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
def extractAttributes(attrs: MetaData, key: String): String = {
|
def extractAttributes(attrs: MetaData, key: String): String = {
|
||||||
|
|
||||||
|
@ -56,13 +60,63 @@ class PMParser(stream: java.io.InputStream) extends Iterator[PMArticle] {
|
||||||
var currentMonth = "01"
|
var currentMonth = "01"
|
||||||
var currentDay = "01"
|
var currentDay = "01"
|
||||||
var currentArticleType: String = null
|
var currentArticleType: String = null
|
||||||
|
var sb = new StringBuilder()
|
||||||
while (reader.hasNext) {
|
var insideChar = false
|
||||||
|
var complete = false
|
||||||
|
while (reader.hasNext && !complete) {
|
||||||
|
|
||||||
val next = reader.nextEvent()
|
val next = reader.nextEvent()
|
||||||
|
|
||||||
|
if (next.isStartElement) {
|
||||||
|
if(insideChar) {
|
||||||
|
if (sb.nonEmpty)
|
||||||
|
println(s"got data ${sb.toString.trim}")
|
||||||
|
insideChar = false
|
||||||
|
}
|
||||||
|
val name = next.asStartElement().getName.getLocalPart
|
||||||
|
println(s"Start Element $name")
|
||||||
|
next.asStartElement().getAttributes.forEachRemaining(e => print(e.toString))
|
||||||
|
|
||||||
|
} else if (next.isEndElement) {
|
||||||
|
if (insideChar) {
|
||||||
|
if (sb.nonEmpty)
|
||||||
|
println(s"got data ${sb.toString.trim}")
|
||||||
|
insideChar = false
|
||||||
|
}
|
||||||
|
val name = next.asEndElement().getName.getLocalPart
|
||||||
|
println(s"End Element $name")
|
||||||
|
if (name.equalsIgnoreCase("PubmedArticle")) {
|
||||||
|
complete = true
|
||||||
|
println("Condizione di uscita")
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if (next.isCharacters) {
|
||||||
|
if (!insideChar) {
|
||||||
|
insideChar = true
|
||||||
|
sb.clear()
|
||||||
|
}
|
||||||
|
val d = next.asCharacters().getData
|
||||||
|
if (d.trim.nonEmpty)
|
||||||
|
sb.append(d.trim)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// next match {
|
||||||
|
// case _ if (next.isStartElement) =>
|
||||||
|
// val name = next.asStartElement().getName.getLocalPart
|
||||||
|
// println(s"Start Element $name")
|
||||||
|
// case _ if (next.isEndElement) =>
|
||||||
|
// val name = next.asStartElement().getName.getLocalPart
|
||||||
|
// println(s"End Element $name")
|
||||||
|
// case _ if (next.isCharacters) =>
|
||||||
|
// val c = next.asCharacters()
|
||||||
|
// val data = c.getData
|
||||||
|
// println(s"Text value $data")
|
||||||
//
|
//
|
||||||
|
// }
|
||||||
|
|
||||||
|
//
|
||||||
//
|
//
|
||||||
// reader.next match {
|
// reader.next match {
|
||||||
//
|
//
|
||||||
|
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMSubject, PubMedToOaf}
|
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf, PubmedParser}
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
@ -14,7 +14,7 @@ import org.junit.jupiter.api.extension.ExtendWith
|
||||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||||
import org.mockito.junit.jupiter.MockitoExtension
|
import org.mockito.junit.jupiter.MockitoExtension
|
||||||
|
|
||||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
import java.io.{BufferedReader, FileInputStream, InputStream, InputStreamReader}
|
||||||
import java.util.zip.GZIPInputStream
|
import java.util.zip.GZIPInputStream
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
@ -46,14 +46,15 @@ class BioScholixTest extends AbstractVocabularyTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
/** Smoke test: builds a [[PMParser]] over the PubMed fixture shipped with the
  * test resources. Constructing the parser already triggers parsing, because
  * the parser initializes its first article eagerly.
  *
  * The fixture is read from the classpath rather than from a developer-local
  * file, so the test runs on any machine.
  */
@Test
def testEBIData(): Unit = {
  val resourcePath = "/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"
  val inputXML = getClass.getResourceAsStream(resourcePath)
  // getResourceAsStream returns null when the resource is missing; fail with a clear message.
  assert(inputXML != null, s"test resource $resourcePath not found on the classpath")
  // NOTE(review): the fixture is assumed to be plain (non-gzipped) XML — confirm.
  try new PMParser(inputXML)
  finally inputXML.close()
  print("DONE")
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testPubmedToOaf(): Unit = {
|
def testPubmedToOaf(): Unit = {
|
||||||
|
|
Loading…
Reference in New Issue