1
0
Fork 0

used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader

This commit is contained in:
Sandro La Bruzzo 2023-09-18 13:58:22 +02:00 committed by Giambattista Bloisi
parent 8c3e9a09d3
commit 52495f2cd2
3 changed files with 21 additions and 19 deletions

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
@ -14,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory}
import java.io.InputStream
import scala.io.Source
import scala.xml.pull.XMLEventReader
import java.io.{ByteArrayInputStream, InputStream}
import java.nio.charset.Charset
import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame {
@ -83,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
} else
return IOUtils.toString(response.getEntity.getContent)
return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch {
case e: Throwable =>
println(s"Error on requesting ${r.getURI}")
@ -155,7 +155,7 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
)
),Charset.defaultCharset()
)
)
parser.parseArgument(args)
@ -194,10 +194,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml)
})
)

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
import javax.xml.stream.XMLEventReader
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml
*/

View File

@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testEBIData() = {
val inputXML = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
val inputFactory = XMLInputFactory.newInstance
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testParsingPubmedXML(): Unit = {
val xml = new XMLEventReader(
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val inputFactory = XMLInputFactory.newInstance
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml)
parser.foreach(checkPMArticle)
}
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testPubmedMapping(): Unit = {
val xml = new XMLEventReader(
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val inputFactory = XMLInputFactory.newInstance
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml)
val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))