used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader

This commit is contained in:
Sandro La Bruzzo 2023-09-18 13:58:22 +02:00
parent 8064abf86c
commit e21b65cabb
3 changed files with 21 additions and 19 deletions

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
@ -14,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.io.InputStream import java.io.{ByteArrayInputStream, InputStream}
import scala.io.Source import java.nio.charset.Charset
import scala.xml.pull.XMLEventReader import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
@ -83,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} else } else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch { } catch {
case e: Throwable => case e: Throwable =>
println(s"Error on requesting ${r.getURI}") println(s"Error on requesting ${r.getURI}")
@ -155,7 +155,7 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString( IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream( SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
) ),Charset.defaultCharset()
) )
) )
parser.parseArgument(args) parser.parseArgument(args)
@ -194,10 +194,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) { if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset( val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz")) k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => { .flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
}) })
) )

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} import javax.xml.stream.XMLEventReader
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml /** @param xml
*/ */

View File

@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer import scala.collection.mutable.ListBuffer
import scala.io.Source import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputXML = Source val inputFactory = XMLInputFactory.newInstance
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
} }
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testParsingPubmedXML(): Unit = { def testParsingPubmedXML(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml) val parser = new PMParser(xml)
parser.foreach(checkPMArticle) parser.foreach(checkPMArticle)
} }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testPubmedMapping(): Unit = { def testPubmedMapping(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val parser = new PMParser(xml) val parser = new PMParser(xml)
val results = ListBuffer[Oaf]() val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))