dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala

135 lines
5.3 KiB
Scala
Raw Normal View History

package eu.dnetlib.dhp.sx.bio.pubmed
2021-05-04 14:54:12 +02:00
import scala.xml.MetaData
2020-07-10 14:44:50 +02:00
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
2022-01-11 16:57:48 +01:00
/** Pull-based parser that exposes the `PubmedArticle` elements of an XML event stream as an
  * iterator of [[PMArticle]].
  *
  * The parser always holds one already-parsed article as look-ahead: constructing the parser
  * consumes the stream up to the end of the first `PubmedArticle`, and every call to `next()`
  * consumes it up to the end of the following one.
  *
  * Text is matched against the label of the most recently ''opened'' element only; the label is
  * not restored when a child element closes. As a consequence, text that follows a nested child
  * element (for example inline markup inside `ArticleTitle`) is matched against the child's
  * label and therefore dropped.
  *
  * @param xml source of XML events; it is consumed lazily and must not be shared with other readers
  */
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {

  import scala.util.control.NonFatal

  /** Look-ahead article returned by the next call to `next()`; `null` once the stream is exhausted. */
  var currentArticle: PMArticle = generateNextArticle()

  /** @return `true` while a fully parsed article is waiting to be returned */
  override def hasNext: Boolean = currentArticle != null

  /** Returns the buffered article and parses the following one as the new look-ahead.
    *
    * @return the next article, or `null` when called after the stream has been exhausted
    */
  override def next(): PMArticle = {
    val tmp = currentArticle
    currentArticle = generateNextArticle()
    tmp
  }

  /** Reads the text of the first node of an (unprefixed) attribute.
    *
    * @param attrs attribute list of the element being opened
    * @param key   attribute name
    * @return the attribute text, or `null` when the attribute list is `null`, the attribute is
    *         missing, or it has no value nodes
    */
  def extractAttributes(attrs: MetaData, key: String): String =
    Option(attrs)
      .flatMap(_.get(key))
      .flatMap(nodes => Option(nodes))
      .flatMap(_.headOption)
      .map(_.text)
      .orNull

  /** Formats a date as `year-MM-dd` after checking that all three parts are integers.
    *
    * Only the integer conversion is checked; value ranges (e.g. month 13) are not.
    *
    * @return the formatted date, or `null` when any part is `null` or not an integer
    */
  def validate_Date(year: String, month: String, day: String): String =
    try {
      f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
    } catch {
      // NonFatal lets OutOfMemoryError, InterruptedException, etc. propagate
      case NonFatal(_) => null
    }

  /** Consumes events up to and including the end of the next `PubmedArticle` element.
    *
    * The article is assembled in a local variable, so events that occur outside a
    * `PubmedArticle` are ignored instead of being applied to a previously returned article
    * (or to `null`).
    *
    * @return the parsed article, or `null` when the stream ends before a `PubmedArticle` is closed
    */
  def generateNextArticle(): PMArticle = {

    var article: PMArticle = null   // article under construction; null outside <PubmedArticle>
    var completed: PMArticle = null // set when </PubmedArticle> is reached, ends the loop
    var currentSubject: PMSubject = null
    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currentGrant: PMGrant = null
    var currNode: String = null // label of the most recently opened element
    // Date parts are shared by every date-like element of the article and keep the last value seen.
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    var currentArticleType: String = null

    while (completed == null && xml.hasNext) {
      xml.next() match {

        case EvElemStart(_, label, attrs, _) =>
          currNode = label
          label match {
            case "PubmedArticle"      => article = new PMArticle
            case _ if article == null => // element outside any PubmedArticle: nothing to build
            case "Author"             => currentAuthor = new PMAuthor
            case "Journal"            => currentJournal = new PMJournal
            case "Grant"              => currentGrant = new PMGrant
            case "PublicationType" | "DescriptorName" =>
              currentSubject = new PMSubject
              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
            case _           =>
          }

        case EvElemEnd(_, label) =>
          if (article != null) {
            label match {
              case "PubmedArticle" => completed = article
              case "Author" =>
                if (currentAuthor != null) article.getAuthors.add(currentAuthor)
                // cleared so that LastName/ForeName found outside an Author no longer reach it
                currentAuthor = null
              case "Journal" =>
                article.setJournal(currentJournal)
                currentJournal = null
              case "Grant" =>
                if (currentGrant != null) article.getGrants.add(currentGrant)
                // cleared so that a Country found outside a Grant no longer reaches it
                currentGrant = null
              case "PubMedPubDate" =>
                // first PubMedPubDate wins
                if (article.getDate == null)
                  article.setDate(validate_Date(currentYear, currentMonth, currentDay))
              case "PubDate" =>
                // journal date is stored unvalidated, exactly as read
                if (currentJournal != null)
                  currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
              case "DescriptorName"  => article.getSubjects.add(currentSubject)
              case "PublicationType" => article.getPublicationTypes.add(currentSubject)
              case _                 =>
            }
          }

        case EvText(text) =>
          val value = text.trim
          if (article != null && currNode != null && value.nonEmpty) {
            currNode match {
              case "ArticleTitle" =>
                // text may arrive in several chunks; they are concatenated without separator
                if (article.getTitle == null) article.setTitle(value)
                else article.setTitle(article.getTitle + value)
              case "AbstractText" =>
                if (article.getDescription == null) article.setDescription(value)
                else article.setDescription(article.getDescription + value)
              // NOTE(review): every PMID / ArticleId inside the article overwrites the previous
              // value (last one wins) — confirm this is intended if such elements can also
              // describe referenced works.
              case "PMID" => article.setPmid(value)
              case "ArticleId" =>
                if ("doi".equalsIgnoreCase(currentArticleType)) article.setDoi(value)
                if ("pmc".equalsIgnoreCase(currentArticleType)) article.setPmcId(value)
              case "Language" => article.setLanguage(value)
              case "ISSN"     => if (currentJournal != null) currentJournal.setIssn(value)
              case "GrantID"  => if (currentGrant != null) currentGrant.setGrantID(value)
              case "Agency"   => if (currentGrant != null) currentGrant.setAgency(value)
              case "Country"  => if (currentGrant != null) currentGrant.setCountry(value)
              case "Year"     => currentYear = value
              case "Month"    => currentMonth = value
              case "Day"      => currentDay = value
              case "Volume"   => if (currentJournal != null) currentJournal.setVolume(value)
              case "Issue"    => if (currentJournal != null) currentJournal.setIssue(value)
              case "PublicationType" | "DescriptorName" =>
                if (currentSubject != null) currentSubject.setValue(value)
              case "LastName" => if (currentAuthor != null) currentAuthor.setLastName(value)
              case "ForeName" => if (currentAuthor != null) currentAuthor.setForeName(value)
              case "Title" =>
                if (currentJournal != null) {
                  if (currentJournal.getTitle == null) currentJournal.setTitle(value)
                  else currentJournal.setTitle(currentJournal.getTitle + value)
                }
              case _ =>
            }
          }

        case _ =>
      }
    }
    completed
  }
}