Organized getters and setters in the PMArticle class for better readability and maintainability.

This commit is contained in:
Sandro La Bruzzo 2024-11-04 17:45:28 +01:00
parent a42c8b7c85
commit a8ed5a3b04
8 changed files with 754 additions and 299 deletions

View File

@ -26,16 +26,16 @@
<dependencies> <dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>--> <!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>--> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>--> <!-- <artifactId>dhp-actionmanager</artifactId>-->
<!-- <version>${project.version}</version>--> <!-- <version>${project.version}</version>-->
<!-- </dependency>--> <!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-aggregation</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>--> <!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>--> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>--> <!-- <artifactId>dhp-blacklist</artifactId>-->
@ -56,61 +56,61 @@
<!-- <artifactId>dhp-enrichment</artifactId>--> <!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>--> <!-- <version>${project.version}</version>-->
<!-- </dependency>--> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-graph-mapper</artifactId> <!-- <artifactId>dhp-graph-mapper</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-graph-provision</artifactId> <!-- <artifactId>dhp-graph-provision</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-impact-indicators</artifactId> <!-- <artifactId>dhp-impact-indicators</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-stats-actionsets</artifactId> <!-- <artifactId>dhp-stats-actionsets</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-stats-hist-snaps</artifactId> <!-- <artifactId>dhp-stats-hist-snaps</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-stats-monitor-irish</artifactId> <!-- <artifactId>dhp-stats-monitor-irish</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-stats-promote</artifactId> <!-- <artifactId>dhp-stats-promote</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-stats-update</artifactId> <!-- <artifactId>dhp-stats-update</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-swh</artifactId> <!-- <artifactId>dhp-swh</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-usage-raw-data-update</artifactId> <!-- <artifactId>dhp-usage-raw-data-update</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
<dependency> <!-- <dependency>-->
<groupId>eu.dnetlib.dhp</groupId> <!-- <groupId>eu.dnetlib.dhp</groupId>-->
<artifactId>dhp-usage-stats-build</artifactId> <!-- <artifactId>dhp-usage-stats-build</artifactId>-->
<version>${project.version}</version> <!-- <version>${project.version}</version>-->
</dependency> <!-- </dependency>-->
</dependencies> </dependencies>

View File

@ -15,6 +15,7 @@ import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@ -29,7 +30,6 @@ import org.apache.spark.sql.Dataset;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.spark_project.jetty.util.StringUtil;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -206,7 +206,7 @@ public class ExtractPerson implements Serializable {
null); null);
relation.setValidated(true); relation.setValidated(true);
if (StringUtil.isNotBlank(role)) { if (StringUtils.isNotBlank(role)) {
KeyValue kv = new KeyValue(); KeyValue kv = new KeyValue();
kv.setKey("role"); kv.setKey("role");
kv.setValue(role); kv.setValue(role);
@ -439,13 +439,13 @@ public class ExtractPerson implements Serializable {
null); null);
relation.setValidated(true); relation.setValidated(true);
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) { if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
KeyValue kv = new KeyValue(); KeyValue kv = new KeyValue();
kv.setKey("startDate"); kv.setKey("startDate");
kv.setValue(row.getStartDate()); kv.setValue(row.getStartDate());
properties.add(kv); properties.add(kv);
} }
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) { if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
KeyValue kv = new KeyValue(); KeyValue kv = new KeyValue();
kv.setKey("endDate"); kv.setKey("endDate");
kv.setValue(row.getEndDate()); kv.setValue(row.getEndDate());

View File

@ -8,259 +8,115 @@ import java.util.List;
/** /**
* This class represent an instance of Pubmed Article extracted from the native XML * This class represent an instance of Pubmed Article extracted from the native XML
* *
* @author Sandro La Bruzzo
*/ */
public class PMArticle implements Serializable { public class PMArticle implements Serializable {
/**
* the Pubmed Identifier
*/
private String pmid; private String pmid;
private String pmcId; private String pmcId;
/**
* the DOI
*/
private String doi; private String doi;
/**
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
*/
private String date; private String date;
/**
* This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
*/
private PMJournal journal; private PMJournal journal;
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
*/
private String title; private String title;
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*/
private String description; private String description;
/**
* the language in which an article was published is recorded in <Language>.
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
*/
private String language; private String language;
private List<PMSubject> subjects;
/** private List<PMSubject> publicationTypes = new ArrayList<>();
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. *
*/
private final List<PMSubject> subjects = new ArrayList<>();
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*/
private final List<PMSubject> publicationTypes = new ArrayList<>();
/**
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
*/
private List<PMAuthor> authors = new ArrayList<>(); private List<PMAuthor> authors = new ArrayList<>();
private List<PMGrant> grants = new ArrayList<>();
/**
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
*/
private final List<PMGrant> grants = new ArrayList<>();
/**
* get the DOI
* @return a DOI
*/
public String getDoi() {
return doi;
}
/**
* Set the DOI
* @param doi a DOI
*/
public void setDoi(String doi) {
this.doi = doi;
}
/**
* get the Pubmed Identifier
* @return the PMID
*/
public String getPmid() { public String getPmid() {
return pmid; return pmid;
} }
/**
* set the Pubmed Identifier
* @param pmid the Pubmed Identifier
*/
public void setPmid(String pmid) { public void setPmid(String pmid) {
this.pmid = pmid; this.pmid = pmid;
} }
/**
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
*
* @return the Pubmed Date
*/
public String getDate() {
return date;
}
/**
* Set the pubmed Date
* @param date
*/
public void setDate(String date) {
this.date = date;
}
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
*
* @return the pubmed Journal Extracted
*/
public PMJournal getJournal() {
return journal;
}
/**
* Set the mapped pubmed Journal
* @param journal
*/
public void setJournal(PMJournal journal) {
this.journal = journal;
}
/**
* <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
* those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
* All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
* Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
*
* @return the extracted pubmed Title
*/
public String getTitle() {
return title;
}
/**
* set the pubmed title
* @param title
*/
public void setTitle(String title) {
this.title = title;
}
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*
* @return the Mapped Pubmed Article Abstracts
*/
public String getDescription() {
return description;
}
/**
* Set the Mapped Pubmed Article Abstracts
* @param description
*/
public void setDescription(String description) {
this.description = description;
}
/**
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
*
* @return get the Mapped Authors lists
*/
public List<PMAuthor> getAuthors() {
return authors;
}
/**
* Set the Mapped Authors lists
* @param authors
*/
public void setAuthors(List<PMAuthor> authors) {
this.authors = authors;
}
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*
* @return the mapped Subjects
*/
public List<PMSubject> getSubjects() {
return subjects;
}
/**
*
* the language in which an article was published is recorded in <Language>.
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
*
* @return The mapped Language
*/
public String getLanguage() {
return language;
}
/**
*
* Set The mapped Language
*
* @param language the mapped Language
*/
public void setLanguage(String language) {
this.language = language;
}
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*
* @return the mapped Publication Type
*/
public List<PMSubject> getPublicationTypes() {
return publicationTypes;
}
/**
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
* @return the mapped grants
*/
public List<PMGrant> getGrants() {
return grants;
}
public String getPmcId() { public String getPmcId() {
return pmcId; return pmcId;
} }
public PMArticle setPmcId(String pmcId) { public void setPmcId(String pmcId) {
this.pmcId = pmcId; this.pmcId = pmcId;
return this; }
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public PMJournal getJournal() {
return journal;
}
public void setJournal(PMJournal journal) {
this.journal = journal;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public List<PMSubject> getSubjects() {
return subjects;
}
public void setSubjects(List<PMSubject> subjects) {
this.subjects = subjects;
}
public List<PMSubject> getPublicationTypes() {
return publicationTypes;
}
public void setPublicationTypes(List<PMSubject> publicationTypes) {
this.publicationTypes = publicationTypes;
}
public List<PMAuthor> getAuthors() {
return authors;
}
public void setAuthors(List<PMAuthor> authors) {
this.authors = authors;
}
public List<PMGrant> getGrants() {
return grants;
}
public void setGrants(List<PMGrant> grants) {
this.grants = grants;
} }
} }

View File

@ -1,8 +1,7 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true}, {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true},
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the mdstore path to save", "paramRequired": true}
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
] ]

View File

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.sx.bio.ebi
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import java.io.ByteArrayInputStream
import javax.xml.stream.XMLInputFactory
/** Spark application that reads a PubMed XML baseline, parses every `<PubmedArticle>` record
  * and writes the converted records as gzip-compressed JSON text.
  *
  * @param propertyPath path of the JSON resource describing the accepted arguments
  * @param args         raw command-line arguments
  * @param log          logger used for progress messages
  */
class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Entry point of the Spark job.
    *
    * Reads the `isLookupUrl`, `sourcePath` and `targetPath` arguments, loads the vocabularies
    * from the lookup service and delegates to [[createPubmedDump]]. `parser` and `spark` are
    * not defined in this class; they are presumably inherited from the superclass.
    */
  override def run(): Unit = {
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)
    val sourcePath = parser.get("sourcePath")
    log.info(s"SourcePath is '$sourcePath'")
    val targetPath = parser.get("targetPath")
    log.info(s"TargetPath is '$targetPath'")

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)

    createPubmedDump(spark, sourcePath, targetPath, vocabularies)
  }

  /** Converts the PubMed XML found under `sourcePath` into JSON lines under `targetPath`.
    *
    * @param spark        the active Spark session; must not be null
    * @param sourcePath   location of the PubMed XML text to read
    * @param targetPath   location where the gzip-compressed output is written (overwritten)
    * @param vocabularies vocabularies handed to `PubMedToOaf.convert`
    * @throws RuntimeException if a record cannot be parsed; the parser failure is attached
    *                          as the cause
    */
  def createPubmedDump(
    spark: SparkSession,
    sourcePath: String,
    targetPath: String,
    vocabularies: VocabularyGroup
  ): Unit = {
    require(spark != null)

    implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle])

    import spark.implicits._
    // The closing tag is used as record separator, so each row holds at most one article
    // whose closing tag has been stripped by the reader.
    val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath)
    val mapper = new ObjectMapper()
    df.as[String]
      .map(s => {
        // Drop whatever precedes the opening tag and restore the closing tag removed by the
        // split; chunks without an opening tag are marked with null and discarded below.
        val id = s.indexOf("<PubmedArticle>")
        if (id >= 0) s"${s.substring(id)}</PubmedArticle>" else null
      })
      .filter(s => s != null)
      .map { i =>
        try {
          new PMParser2().parse(i)
        } catch {
          // Keep the original failure as cause so the real parsing error stays visible.
          case e: Exception =>
            throw new RuntimeException(s"Error parsing article: $i", e)
        }
      }
      .dropDuplicates("pmid")
      .map { a =>
        // Articles for which the conversion yields null are marked with null and filtered out.
        val oaf = PubMedToOaf.convert(a, vocabularies)
        if (oaf != null)
          mapper.writeValueAsString(oaf)
        else
          null
      }
      .as[String]
      .filter(s => s != null)
      .write
      .option("compression", "gzip")
      .mode("overwrite")
      .text(targetPath)
  }
}
/** Launcher for the [[SparkCreatePubmedDump]] application. */
object SparkCreatePubmedDump {

  /** Creates the application with its argument-definition resource, initializes it and runs it.
    *
    * @param args the command-line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val application =
      new SparkCreatePubmedDump("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json", args, logger)
    application.initialize().run()
  }
}

View File

@ -0,0 +1,264 @@
package eu.dnetlib.dhp.sx.bio.pubmed
import org.apache.commons.lang3.StringUtils
import javax.xml.stream.XMLEventReader
import scala.collection.JavaConverters._
import scala.xml.{MetaData, NodeSeq}
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
class PMParser2 {
/** Looks up one attribute in the attribute list of an element.
  *
  * @param attrs the attributes of the element
  * @param key   the name of the attribute to read
  * @return the text of the first node of the attribute value, or null when the attribute is
  *         missing or its value sequence is null or empty
  */
private def extractAttributes(attrs: MetaData, key: String): String =
  attrs.get(key) match {
    case Some(nodes) if nodes != null && nodes.nonEmpty => nodes.head.text
    case _                                              => null
  }
/** Validates and formats a date given the year, month, and day as strings.
  *
  * Every component must parse as an integer and the three together must denote a real
  * calendar date, so inputs such as month 13 or 30 February are rejected instead of being
  * formatted into an impossible date.
  *
  * @param year  the year as a string
  * @param month the month as a string
  * @param day   the day as a string
  * @return the date with month and day zero-padded to two digits (the year is printed
  *         without padding), or null if a component is not an integer or the combination
  *         is not a valid calendar date
  */
private def validate_Date(year: String, month: String, day: String): String = {
  import scala.util.control.NonFatal
  try {
    val y = year.toInt
    val m = month.toInt
    val d = day.toInt
    // LocalDate.of throws DateTimeException for out-of-range or impossible dates.
    java.time.LocalDate.of(y, m, d)
    f"$y-$m%02d-$d%02d"
  } catch {
    // Only recoverable failures mean "invalid date"; fatal errors must propagate.
    case NonFatal(_) => null
  }
}
/** Builds one [[PMGrant]] per node of the given sequence.
  *
  * @param gNode the nodes to convert, one per grant
  * @return the grants built from the `GrantID`, `Agency` and `Country` children of each
  *         node, in document order; an empty list when the sequence is empty
  */
private def extractGrant(gNode: NodeSeq): List[PMGrant] =
  gNode.toList.map { grant =>
    def childText(label: String): String = (grant \ label).text
    new PMGrant(childText("GrantID"), childText("Agency"), childText("Country"))
  }
/** Builds a [[PMJournal]] from a journal node.
  *
  * @param jNode the node sequence holding the journal element
  * @return a journal carrying title, ISSN, volume and issue, or null when the resulting
  *         title is null or empty
  */
private def extractJournal(jNode: NodeSeq): PMJournal = {
  val parsed = new PMJournal
  parsed.setTitle((jNode \ "Title").text)
  parsed.setIssn((jNode \ "ISSN").text)
  val issueNode = jNode \ "JournalIssue"
  parsed.setVolume((issueNode \ "Volume").text)
  parsed.setIssue((issueNode \ "Issue").text)
  // StringUtils.isEmpty is null-safe, so no separate null check is needed.
  if (StringUtils.isEmpty(parsed.getTitle)) null else parsed
}
/** Builds one [[PMAuthor]] per node of the given sequence.
  *
  * @param aNode the nodes to convert, one per author
  * @return the authors, in document order, each carrying the text of the `LastName` and
  *         `ForeName` children of its node
  */
private def extractAuthors(aNode: NodeSeq): List[PMAuthor] =
  aNode.toList.map { node =>
    val person = new PMAuthor
    person.setLastName((node \ "LastName").text)
    person.setForeName((node \ "ForeName").text)
    person
  }
/** Parses one `<PubmedArticle>` XML document into a [[PMArticle]].
  *
  * The whole document is loaded in memory and the fields are read with XPath-like
  * projections: grants, journal, authors, DOI / PMC identifiers, PMID, date (taken from
  * `DateCompleted`), title, abstract, language, the first 20 MeSH headings and the
  * publication types.
  *
  * @param input the XML text of a single article, rooted at `<PubmedArticle>`
  * @return the populated article
  */
def parse(input: String): PMArticle = {
  val xml = scala.xml.XML.loadString(input)
  val citation = xml \ "MedlineCitation"
  val articleNode = citation \ "Article"
  val article = new PMArticle

  // Grants are searched at any depth below the citation.
  article.setGrants(extractGrant(citation \\ "Grant").asJava)
  article.setJournal(extractJournal(articleNode \ "Journal"))
  article.setAuthors(extractAuthors(articleNode \ "AuthorList" \ "Author").asJava)

  (xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId").foreach { articleId =>
    val idType = (articleId \ "@IdType").text
    val id = articleId.text
    if ("doi".equalsIgnoreCase(idType)) article.setDoi(id)
    if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id)
  }
  article.setPmid((citation \ "PMID").text)

  val dateCompleted = citation \ "DateCompleted"
  val currentDate =
    validate_Date((dateCompleted \ "Year").text, (dateCompleted \ "Month").text, (dateCompleted \ "Day").text)
  if (currentDate != null) article.setDate(currentDate)

  article.setTitle((articleNode \ "ArticleTitle").text)

  // An abstract may be split into several <AbstractText> sections. Each one is normalised
  // on its own (line breaks collapsed to single spaces) and the sections are joined with a
  // space, so the last word of one section is not glued to the first word of the next.
  val abstractSections = (articleNode \ "Abstract" \ "AbstractText")
    .map(section => section.text.split("\n").map(_.trim).mkString(" ").trim)
    .filter(_.nonEmpty)
  if (abstractSections.nonEmpty) article.setDescription(abstractSections.mkString(" "))

  // NOTE(review): when several <Language> elements are present their codes are concatenated
  // without a separator — confirm whether only one of them should be kept.
  article.setLanguage((articleNode \ "Language").text)

  article.setSubjects(
    (citation \ "MeshHeadingList" \ "MeshHeading")
      .take(20)
      .map { heading =>
        val descriptor = heading \ "DescriptorName"
        val subject = new PMSubject
        subject.setValue(descriptor.text)
        subject.setMeshId((descriptor \ "@UI").text)
        subject
      }
      .toList
      .asJava
  )

  article.setPublicationTypes(
    (articleNode \ "PublicationTypeList" \ "PublicationType")
      .map { pt =>
        val publicationType = new PMSubject
        publicationType.setValue(pt.text)
        publicationType
      }
      .toList
      .asJava
  )

  article
}
/** Reads StAX events until the first complete `<PubmedArticle>` has been consumed and returns
  * the article assembled from them.
  *
  * The reader is a `javax.xml.stream.XMLEventReader`, so its events are inspected through the
  * StAX event API (start element, end element, characters). Text is attributed to the most
  * recently opened element; that element name is not reset when an element closes. The last
  * seen `Year` / `Month` / `Day` values are kept across elements and are used when a
  * `PubMedPubDate` or `PubDate` element closes.
  *
  * @param xml the event reader positioned before or inside the article to read
  * @return the article, or null when the stream ends before a closing `</PubmedArticle>`
  */
def parse2(xml: XMLEventReader): PMArticle = {
  import javax.xml.namespace.QName
  import javax.xml.stream.events.StartElement

  // Value of an un-namespaced attribute of the element, or null when it is absent.
  def attribute(element: StartElement, name: String): String = {
    val attr = element.getAttributeByName(new QName(name))
    if (attr == null) null else attr.getValue
  }

  var currentArticle: PMArticle = null
  var currentSubject: PMSubject = null
  var currentAuthor: PMAuthor = null
  var currentJournal: PMJournal = null
  var currentGrant: PMGrant = null
  var currNode: String = null
  var currentYear = "0"
  var currentMonth = "01"
  var currentDay = "01"
  var currentArticleType: String = null

  while (xml.hasNext) {
    val event = xml.nextEvent()
    if (event.isStartElement) {
      val element = event.asStartElement()
      val label = element.getName.getLocalPart
      currNode = label
      label match {
        case "PubmedArticle" => currentArticle = new PMArticle
        case "Author"        => currentAuthor = new PMAuthor
        case "Journal"       => currentJournal = new PMJournal
        case "Grant"         => currentGrant = new PMGrant
        case "PublicationType" | "DescriptorName" =>
          currentSubject = new PMSubject
          currentSubject.setMeshId(attribute(element, "UI"))
        case "ArticleId" => currentArticleType = attribute(element, "IdType")
        case _           =>
      }
    } else if (event.isEndElement) {
      event.asEndElement().getName.getLocalPart match {
        case "PubmedArticle" => return currentArticle
        case "Author"        => currentArticle.getAuthors.add(currentAuthor)
        case "Journal"       => currentArticle.setJournal(currentJournal)
        case "Grant"         => currentArticle.getGrants.add(currentGrant)
        case "PubMedPubDate" =>
          // Only the first date found is kept.
          if (currentArticle.getDate == null)
            currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
        case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
        case "DescriptorName" =>
          // The subjects list may be null when the bean does not pre-initialise it.
          if (currentArticle.getSubjects == null)
            currentArticle.setSubjects(new java.util.ArrayList[PMSubject]())
          currentArticle.getSubjects.add(currentSubject)
        case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
        case _                 =>
      }
    } else if (event.isCharacters) {
      val text = event.asCharacters().getData
      if (currNode != null && text.trim.nonEmpty)
        currNode match {
          // Title and abstract may arrive as several text events (e.g. around inline markup),
          // so their chunks are appended rather than overwritten.
          case "ArticleTitle" =>
            if (currentArticle.getTitle == null)
              currentArticle.setTitle(text.trim)
            else
              currentArticle.setTitle(currentArticle.getTitle + text.trim)
          case "AbstractText" =>
            if (currentArticle.getDescription == null)
              currentArticle.setDescription(text.trim)
            else
              currentArticle.setDescription(currentArticle.getDescription + text.trim)
          case "PMID" => currentArticle.setPmid(text.trim)
          case "ArticleId" =>
            if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
            if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim)
          case "Language" => currentArticle.setLanguage(text.trim)
          case "ISSN"     => currentJournal.setIssn(text.trim)
          case "GrantID"  => currentGrant.setGrantID(text.trim)
          case "Agency"   => currentGrant.setAgency(text.trim)
          case "Country"  => if (currentGrant != null) currentGrant.setCountry(text.trim)
          case "Year"     => currentYear = text.trim
          case "Month"    => currentMonth = text.trim
          case "Day"      => currentDay = text.trim
          case "Volume"   => currentJournal.setVolume(text.trim)
          case "Issue"    => currentJournal.setIssue(text.trim)
          case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
          case "LastName" =>
            if (currentAuthor != null)
              currentAuthor.setLastName(text.trim)
          case "ForeName" =>
            if (currentAuthor != null)
              currentAuthor.setForeName(text.trim)
          case "Title" =>
            if (currentJournal.getTitle == null)
              currentJournal.setTitle(text.trim)
            else
              currentJournal.setTitle(currentJournal.getTitle + text.trim)
          case _ =>
        }
    }
  }
  null
}
}

View File

@ -0,0 +1,222 @@
<PubmedArticle>
<MedlineCitation Status="MEDLINE" IndexingMethod="Automated" Owner="NLM">
<PMID Version="1">37885214</PMID>
<DateCompleted>
<Year>2024</Year>
<Month>02</Month>
<Day>14</Day>
</DateCompleted>
<DateRevised>
<Year>2024</Year>
<Month>02</Month>
<Day>14</Day>
</DateRevised>
<Article PubModel="Print-Electronic">
<Journal>
<ISSN IssnType="Electronic">2752-7549</ISSN>
<JournalIssue CitedMedium="Internet">
<Volume>40</Volume>
<Issue>5</Issue>
<PubDate>
<MedlineDate>2023 Sep-Oct</MedlineDate>
</PubDate>
</JournalIssue>
<Title>Journal of pediatric hematology/oncology nursing</Title>
<ISOAbbreviation>J Pediatr Hematol Oncol Nurs</ISOAbbreviation>
</Journal>
<ArticleTitle>Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country.</ArticleTitle>
<Pagination>
<MedlinePgn>295-304</MedlinePgn>
</Pagination>
<ELocationID EIdType="doi" ValidYN="Y">10.1177/27527530231193972</ELocationID>
<Abstract>
<AbstractText><b>Background:</b> Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. <b>Method:</b> Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. <b>Results:</b> Filipino parents (<i>N</i>=156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. <b>Conclusions:</b> Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience.</AbstractText>
</Abstract>
<AuthorList CompleteYN="Y">
<Author ValidYN="Y">
<LastName>Banayat</LastName>
<ForeName>Aprille Campos</ForeName>
<Initials>AC</Initials>
<Identifier Source="ORCID">0000-0001-9339-9871</Identifier>
<AffiliationInfo>
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
</AffiliationInfo>
</Author>
<Author ValidYN="Y">
<LastName>Abad</LastName>
<ForeName>Peter James B</ForeName>
<Initials>PJB</Initials>
<AffiliationInfo>
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
</AffiliationInfo>
</Author>
<Author ValidYN="Y">
<LastName>Bonito</LastName>
<ForeName>Sheila R</ForeName>
<Initials>SR</Initials>
<AffiliationInfo>
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
</AffiliationInfo>
</Author>
<Author ValidYN="Y">
<LastName>Manahan</LastName>
<ForeName>Lydia T</ForeName>
<Initials>LT</Initials>
<AffiliationInfo>
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
</AffiliationInfo>
</Author>
<Author ValidYN="Y">
<LastName>Peralta</LastName>
<ForeName>Arnold B</ForeName>
<Initials>AB</Initials>
<AffiliationInfo>
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
</AffiliationInfo>
</Author>
</AuthorList>
<Language>eng</Language>
<PublicationTypeList>
<PublicationType UI="D016428">Journal Article</PublicationType>
</PublicationTypeList>
<ArticleDate DateType="Electronic">
<Year>2023</Year>
<Month>10</Month>
<Day>26</Day>
</ArticleDate>
</Article>
<MedlineJournalInfo>
<Country>United States</Country>
<MedlineTA>J Pediatr Hematol Oncol Nurs</MedlineTA>
<NlmUniqueID>9918282681506676</NlmUniqueID>
<ISSNLinking>2752-7530</ISSNLinking>
</MedlineJournalInfo>
<CitationSubset>IM</CitationSubset>
<MeshHeadingList>
<MeshHeading>
<DescriptorName UI="D002648" MajorTopicYN="N">Child</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D010290" MajorTopicYN="Y">Parents</DescriptorName>
<QualifierName UI="Q000523" MajorTopicYN="N">psychology</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D012944" MajorTopicYN="N">Social Support</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D029181" MajorTopicYN="N">Spirituality</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D012067" MajorTopicYN="N">Religion</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D009369" MajorTopicYN="Y">Neoplasms</DescriptorName>
<QualifierName UI="Q000628" MajorTopicYN="N">therapy</QualifierName>
</MeshHeading>
</MeshHeadingList>
<KeywordList Owner="NOTNLM">
<Keyword MajorTopicYN="N">cancer</Keyword>
<Keyword MajorTopicYN="N">mixed methods</Keyword>
<Keyword MajorTopicYN="N">parent</Keyword>
<Keyword MajorTopicYN="N">pediatric</Keyword>
<Keyword MajorTopicYN="N">research</Keyword>
<Keyword MajorTopicYN="N">supportive care</Keyword>
</KeywordList>
<CoiStatement>Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article.</CoiStatement>
</MedlineCitation>
<PubmedData>
<History>
<PubMedPubDate PubStatus="medline">
<Year>2024</Year>
<Month>2</Month>
<Day>12</Day>
<Hour>18</Hour>
<Minute>42</Minute>
</PubMedPubDate>
<PubMedPubDate PubStatus="pubmed">
<Year>2023</Year>
<Month>10</Month>
<Day>27</Day>
<Hour>6</Hour>
<Minute>42</Minute>
</PubMedPubDate>
<PubMedPubDate PubStatus="entrez">
<Year>2023</Year>
<Month>10</Month>
<Day>27</Day>
<Hour>3</Hour>
<Minute>43</Minute>
</PubMedPubDate>
</History>
<PublicationStatus>ppublish</PublicationStatus>
<ArticleIdList>
<ArticleId IdType="pubmed">37885214</ArticleId>
<ArticleId IdType="doi">10.1177/27527530231193972</ArticleId>
</ArticleIdList>
</PubmedData>
</PubmedArticle>
<DeleteCitation>
<PMID Version="1">30522158</PMID>
<PMID Version="1">32769323</PMID>
<PMID Version="1">34061701</PMID>
<PMID Version="1">34661197</PMID>
<PMID Version="1">34837091</PMID>
<PMID Version="1">35035475</PMID>
<PMID Version="1">35211699</PMID>
<PMID Version="1">35557982</PMID>
<PMID Version="1">35782783</PMID>
<PMID Version="1">35795240</PMID>
<PMID Version="1">35832688</PMID>
<PMID Version="1">35847411</PMID>
<PMID Version="1">36081602</PMID>
<PMID Version="1">36081858</PMID>
<PMID Version="1">36468085</PMID>
<PMID Version="1">36468934</PMID>
<PMID Version="1">36580086</PMID>
<PMID Version="1">36589526</PMID>
<PMID Version="1">36619609</PMID>
<PMID Version="1">36649460</PMID>
<PMID Version="1">36654909</PMID>
<PMID Version="1">36655054</PMID>
<PMID Version="1">36700856</PMID>
<PMID Version="1">36705625</PMID>
<PMID Version="1">36713939</PMID>
<PMID Version="1">36714172</PMID>
<PMID Version="1">36741203</PMID>
<PMID Version="1">36741905</PMID>
<PMID Version="1">36743825</PMID>
<PMID Version="1">36788221</PMID>
<PMID Version="1">36844926</PMID>
<PMID Version="1">36846546</PMID>
<PMID Version="1">36935776</PMID>
<PMID Version="1">36946757</PMID>
<PMID Version="1">36972191</PMID>
<PMID Version="1">37034422</PMID>
<PMID Version="1">37124311</PMID>
<PMID Version="1">37152108</PMID>
<PMID Version="1">37171968</PMID>
<PMID Version="1">37273889</PMID>
<PMID Version="1">37333905</PMID>
<PMID Version="1">37387733</PMID>
<PMID Version="1">37431449</PMID>
<PMID Version="1">37576947</PMID>
<PMID Version="1">37601162</PMID>
<PMID Version="1">37711214</PMID>
<PMID Version="1">37901290</PMID>
<PMID Version="1">37981909</PMID>
<PMID Version="1">37981945</PMID>
<PMID Version="1">37982005</PMID>
<PMID Version="1">38037601</PMID>
<PMID Version="1">38037602</PMID>
<PMID Version="1">38150730</PMID>
<PMID Version="1">38274640</PMID>
<PMID Version="1">38332671</PMID>
<PMID Version="1">38334184</PMID>
<PMID Version="1">38335456</PMID>
<PMID Version="1">38349506</PMID>
<PMID Version="1">38349576</PMID>
<PMID Version="1">38353676</PMID>
</DeleteCitation>

View File

@ -5,7 +5,10 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.utils.PidType import eu.dnetlib.dhp.schema.oaf.utils.PidType
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf} import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf}
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -13,8 +16,9 @@ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.LoggerFactory
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
@ -48,6 +52,17 @@ class BioScholixTest extends AbstractVocabularyTest {
} }
} }
@Test
def testParsingPubmed2(): Unit = {
  // Parses the single-record PubMed XML fixture with PMParser2 and prints the
  // parsed article as pretty-printed JSON for manual inspection.
  val resourcePath = "/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml"
  val stream = getClass.getResourceAsStream(resourcePath)
  // getResourceAsStream yields null for a missing resource: fail with a clear
  // message instead of an opaque NullPointerException inside IOUtils.
  assertNotNull(stream, s"Test resource not found: $resourcePath")

  // Decode with an explicit charset rather than the platform default (the
  // charset-less IOUtils.toString overload is deprecated for that reason).
  // NOTE(review): assumes the fixture is UTF-8 encoded — confirm.
  val xml =
    try IOUtils.toString(stream, java.nio.charset.StandardCharsets.UTF_8)
    finally stream.close()

  val parser = new PMParser2()
  val article = parser.parse(xml)
  // Minimal sanity check so the test cannot pass when parsing produced nothing.
  assertNotNull(article)

  val mapper = new ObjectMapper()
  println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article))
}
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputFactory = XMLInputFactory.newInstance val inputFactory = XMLInputFactory.newInstance
@ -124,6 +139,15 @@ class BioScholixTest extends AbstractVocabularyTest {
} }
} }
@Test
def testPubmedSplitting(): Unit = {
  // Runs SparkCreatePubmedDump.createPubmedDump on a local Spark session against a
  // PubMed dump that is NOT shipped with the repository.
  //
  // The input/output locations can be overridden with the JVM system properties
  // `pubmed.dump.input` and `pubmed.dump.output`; the defaults are the original
  // developer-local paths. When the input location does not exist the test is
  // skipped (JUnit assumption) instead of failing on other machines or on CI.
  val inputPath = sys.props.getOrElse("pubmed.dump.input", "/home/sandro/Downloads/pubmed")
  val outputPath = sys.props.getOrElse("pubmed.dump.output", "/home/sandro/Downloads/pubmed_mapped")

  // NOTE(review): this check assumes the path resolves on the local filesystem,
  // which is presumably the case with master("local") and a scheme-less path — confirm.
  org.junit.jupiter.api.Assumptions.assumeTrue(
    java.nio.file.Files.exists(java.nio.file.Paths.get(inputPath)),
    s"PubMed dump not available at $inputPath: skipping"
  )

  val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()
  new SparkCreatePubmedDump("", Array.empty, LoggerFactory.getLogger(getClass))
    .createPubmedDump(spark, inputPath, outputPath, vocabularies)
}
@Test @Test
def testPubmedOriginalID(): Unit = { def testPubmedOriginalID(): Unit = {
val article: PMArticle = new PMArticle val article: PMArticle = new PMArticle