forked from D-Net/dnet-hadoop
Organized getters and setters in the PMArticle class for better readability and maintainability.
This commit is contained in:
parent
a42c8b7c85
commit
a8ed5a3b04
|
@ -26,16 +26,16 @@
|
|||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-actionmanager</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-aggregation</artifactId>-->
|
||||
<!-- <artifactId>dhp-actionmanager</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-blacklist</artifactId>-->
|
||||
|
@ -56,61 +56,61 @@
|
|||
<!-- <artifactId>dhp-enrichment</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-mapper</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-impact-indicators</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-actionsets</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-promote</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-swh</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-raw-data-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-stats-build</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-mapper</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-provision</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-impact-indicators</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-actionsets</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-hist-snaps</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-monitor-irish</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-promote</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-swh</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-raw-data-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-stats-build</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -29,7 +30,6 @@ import org.apache.spark.sql.Dataset;
|
|||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.spark_project.jetty.util.StringUtil;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
|
@ -206,7 +206,7 @@ public class ExtractPerson implements Serializable {
|
|||
null);
|
||||
relation.setValidated(true);
|
||||
|
||||
if (StringUtil.isNotBlank(role)) {
|
||||
if (StringUtils.isNotBlank(role)) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("role");
|
||||
kv.setValue(role);
|
||||
|
@ -439,13 +439,13 @@ public class ExtractPerson implements Serializable {
|
|||
null);
|
||||
relation.setValidated(true);
|
||||
|
||||
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
|
||||
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("startDate");
|
||||
kv.setValue(row.getStartDate());
|
||||
properties.add(kv);
|
||||
}
|
||||
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
|
||||
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("endDate");
|
||||
kv.setValue(row.getEndDate());
|
||||
|
|
|
@ -8,259 +8,115 @@ import java.util.List;
|
|||
/**
|
||||
* This class represents an instance of a Pubmed Article extracted from the native XML
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
|
||||
public class PMArticle implements Serializable {
|
||||
|
||||
/**
|
||||
* the Pubmed Identifier
|
||||
*/
|
||||
private String pmid;
|
||||
|
||||
private String pmcId;
|
||||
|
||||
/**
|
||||
* the DOI
|
||||
*/
|
||||
private String doi;
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*/
|
||||
private String date;
|
||||
/**
|
||||
* This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
|
||||
*/
|
||||
private PMJournal journal;
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*/
|
||||
private String title;
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*/
|
||||
private String description;
|
||||
/**
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. *
|
||||
*/
|
||||
private final List<PMSubject> subjects = new ArrayList<>();
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*/
|
||||
private final List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*/
|
||||
private List<PMSubject> subjects;
|
||||
private List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
private List<PMAuthor> authors = new ArrayList<>();
|
||||
private List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
*/
|
||||
private final List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* get the DOI
|
||||
* @return a DOI
|
||||
*/
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the DOI
|
||||
* @param doi a DOI
|
||||
*/
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the Pubmed Identifier
|
||||
* @return the PMID
|
||||
*/
|
||||
public String getPmid() {
|
||||
return pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the Pubmed Identifier
|
||||
* @param pmid the Pubmed Identifier
|
||||
*/
|
||||
public void setPmid(String pmid) {
|
||||
this.pmid = pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*
|
||||
* @return the Pubmed Date
|
||||
*/
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the pubmed Date
|
||||
* @param date
|
||||
*/
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
|
||||
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
|
||||
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*
|
||||
* @return the pubmed Journal Extracted
|
||||
*/
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the mapped pubmed Journal
|
||||
* @param journal
|
||||
*/
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
|
||||
* those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
|
||||
* All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
|
||||
* Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
|
||||
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
|
||||
*
|
||||
* @return the extracted pubmed Title
|
||||
*/
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the pubmed title
|
||||
* @param title
|
||||
*/
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*
|
||||
* @return the Mapped Pubmed Article Abstracts
|
||||
*/
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Pubmed Article Abstracts
|
||||
* @param description
|
||||
*/
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*
|
||||
* @return get the Mapped Authors lists
|
||||
*/
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Authors lists
|
||||
* @param authors
|
||||
*/
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Subjects
|
||||
*/
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*
|
||||
* @return The mapped Language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Set The mapped Language
|
||||
*
|
||||
* @param language the mapped Language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Publication Type
|
||||
*/
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
* @return the mapped grants
|
||||
*/
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public String getPmcId() {
|
||||
return pmcId;
|
||||
}
|
||||
|
||||
public PMArticle setPmcId(String pmcId) {
|
||||
public void setPmcId(String pmcId) {
|
||||
this.pmcId = pmcId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(List<PMSubject> subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
public void setPublicationTypes(List<PMSubject> publicationTypes) {
|
||||
this.publicationTypes = publicationTypes;
|
||||
}
|
||||
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public void setGrants(List<PMGrant> grants) {
|
||||
this.grants = grants;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
|
||||
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the mdstore path to save", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -0,0 +1,90 @@
|
|||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser}
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.io.ByteArrayInputStream
|
||||
import javax.xml.stream.XMLInputFactory
|
||||
|
||||
/** Spark application that reads a PubMed baseline XML dump, parses every
  * `<PubmedArticle>` element into a [[PMArticle]], converts it to OAF through
  * `PubMedToOaf.convert` and stores the JSON serialization as gzip-compressed text.
  *
  * @param propertyPath classpath location of the JSON file describing the accepted arguments
  * @param args         the command line arguments
  * @param log          the logger used by the application
  */
class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Entry point of the Spark job: reads the `isLookupUrl`, `sourcePath` and
    * `targetPath` arguments, loads the vocabularies from the lookup service and
    * delegates the actual work to [[createPubmedDump]].
    */
  override def run(): Unit = {
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)
    val sourcePath = parser.get("sourcePath")
    log.info(s"SourcePath is '$sourcePath'")
    val targetPath = parser.get("targetPath")
    log.info(s"TargetPath is '$targetPath'")

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)

    createPubmedDump(spark, sourcePath, targetPath, vocabularies)

  }

  /** Converts the PubMed XML found under `sourcePath` into OAF JSON records
    * written under `targetPath` (overwriting any previous content).
    *
    * @param spark        the active Spark session, must not be null
    * @param sourcePath   the path of the PubMed XML baseline
    * @param targetPath   the path where the JSON lines are written
    * @param vocabularies the vocabularies passed to `PubMedToOaf.convert`
    * @throws RuntimeException when an article cannot be parsed; the original
    *                          parser exception is attached as the cause
    */
  def createPubmedDump(
    spark: SparkSession,
    sourcePath: String,
    targetPath: String,
    vocabularies: VocabularyGroup
  ): Unit = {
    require(spark != null)

    implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle])

    import spark.implicits._
    // Using the closing tag as line separator yields one record per article,
    // with the closing tag itself stripped from the record.
    val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath)
    val mapper = new ObjectMapper()
    df.as[String]
      .map(s => {
        // Drop whatever precedes the opening tag (XML prolog, set wrapper, ...)
        // and restore the closing tag consumed by the line separator.
        val id = s.indexOf("<PubmedArticle>")
        if (id >= 0) s"${s.substring(id)}</PubmedArticle>" else null
      })
      .filter(s => s != null)
      .map { i =>
        try {
          new PMParser2().parse(i)
        } catch {
          // Keep the original exception as the cause so the real parsing
          // problem is still visible in the Spark task failure.
          case e: Exception =>
            throw new RuntimeException(s"Error parsing article: $i", e)
        }
      }
      .dropDuplicates("pmid")
      .map { a =>
        val oaf = PubMedToOaf.convert(a, vocabularies)
        if (oaf != null)
          mapper.writeValueAsString(oaf)
        else
          null
      }
      .as[String]
      // Articles for which the conversion returned null are discarded.
      .filter(s => s != null)
      .write
      .option("compression", "gzip")
      .mode("overwrite")
      .text(targetPath)
  }
}
|
||||
|
||||
object SparkCreatePubmedDump {

  /** Command line entry point: builds the application with the bundled
    * argument description, initializes it and runs it.
    *
    * @param args the command line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val application =
      new SparkCreatePubmedDump("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json", args, logger)
    application.initialize().run()
  }
}
|
|
@ -0,0 +1,264 @@
|
|||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
|
||||
import javax.xml.stream.XMLEventReader
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.xml.{MetaData, NodeSeq}
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
|
||||
|
||||
class PMParser2 {
|
||||
|
||||
/** Extracts the value of an attribute from a MetaData object.
|
||||
* @param attrs the MetaData object
|
||||
* @param key the key of the attribute
|
||||
* @return the value of the attribute or null if the attribute is not found
|
||||
*/
|
||||
private def extractAttributes(attrs: MetaData, key: String): String =
  attrs.get(key) match {
    // Only the first node of the attribute value is considered.
    case Some(values) if values != null && values.nonEmpty => values.head.text
    // Attribute missing, or present with a null/empty value.
    case _ => null
  }
|
||||
|
||||
/** Validates and formats a date given the year, month, and day as strings.
|
||||
*
|
||||
* @param year the year as a string
|
||||
* @param month the month as a string
|
||||
* @param day the day as a string
|
||||
* @return the formatted date as "YYYY-MM-DD" or null if the date is invalid
|
||||
*/
|
||||
private def validate_Date(year: String, month: String, day: String): String = {
  import scala.util.control.NonFatal
  try {
    val y = year.toInt
    val m = month.toInt
    val d = day.toInt
    // LocalDate.of throws a DateTimeException when the combination is not a
    // real calendar date (e.g. month 13 or February 30th); the result is only
    // used as a validity check, the output format is built below.
    java.time.LocalDate.of(y, m, d)
    f"$y-$m%02d-$d%02d"
  } catch {
    // Non numeric, null or out-of-range components all mean "invalid date";
    // fatal JVM errors are deliberately not swallowed.
    case NonFatal(_) => null
  }
}
|
||||
|
||||
/** Extracts the grant information from a NodeSeq object.
|
||||
*
|
||||
* @param gNode the NodeSeq object
|
||||
* @return the grant information or an empty list if the grant information is not found
|
||||
*/
|
||||
private def extractGrant(gNode: NodeSeq): List[PMGrant] = {
  // One PMGrant is built for every node of the sequence, in document order.
  val grants = for (grant <- gNode) yield {
    val id = (grant \ "GrantID").text
    val funder = (grant \ "Agency").text
    val nation = (grant \ "Country").text
    new PMGrant(id, funder, nation)
  }
  grants.toList
}
|
||||
|
||||
/** Extracts the journal information from a NodeSeq object.
|
||||
*
|
||||
* @param jNode the NodeSeq object
|
||||
* @return the journal information or null if the journal information is not found
|
||||
*/
|
||||
private def extractJournal(jNode: NodeSeq): PMJournal = {
  val issueNode = jNode \ "JournalIssue"

  val parsed = new PMJournal
  parsed.setTitle((jNode \ "Title").text)
  parsed.setIssn((jNode \ "ISSN").text)
  parsed.setVolume((issueNode \ "Volume").text)
  parsed.setIssue((issueNode \ "Issue").text)

  // A journal whose title is null or empty is treated as absent.
  if (StringUtils.isEmpty(parsed.getTitle)) null else parsed
}
|
||||
|
||||
/** Extracts the authors from a NodeSeq of author nodes.
  *
  * @param aNode the NodeSeq containing one node per author
  * @return one PMAuthor per node, carrying the text of its LastName and ForeName children
  */
private def extractAuthors(aNode: NodeSeq): List[PMAuthor] =
  aNode.toList.map { node =>
    val person = new PMAuthor
    person.setLastName((node \ "LastName").text)
    person.setForeName((node \ "ForeName").text)
    person
  }
|
||||
|
||||
/** Parses the XML of a single PubmedArticle element into a [[PMArticle]].
  *
  * The following information is read from the document: grants, journal,
  * authors, PMID, DOI and PMC identifiers, completion date, title, abstract,
  * language, MeSH subjects (at most the first 20) and publication types.
  *
  * @param input the XML text of one PubmedArticle element
  * @return the populated article
  */
def parse(input: String): PMArticle = {
  val xml = scala.xml.XML.loadString(input)
  // Resolve the two most used sub-trees once instead of re-traversing the
  // document for every extracted field.
  val citation = xml \ "MedlineCitation"
  val articleNode = citation \ "Article"

  val article = new PMArticle

  article.setGrants(extractGrant(citation \\ "Grant").asJava)
  article.setJournal(extractJournal(articleNode \ "Journal"))
  article.setAuthors(extractAuthors(articleNode \ "AuthorList" \ "Author").asJava)

  (xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId").foreach { articleId =>
    val idType = (articleId \ "@IdType").text
    val id = articleId.text
    if ("doi".equalsIgnoreCase(idType)) article.setDoi(id)
    if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id)
  }
  article.setPmid((citation \ "PMID").text)

  // The article date is taken from <DateCompleted>; it stays unset when the
  // element is missing or does not hold a valid date.
  val completed = citation \ "DateCompleted"
  val currentDate =
    validate_Date((completed \ "Year").text, (completed \ "Month").text, (completed \ "Day").text)
  if (currentDate != null) article.setDate(currentDate)

  article.setTitle((articleNode \ "ArticleTitle").text)

  // Collapse a multi-line abstract into a single line of trimmed fragments.
  val abstractValue = (articleNode \ "Abstract" \ "AbstractText").text
  if (abstractValue.nonEmpty)
    article.setDescription(abstractValue.split("\n").map(s => s.trim).mkString(" ").trim)

  article.setLanguage((articleNode \ "Language").text)

  val subjects = citation \ "MeshHeadingList" \ "MeshHeading"
  article.setSubjects(
    subjects
      .take(20)
      .map(subject => {
        val descriptor = subject \ "DescriptorName"
        val s = new PMSubject
        s.setValue(descriptor.text)
        s.setMeshId((descriptor \ "@UI").text)
        s
      })
      .toList
      .asJava
  )

  val publicationTypes = articleNode \ "PublicationTypeList" \ "PublicationType"
  article.setPublicationTypes(
    publicationTypes
      .map(pt => {
        val s = new PMSubject
        s.setValue(pt.text)
        s
      })
      .toList
      .asJava
  )

  article
}
|
||||
|
||||
/** Consumes events from `xml` until one complete `PubmedArticle` element has been read and
  * returns the article assembled from it.
  *
  * The parser is a flat state machine: the label of the most recently opened element is kept in
  * `currNode`, and every non-blank text event is routed to the object currently being built
  * (article, author, journal, grant or subject) according to that label.
  *
  * Events that occur outside a `PubmedArticle` element (for instance the `PMID` entries of a
  * `DeleteCitation` block) are ignored, because there is no article to attach them to.
  * The author and grant being built are dropped as soon as their element closes, so that labels
  * which also occur elsewhere in the record (such as `Country`) cannot modify an object that was
  * already added to the article.
  *
  * @param xml pull-parser event stream over a PubMed XML document
  * @return the article built from the next `PubmedArticle` element, or `null` when the stream
  *         ends before such an element is closed
  */
def parse2(xml: XMLEventReader): PMArticle = {
  var currentArticle: PMArticle = null
  var currentSubject: PMSubject = null
  var currentAuthor: PMAuthor = null
  var currentJournal: PMJournal = null
  var currentGrant: PMGrant = null
  var currNode: String = null
  // NOTE(review): the date parts are never reset between date elements, so a date element that
  // carries no Year/Month/Day children (e.g. a PubDate holding only a MedlineDate) picks up the
  // values of the previous date element. Left unchanged here; confirm the intended behaviour.
  var currentYear = "0"
  var currentMonth = "01"
  var currentDay = "01"
  var currentArticleType: String = null
  // Set when the closing PubmedArticle tag is reached; ends the loop.
  var completed: PMArticle = null

  while (completed == null && xml.hasNext) {
    xml.next match {
      case EvElemStart(_, label, attrs, _) =>
        currNode = label
        label match {
          case "PubmedArticle" => currentArticle = new PMArticle
          case "Author"        => currentAuthor = new PMAuthor
          case "Journal"       => currentJournal = new PMJournal
          case "Grant"         => currentGrant = new PMGrant
          case "PublicationType" | "DescriptorName" =>
            currentSubject = new PMSubject
            currentSubject.setMeshId(extractAttributes(attrs, "UI"))
          case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
          case _           =>
        }

      // Closing tags are only meaningful while an article is being built.
      case EvElemEnd(_, label) if currentArticle != null =>
        label match {
          case "PubmedArticle" => completed = currentArticle
          case "Author" =>
            if (currentAuthor != null) currentArticle.getAuthors.add(currentAuthor)
            // Forget the author so later name elements cannot rewrite it.
            currentAuthor = null
          case "Journal" => currentArticle.setJournal(currentJournal)
          case "Grant" =>
            if (currentGrant != null) currentArticle.getGrants.add(currentGrant)
            // Forget the grant so a Country outside any Grant cannot rewrite it.
            currentGrant = null
          case "PubMedPubDate" =>
            // Only the first PubMedPubDate encountered defines the article date.
            if (currentArticle.getDate == null)
              currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
          case "PubDate" =>
            if (currentJournal != null)
              currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
          case "DescriptorName"  => currentArticle.getSubjects.add(currentSubject)
          case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
          case _                 =>
        }

      // Blank text and text outside an article are skipped.
      case EvText(text) if currentArticle != null && currNode != null && text.trim.nonEmpty =>
        val value = text.trim
        currNode match {
          case "ArticleTitle" =>
            // Text may arrive in several chunks; append to what was already collected.
            if (currentArticle.getTitle == null) currentArticle.setTitle(value)
            else currentArticle.setTitle(currentArticle.getTitle + value)
          case "AbstractText" =>
            if (currentArticle.getDescription == null) currentArticle.setDescription(value)
            else currentArticle.setDescription(currentArticle.getDescription + value)
          case "PMID" => currentArticle.setPmid(value)
          case "ArticleId" =>
            if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(value)
            if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(value)
          case "Language" => currentArticle.setLanguage(value)
          case "ISSN"     => if (currentJournal != null) currentJournal.setIssn(value)
          case "GrantID"  => if (currentGrant != null) currentGrant.setGrantID(value)
          case "Agency"   => if (currentGrant != null) currentGrant.setAgency(value)
          case "Country"  => if (currentGrant != null) currentGrant.setCountry(value)
          case "Year"     => currentYear = value
          case "Month"    => currentMonth = value
          case "Day"      => currentDay = value
          case "Volume"   => if (currentJournal != null) currentJournal.setVolume(value)
          case "Issue"    => if (currentJournal != null) currentJournal.setIssue(value)
          case "PublicationType" | "DescriptorName" =>
            if (currentSubject != null) currentSubject.setValue(value)
          case "LastName" => if (currentAuthor != null) currentAuthor.setLastName(value)
          case "ForeName" => if (currentAuthor != null) currentAuthor.setForeName(value)
          case "Title" =>
            if (currentJournal != null) {
              if (currentJournal.getTitle == null) currentJournal.setTitle(value)
              else currentJournal.setTitle(currentJournal.getTitle + value)
            }
          case _ =>
        }

      case _ =>
    }
  }
  completed
}
|
||||
|
||||
}
|
|
@ -0,0 +1,222 @@
|
|||
<PubmedArticle>
|
||||
<MedlineCitation Status="MEDLINE" IndexingMethod="Automated" Owner="NLM">
|
||||
<PMID Version="1">37885214</PMID>
|
||||
<DateCompleted>
|
||||
<Year>2024</Year>
|
||||
<Month>02</Month>
|
||||
<Day>14</Day>
|
||||
</DateCompleted>
|
||||
<DateRevised>
|
||||
<Year>2024</Year>
|
||||
<Month>02</Month>
|
||||
<Day>14</Day>
|
||||
</DateRevised>
|
||||
<Article PubModel="Print-Electronic">
|
||||
<Journal>
|
||||
<ISSN IssnType="Electronic">2752-7549</ISSN>
|
||||
<JournalIssue CitedMedium="Internet">
|
||||
<Volume>40</Volume>
|
||||
<Issue>5</Issue>
|
||||
<PubDate>
|
||||
<MedlineDate>2023 Sep-Oct</MedlineDate>
|
||||
</PubDate>
|
||||
</JournalIssue>
|
||||
<Title>Journal of pediatric hematology/oncology nursing</Title>
|
||||
<ISOAbbreviation>J Pediatr Hematol Oncol Nurs</ISOAbbreviation>
|
||||
</Journal>
|
||||
<ArticleTitle>Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country.</ArticleTitle>
|
||||
<Pagination>
|
||||
<MedlinePgn>295-304</MedlinePgn>
|
||||
</Pagination>
|
||||
<ELocationID EIdType="doi" ValidYN="Y">10.1177/27527530231193972</ELocationID>
|
||||
<Abstract>
|
||||
<AbstractText><b>Background:</b> Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. <b>Method:</b> Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. <b>Results:</b> Filipino parents (<i>N</i> = 156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. <b>Conclusions:</b> Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience.</AbstractText>
|
||||
</Abstract>
|
||||
<AuthorList CompleteYN="Y">
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Banayat</LastName>
|
||||
<ForeName>Aprille Campos</ForeName>
|
||||
<Initials>AC</Initials>
|
||||
<Identifier Source="ORCID">0000-0001-9339-9871</Identifier>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Abad</LastName>
|
||||
<ForeName>Peter James B</ForeName>
|
||||
<Initials>PJB</Initials>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Bonito</LastName>
|
||||
<ForeName>Sheila R</ForeName>
|
||||
<Initials>SR</Initials>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Manahan</LastName>
|
||||
<ForeName>Lydia T</ForeName>
|
||||
<Initials>LT</Initials>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Peralta</LastName>
|
||||
<ForeName>Arnold B</ForeName>
|
||||
<Initials>AB</Initials>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
</AuthorList>
|
||||
<Language>eng</Language>
|
||||
<PublicationTypeList>
|
||||
<PublicationType UI="D016428">Journal Article</PublicationType>
|
||||
</PublicationTypeList>
|
||||
<ArticleDate DateType="Electronic">
|
||||
<Year>2023</Year>
|
||||
<Month>10</Month>
|
||||
<Day>26</Day>
|
||||
</ArticleDate>
|
||||
</Article>
|
||||
<MedlineJournalInfo>
|
||||
<Country>United States</Country>
|
||||
<MedlineTA>J Pediatr Hematol Oncol Nurs</MedlineTA>
|
||||
<NlmUniqueID>9918282681506676</NlmUniqueID>
|
||||
<ISSNLinking>2752-7530</ISSNLinking>
|
||||
</MedlineJournalInfo>
|
||||
<CitationSubset>IM</CitationSubset>
|
||||
<MeshHeadingList>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D002648" MajorTopicYN="N">Child</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D010290" MajorTopicYN="Y">Parents</DescriptorName>
|
||||
<QualifierName UI="Q000523" MajorTopicYN="N">psychology</QualifierName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D012944" MajorTopicYN="N">Social Support</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D029181" MajorTopicYN="N">Spirituality</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D012067" MajorTopicYN="N">Religion</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D009369" MajorTopicYN="Y">Neoplasms</DescriptorName>
|
||||
<QualifierName UI="Q000628" MajorTopicYN="N">therapy</QualifierName>
|
||||
</MeshHeading>
|
||||
</MeshHeadingList>
|
||||
<KeywordList Owner="NOTNLM">
|
||||
<Keyword MajorTopicYN="N">cancer</Keyword>
|
||||
<Keyword MajorTopicYN="N">mixed methods</Keyword>
|
||||
<Keyword MajorTopicYN="N">parent</Keyword>
|
||||
<Keyword MajorTopicYN="N">pediatric</Keyword>
|
||||
<Keyword MajorTopicYN="N">research</Keyword>
|
||||
<Keyword MajorTopicYN="N">supportive care</Keyword>
|
||||
</KeywordList>
|
||||
<CoiStatement>Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article.</CoiStatement>
|
||||
</MedlineCitation>
|
||||
<PubmedData>
|
||||
<History>
|
||||
<PubMedPubDate PubStatus="medline">
|
||||
<Year>2024</Year>
|
||||
<Month>2</Month>
|
||||
<Day>12</Day>
|
||||
<Hour>18</Hour>
|
||||
<Minute>42</Minute>
|
||||
</PubMedPubDate>
|
||||
<PubMedPubDate PubStatus="pubmed">
|
||||
<Year>2023</Year>
|
||||
<Month>10</Month>
|
||||
<Day>27</Day>
|
||||
<Hour>6</Hour>
|
||||
<Minute>42</Minute>
|
||||
</PubMedPubDate>
|
||||
<PubMedPubDate PubStatus="entrez">
|
||||
<Year>2023</Year>
|
||||
<Month>10</Month>
|
||||
<Day>27</Day>
|
||||
<Hour>3</Hour>
|
||||
<Minute>43</Minute>
|
||||
</PubMedPubDate>
|
||||
</History>
|
||||
<PublicationStatus>ppublish</PublicationStatus>
|
||||
<ArticleIdList>
|
||||
<ArticleId IdType="pubmed">37885214</ArticleId>
|
||||
<ArticleId IdType="doi">10.1177/27527530231193972</ArticleId>
|
||||
</ArticleIdList>
|
||||
</PubmedData>
|
||||
</PubmedArticle>
|
||||
<DeleteCitation>
|
||||
<PMID Version="1">30522158</PMID>
|
||||
<PMID Version="1">32769323</PMID>
|
||||
<PMID Version="1">34061701</PMID>
|
||||
<PMID Version="1">34661197</PMID>
|
||||
<PMID Version="1">34837091</PMID>
|
||||
<PMID Version="1">35035475</PMID>
|
||||
<PMID Version="1">35211699</PMID>
|
||||
<PMID Version="1">35557982</PMID>
|
||||
<PMID Version="1">35782783</PMID>
|
||||
<PMID Version="1">35795240</PMID>
|
||||
<PMID Version="1">35832688</PMID>
|
||||
<PMID Version="1">35847411</PMID>
|
||||
<PMID Version="1">36081602</PMID>
|
||||
<PMID Version="1">36081858</PMID>
|
||||
<PMID Version="1">36468085</PMID>
|
||||
<PMID Version="1">36468934</PMID>
|
||||
<PMID Version="1">36580086</PMID>
|
||||
<PMID Version="1">36589526</PMID>
|
||||
<PMID Version="1">36619609</PMID>
|
||||
<PMID Version="1">36649460</PMID>
|
||||
<PMID Version="1">36654909</PMID>
|
||||
<PMID Version="1">36655054</PMID>
|
||||
<PMID Version="1">36700856</PMID>
|
||||
<PMID Version="1">36705625</PMID>
|
||||
<PMID Version="1">36713939</PMID>
|
||||
<PMID Version="1">36714172</PMID>
|
||||
<PMID Version="1">36741203</PMID>
|
||||
<PMID Version="1">36741905</PMID>
|
||||
<PMID Version="1">36743825</PMID>
|
||||
<PMID Version="1">36788221</PMID>
|
||||
<PMID Version="1">36844926</PMID>
|
||||
<PMID Version="1">36846546</PMID>
|
||||
<PMID Version="1">36935776</PMID>
|
||||
<PMID Version="1">36946757</PMID>
|
||||
<PMID Version="1">36972191</PMID>
|
||||
<PMID Version="1">37034422</PMID>
|
||||
<PMID Version="1">37124311</PMID>
|
||||
<PMID Version="1">37152108</PMID>
|
||||
<PMID Version="1">37171968</PMID>
|
||||
<PMID Version="1">37273889</PMID>
|
||||
<PMID Version="1">37333905</PMID>
|
||||
<PMID Version="1">37387733</PMID>
|
||||
<PMID Version="1">37431449</PMID>
|
||||
<PMID Version="1">37576947</PMID>
|
||||
<PMID Version="1">37601162</PMID>
|
||||
<PMID Version="1">37711214</PMID>
|
||||
<PMID Version="1">37901290</PMID>
|
||||
<PMID Version="1">37981909</PMID>
|
||||
<PMID Version="1">37981945</PMID>
|
||||
<PMID Version="1">37982005</PMID>
|
||||
<PMID Version="1">38037601</PMID>
|
||||
<PMID Version="1">38037602</PMID>
|
||||
<PMID Version="1">38150730</PMID>
|
||||
<PMID Version="1">38274640</PMID>
|
||||
<PMID Version="1">38332671</PMID>
|
||||
<PMID Version="1">38334184</PMID>
|
||||
<PMID Version="1">38335456</PMID>
|
||||
<PMID Version="1">38349506</PMID>
|
||||
<PMID Version="1">38349576</PMID>
|
||||
<PMID Version="1">38353676</PMID>
|
||||
</DeleteCitation>
|
|
@ -5,7 +5,10 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
|||
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
@ -13,8 +16,9 @@ import org.junit.jupiter.api.Assertions._
|
|||
import org.junit.jupiter.api.extension.ExtendWith
|
||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
import org.slf4j.LoggerFactory
|
||||
|
||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
||||
import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
|
||||
import java.util.zip.GZIPInputStream
|
||||
import javax.xml.stream.XMLInputFactory
|
||||
import scala.collection.JavaConverters._
|
||||
|
@ -48,6 +52,17 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
}
|
||||
}
|
||||
|
||||
/** Parses the single-record PubMed fixture with `PMParser2` and checks that a result is produced.
  * The parsed object is also printed as pretty JSON for manual inspection.
  */
@Test
def testParsingPubmed2(): Unit = {
  val mapper = new ObjectMapper()
  // Decode with a fixed charset so the test does not depend on the platform default encoding.
  val xml = IOUtils.toString(
    getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml"),
    "UTF-8"
  )
  val parser = new PMParser2()
  val article = parser.parse(xml)

  // Without an assertion the test could never fail on a parser regression.
  assertNotNull(article)
  println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article))
}
|
||||
|
||||
@Test
|
||||
def testEBIData() = {
|
||||
val inputFactory = XMLInputFactory.newInstance
|
||||
|
@ -124,6 +139,15 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
}
|
||||
}
|
||||
|
||||
/** Runs `SparkCreatePubmedDump.createPubmedDump` on a PubMed dump stored on the local machine.
  *
  * The input lives at a developer-specific absolute path, so the test is skipped (not failed)
  * on machines where that path does not exist.
  */
@Test
def testPubmedSplitting(): Unit = {
  val sourcePath = "/home/sandro/Downloads/pubmed"
  val targetPath = "/home/sandro/Downloads/pubmed_mapped"

  // A failed assumption aborts the test as "skipped", keeping the build green elsewhere.
  org.junit.jupiter.api.Assumptions.assumeTrue(
    java.nio.file.Files.exists(java.nio.file.Paths.get(sourcePath)),
    s"local PubMed dump not found at $sourcePath"
  )

  val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()
  new SparkCreatePubmedDump("", Array.empty, LoggerFactory.getLogger(getClass))
    .createPubmedDump(spark, sourcePath, targetPath, vocabularies)
}
|
||||
|
||||
@Test
|
||||
def testPubmedOriginalID(): Unit = {
|
||||
val article: PMArticle = new PMArticle
|
||||
|
|
Loading…
Reference in New Issue