diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml
index d8e17ed46..c4f9b262e 100644
--- a/dhp-shade-package/pom.xml
+++ b/dhp-shade-package/pom.xml
@@ -26,16 +26,16 @@
 
     <dependencies>
 
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-actionmanager</artifactId>
-            <version>${project.version}</version>
-        </dependency>
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
-<!--            <artifactId>dhp-aggregation</artifactId>-->
+<!--            <artifactId>dhp-actionmanager</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-aggregation</artifactId>
+            <version>${project.version}</version>
+        </dependency>
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-blacklist</artifactId>-->
@@ -56,61 +56,61 @@
 <!--            <artifactId>dhp-enrichment</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-graph-mapper</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-graph-provision</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-impact-indicators</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-actionsets</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-hist-snaps</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-monitor-irish</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-promote</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-update</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-swh</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-usage-raw-data-update</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-usage-stats-build</artifactId>
-            <version>${project.version}</version>
-        </dependency>
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-graph-mapper</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-graph-provision</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-impact-indicators</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-actionsets</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-hist-snaps</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-monitor-irish</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-promote</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-update</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-swh</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-usage-raw-data-update</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-usage-stats-build</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
     </dependencies>
 
 
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
index bf2c19c3d..db31bb43f 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@@ -15,6 +15,7 @@ import java.util.stream.Collectors;
 
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -29,7 +30,6 @@ import org.apache.spark.sql.Dataset;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.spark_project.jetty.util.StringUtil;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
@@ -206,7 +206,7 @@ public class ExtractPerson implements Serializable {
 				null);
 		relation.setValidated(true);
 
-		if (StringUtil.isNotBlank(role)) {
+		if (StringUtils.isNotBlank(role)) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("role");
 			kv.setValue(role);
@@ -439,13 +439,13 @@ public class ExtractPerson implements Serializable {
 				null);
 		relation.setValidated(true);
 
-		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
+		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("startDate");
 			kv.setValue(row.getStartDate());
 			properties.add(kv);
 		}
-		if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
+		if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("endDate");
 			kv.setValue(row.getEndDate());
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
index 3fb814606..6191f6446 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
@@ -8,259 +8,115 @@ import java.util.List;
 /**
  * This class represent an instance of Pubmed Article extracted from the native XML
  *
- * @author Sandro La Bruzzo
  */
-
 public class PMArticle implements Serializable {
 
-	/**
-	 * the Pubmed Identifier
-	 */
 	private String pmid;
-
 	private String pmcId;
-
-	/**
-	 * the DOI
-	 */
 	private String doi;
-	/**
-	 * the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
-	 * All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
-	 */
 	private String date;
-	/**
-	 * This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
-	 */
 	private PMJournal journal;
-	/**
-	 * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
-	 */
 	private String title;
-	/**
-	 * English-language abstracts are taken directly from the published article.
-	 * If the article does not have a published abstract, the National Library of Medicine does not create one,
-	 * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
-	 * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
-	 */
 	private String description;
-	/**
-	 * the language in which an article was published is recorded in <Language>.
-	 * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
-	 * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
-	 *  Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
-	 */
 	private String language;
-
-	/**
-	 * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations.	 *
-	 */
-	private final List<PMSubject> subjects = new ArrayList<>();
-	/**
-	 * This element is used to identify the type of article indexed for MEDLINE;
-	 * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
-	 * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
-	 */
-	private final List<PMSubject> publicationTypes = new ArrayList<>();
-	/**
-	 * Personal and collective (corporate) author names published with the article are found in <AuthorList>.
-	 */
+	private List<PMSubject> subjects = new ArrayList<>(); // never null: callers append through getSubjects().add(...)
+	private List<PMSubject> publicationTypes = new ArrayList<>();
 	private List<PMAuthor> authors = new ArrayList<>();
+	private List<PMGrant> grants = new ArrayList<>();
 
-	/**
-	 * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
-	 * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
-	 */
-	private final List<PMGrant> grants = new ArrayList<>();
-
-	/**
-	 * get the DOI
-	 * @return a DOI
-	 */
-	public String getDoi() {
-		return doi;
-	}
-
-	/**
-	 * Set the DOI
-	 * @param doi a DOI
-	 */
-	public void setDoi(String doi) {
-		this.doi = doi;
-	}
-
-	/**
-	 * get the Pubmed Identifier
-	 * @return the PMID
-	 */
 	public String getPmid() {
 		return pmid;
 	}
 
-	/**
-	 * set the Pubmed Identifier
-	 * @param pmid the Pubmed Identifier
-	 */
 	public void setPmid(String pmid) {
 		this.pmid = pmid;
 	}
 
-	/**
-	 * the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
-	 * All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
-	 *
-	 * @return the Pubmed Date
-	 */
-	public String getDate() {
-		return date;
-	}
-
-	/**
-	 * Set the pubmed Date
-	 * @param date
-	 */
-	public void setDate(String date) {
-		this.date = date;
-	}
-
-	/**
-	 * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
-	 * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
-	 * The NLM journal title abbreviation is exported in the <MedlineTA> element.
-	 *
-	 * @return the pubmed Journal Extracted
-	 */
-	public PMJournal getJournal() {
-		return journal;
-	}
-
-	/**
-	 * Set the mapped pubmed Journal
-	 * @param journal
-	 */
-	public void setJournal(PMJournal journal) {
-		this.journal = journal;
-	}
-
-	/**
-	 * <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
-	 * those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
-	 * All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
-	 * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
-	 * Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
-	 *
-	 *  @return the extracted pubmed Title
-	 */
-	public String getTitle() {
-		return title;
-	}
-
-	/**
-	 * set the pubmed title
-	 * @param title
-	 */
-	public void setTitle(String title) {
-		this.title = title;
-	}
-
-	/**
-	 * English-language abstracts are taken directly from the published article.
-	 * If the article does not have a published abstract, the National Library of Medicine does not create one,
-	 * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
-	 * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
-	 *
-	 * @return the Mapped Pubmed Article Abstracts
-	 */
-	public String getDescription() {
-		return description;
-	}
-
-	/**
-	 * Set the Mapped Pubmed Article Abstracts
-	 * @param description
-	 */
-	public void setDescription(String description) {
-		this.description = description;
-	}
-
-	/**
-	 * Personal and collective (corporate) author names published with the article are found in <AuthorList>.
-	 *
-	 * @return get the Mapped Authors lists
-	 */
-	public List<PMAuthor> getAuthors() {
-		return authors;
-	}
-
-	/**
-	 * Set the Mapped Authors lists
-	 * @param authors
-	 */
-	public void setAuthors(List<PMAuthor> authors) {
-		this.authors = authors;
-	}
-
-	/**
-	 * This element is used to identify the type of article indexed for MEDLINE;
-	 * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
-	 * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
-	 *
-	 * @return the mapped Subjects
-	 */
-	public List<PMSubject> getSubjects() {
-		return subjects;
-	}
-
-	/**
-	 *
-	 * the language in which an article was published is recorded in <Language>.
-	 * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
-	 * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
-	 *  Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
-	 *
-	 * @return The mapped Language
-	 */
-	public String getLanguage() {
-		return language;
-	}
-
-	/**
-	 *
-	 *  Set The mapped Language
-	 *
-	 * @param language the mapped Language
-	 */
-	public void setLanguage(String language) {
-		this.language = language;
-	}
-
-	/**
-	 *  This element is used to identify the type of article indexed for MEDLINE;
-	 * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
-	 * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
-	 *
-	 * @return the mapped Publication Type
-	 */
-	public List<PMSubject> getPublicationTypes() {
-		return publicationTypes;
-	}
-
-	/**
-	 * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
-	 * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
-	 * @return the mapped grants
-	 */
-
-	public List<PMGrant> getGrants() {
-		return grants;
-	}
-
 	public String getPmcId() {
 		return pmcId;
 	}
 
-	public PMArticle setPmcId(String pmcId) {
+	public void setPmcId(String pmcId) {
 		this.pmcId = pmcId;
-		return this;
+	}
+
+	public String getDoi() {
+		return doi;
+	}
+
+	public void setDoi(String doi) {
+		this.doi = doi;
+	}
+
+	public String getDate() {
+		return date;
+	}
+
+	public void setDate(String date) {
+		this.date = date;
+	}
+
+	public PMJournal getJournal() {
+		return journal;
+	}
+
+	public void setJournal(PMJournal journal) {
+		this.journal = journal;
+	}
+
+	public String getTitle() {
+		return title;
+	}
+
+	public void setTitle(String title) {
+		this.title = title;
+	}
+
+	public String getDescription() {
+		return description;
+	}
+
+	public void setDescription(String description) {
+		this.description = description;
+	}
+
+	public String getLanguage() {
+		return language;
+	}
+
+	public void setLanguage(String language) {
+		this.language = language;
+	}
+
+	public List<PMSubject> getSubjects() {
+		return subjects;
+	}
+
+	public void setSubjects(List<PMSubject> subjects) {
+		this.subjects = subjects;
+	}
+
+	public List<PMSubject> getPublicationTypes() {
+		return publicationTypes;
+	}
+
+	public void setPublicationTypes(List<PMSubject> publicationTypes) {
+		this.publicationTypes = publicationTypes;
+	}
+
+	public List<PMAuthor> getAuthors() {
+		return authors;
+	}
+
+	public void setAuthors(List<PMAuthor> authors) {
+		this.authors = authors;
+	}
+
+	public List<PMGrant> getGrants() {
+		return grants;
+	}
+
+	public void setGrants(List<PMGrant> grants) {
+		this.grants = grants;
 	}
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
index 3ba83764d..8326fab0f 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
@@ -1,8 +1,7 @@
 [
   {"paramName":"mt",  "paramLongName":"master",         "paramDescription": "should be local or yarn",                  "paramRequired": true},
   {"paramName":"i",   "paramLongName":"isLookupUrl",    "paramDescription": "isLookupUrl",                              "paramRequired": true},
-  {"paramName":"w",   "paramLongName":"workingPath",    "paramDescription": "the path of the sequencial file to read",  "paramRequired": true},
-  {"paramName":"mo",   "paramLongName":"mdstoreOutputVersion",     "paramDescription": "the oaf path ",                            "paramRequired": true},
-  {"paramName":"s",   "paramLongName":"skipUpdate",     "paramDescription": "skip update ",                             "paramRequired": false},
-  {"paramName":"h",   "paramLongName":"hdfsServerUri",  "paramDescription": "the  working path ",                       "paramRequired": true}
+  {"paramName":"s",   "paramLongName":"sourcePath",    "paramDescription": "the baseline path",  "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath",     "paramDescription": "the mdstore path to save",                 "paramRequired": true}
+
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
new file mode 100644
index 000000000..c21bfd7c3
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
@@ -0,0 +1,90 @@
+package eu.dnetlib.dhp.sx.bio.ebi
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser}
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf}
+import eu.dnetlib.dhp.utils.ISLookupClientFactory
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.io.ByteArrayInputStream
+import javax.xml.stream.XMLInputFactory
+
+class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+  /** Reads `isLookupUrl`, `sourcePath` and `targetPath` from the parsed arguments, loads
+    * the vocabularies through the lookup service and delegates to [[createPubmedDump]].
+    */
+  override def run(): Unit = {
+    val isLookupUrl: String = parser.get("isLookupUrl")
+    log.info("isLookupUrl: {}", isLookupUrl)
+    val sourcePath = parser.get("sourcePath")
+    log.info(s"SourcePath is '$sourcePath'")
+    val targetPath = parser.get("targetPath")
+    log.info(s"TargetPath is '$targetPath'")
+
+    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
+    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
+    createPubmedDump(spark, sourcePath, targetPath, vocabularies)
+  }
+
+  /** Converts the PubMed XML found under `sourcePath` into JSON-serialized OAF records
+    * (de-duplicated by `pmid`), written as gzip-compressed text to `targetPath` (overwritten).
+    *
+    * @param spark        the active Spark session (must not be null)
+    * @param sourcePath   location of the text input containing `<PubmedArticle>` elements
+    * @param targetPath   location where the JSON lines are written
+    * @param vocabularies vocabularies handed to `PubMedToOaf.convert`
+    * @throws RuntimeException when an article cannot be parsed (parser failure kept as cause)
+    */
+  def createPubmedDump(
+    spark: SparkSession,
+    sourcePath: String,
+    targetPath: String,
+    vocabularies: VocabularyGroup
+  ): Unit = {
+    require(spark != null)
+    implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle])
+    import spark.implicits._
+    val mapper = new ObjectMapper()
+    val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath)
+    df.as[String]
+      .map { chunk =>
+        // Rows end where a closing tag was: drop what precedes the opening tag, restore the closing one.
+        val start = chunk.indexOf("<PubmedArticle>")
+        if (start >= 0) s"${chunk.substring(start)}</PubmedArticle>" else null
+      }
+      .filter(s => s != null)
+      .map { i =>
+        try new PMParser2().parse(i)
+        catch {
+          // Keep the parser failure as the cause so the stack trace shows why parsing broke.
+          case e: Exception => throw new RuntimeException(s"Error parsing article: $i", e)
+        }
+      }
+      .dropDuplicates("pmid")
+      .map { a =>
+        val oaf = PubMedToOaf.convert(a, vocabularies)
+        if (oaf != null) mapper.writeValueAsString(oaf) else null
+      }
+      .filter(s => s != null)
+      .write
+      .option("compression", "gzip")
+      .mode("overwrite")
+      .text(targetPath)
+  }
+}
+
+object SparkCreatePubmedDump {
+
+  /** Command-line entry point: builds the application with its parameter definition file and runs it. */
+  def main(args: Array[String]): Unit = {
+    val logger: Logger = LoggerFactory.getLogger(getClass)
+    val paramsResource = "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
+    new SparkCreatePubmedDump(paramsResource, args, logger).initialize().run()
+  }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
new file mode 100644
index 000000000..c9e868185
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
@@ -0,0 +1,264 @@
+package eu.dnetlib.dhp.sx.bio.pubmed
+
+import org.apache.commons.lang3.StringUtils
+
+import javax.xml.stream.XMLEventReader
+import scala.collection.JavaConverters._
+import scala.xml.{MetaData, NodeSeq}
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
+
+class PMParser2 {
+
+  /** Looks up an attribute by name and returns its textual value.
+    *
+    * @param attrs the attributes to search
+    * @param key   the name of the attribute
+    * @return the text of the first node of the attribute value, or null when the attribute
+    *         is missing or carries no node
+    */
+  private def extractAttributes(attrs: MetaData, key: String): String =
+    attrs.get(key) match {
+      case Some(nodes) if nodes != null && nodes.nonEmpty =>
+        // Only the first node of the attribute value is considered.
+        nodes.head.text
+      case _ =>
+        // Attribute missing, or present without any node.
+        null
+    }
+
+  /** Validates and formats a date given the year, month, and day as strings.
+    *
+    * @param year  the year as a string
+    * @param month the month as a string (numeric)
+    * @param day   the day as a string (numeric)
+    * @return      the date as year-MM-DD (month and day zero-padded), or null when a component
+    *              is not an integer or the three values do not form a real calendar date
+    */
+  private def validate_Date(year: String, month: String, day: String): String =
+    // Try only traps non-fatal failures, so fatal JVM errors are no longer swallowed.
+    scala.util.Try {
+      // LocalDate.of throws on impossible dates (month 13, February 30, ...).
+      val date = java.time.LocalDate.of(year.toInt, month.toInt, day.toInt)
+      f"${date.getYear}-${date.getMonthValue}%02d-${date.getDayOfMonth}%02d"
+    }.toOption.orNull
+
+  /** Builds one [[PMGrant]] per node of the given sequence.
+    *
+    * @param gNode the nodes to convert; the `GrantID`, `Agency` and `Country` children of
+    *              each node are read
+    * @return the grants in the order of `gNode`; an empty list when `gNode` is empty
+    */
+  private def extractGrant(gNode: NodeSeq): List[PMGrant] = {
+    // Concatenated text of the direct children carrying the given label.
+    def childText(grant: scala.xml.Node, label: String): String = (grant \ label).text
+    gNode.toList.map { grant =>
+      val id = childText(grant, "GrantID")
+      val fundingAgency = childText(grant, "Agency")
+      new PMGrant(id, fundingAgency, childText(grant, "Country"))
+    }
+  }
+
+  /** Maps the given `Journal` node(s) onto a [[PMJournal]].
+    *
+    * @param jNode the node(s) whose `Title`, `ISSN` and `JournalIssue` children are read
+    * @return the journal with title, ISSN, volume and issue set, or null when the title
+    *         is empty
+    */
+  private def extractJournal(jNode: NodeSeq): PMJournal = {
+    val issueNode = jNode \ "JournalIssue"
+    val result = new PMJournal
+    result.setTitle((jNode \ "Title").text)
+    result.setIssn((jNode \ "ISSN").text)
+    result.setVolume((issueNode \ "Volume").text)
+    result.setIssue((issueNode \ "Issue").text)
+    // A journal without a title is reported as missing.
+    if (StringUtils.isEmpty(result.getTitle)) null else result
+  }
+
+  /** Builds one [[PMAuthor]] (last name and fore name only) per node of `aNode`. */
+  private def extractAuthors(aNode: NodeSeq): List[PMAuthor] =
+    aNode.toList.map { authorNode =>
+      val lastName = (authorNode \ "LastName").text
+      val foreName = (authorNode \ "ForeName").text
+      val pmAuthor = new PMAuthor
+      pmAuthor.setLastName(lastName)
+      pmAuthor.setForeName(foreName)
+      pmAuthor
+    }
+
+  /** Parses the XML of a single `PubmedArticle` element into a [[PMArticle]].
+    *
+    * @param input the XML text whose root element is `PubmedArticle`
+    * @return the article filled with grants, journal, authors, PMID, DOI/PMC identifiers,
+    *         completion date, title, abstract, language, subjects and publication types
+    */
+  def parse(input: String): PMArticle = {
+    val xml = scala.xml.XML.loadString(input)
+    val article = new PMArticle
+
+    // Grant elements are collected at any depth below MedlineCitation.
+    val grantNodes = xml \ "MedlineCitation" \\ "Grant"
+    article.setGrants(extractGrant(grantNodes).asJava)
+
+    val journal = xml \ "MedlineCitation" \ "Article" \ "Journal"
+    article.setJournal(extractJournal(journal))
+
+    val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author"
+    // Delegate to the shared helper rather than repeating its mapping inline.
+    article.setAuthors(extractAuthors(authors).asJava)
+
+    val pmId = xml \ "MedlineCitation" \ "PMID"
+
+    val articleIds = xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId"
+    articleIds.foreach(articleId => {
+      val idType = (articleId \ "@IdType").text
+      val id = articleId.text
+      if ("doi".equalsIgnoreCase(idType)) article.setDoi(id)
+      if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id)
+    })
+    article.setPmid(pmId.text)
+
+    // The article date comes from DateCompleted and stays unset when that date is invalid.
+    val pubMedPubDate = xml \ "MedlineCitation" \ "DateCompleted"
+    val currentDate =
+      validate_Date((pubMedPubDate \ "Year").text, (pubMedPubDate \ "Month").text, (pubMedPubDate \ "Day").text)
+    if (currentDate != null) article.setDate(currentDate)
+
+    val articleTitle = xml \ "MedlineCitation" \ "Article" \ "ArticleTitle"
+    article.setTitle(articleTitle.text)
+
+    val abstractText = xml \ "MedlineCitation" \ "Article" \ "Abstract" \ "AbstractText"
+    // Flatten the abstract onto one line: trim every line and join them with single spaces.
+    if (abstractText != null && abstractText.text != null && abstractText.text.nonEmpty)
+      article.setDescription(abstractText.text.split("\n").map(s => s.trim).mkString(" ").trim)
+
+    val language = xml \ "MedlineCitation" \ "Article" \ "Language"
+    article.setLanguage(language.text)
+
+    val subjects = xml \ "MedlineCitation" \ "MeshHeadingList" \ "MeshHeading"
+    article.setSubjects(
+      subjects
+        // Only the first 20 MeSH headings are kept.
+        .take(20)
+        .map(subject => {
+          val descriptorName = (subject \ "DescriptorName").text
+          val ui = (subject \ "DescriptorName" \ "@UI").text
+          val s = new PMSubject
+          s.setValue(descriptorName)
+          s.setMeshId(ui)
+          s
+        })
+        .toList
+        .asJava
+    )
+    val publicationTypes = xml \ "MedlineCitation" \ "Article" \ "PublicationTypeList" \ "PublicationType"
+    article.setPublicationTypes(
+      publicationTypes
+        .map(pt => {
+          val s = new PMSubject
+          s.setValue(pt.text)
+          s
+        })
+        .toList
+        .asJava
+    )
+
+    article
+  }
+
+  def parse2(xml: XMLEventReader): PMArticle = { // Event-driven parser: returns the article at the first closing PubmedArticle tag, or null if the events run out first.
+    var currentArticle: PMArticle = null
+    var currentSubject: PMSubject = null
+    var currentAuthor: PMAuthor = null
+    var currentJournal: PMJournal = null
+    var currentGrant: PMGrant = null
+    var currNode: String = null // label of the most recently opened element; it is not cleared when an element closes
+    var currentYear = "0" // the three date parts keep their last value: they are not reset between date elements
+    var currentMonth = "01"
+    var currentDay = "01"
+    var currentArticleType: String = null // IdType attribute of the ArticleId element being read
+
+    while (xml.hasNext) {
+      val ne = xml.next // NOTE(review): this file imports javax.xml.stream.XMLEventReader, yet the cases below match scala.xml.pull events -- confirm they can ever fire
+      ne match {
+        case EvElemStart(_, label, attrs, _) =>
+          currNode = label
+
+          label match {
+            case "PubmedArticle" => currentArticle = new PMArticle
+            case "Author"        => currentAuthor = new PMAuthor
+            case "Journal"       => currentJournal = new PMJournal
+            case "Grant"         => currentGrant = new PMGrant
+            case "PublicationType" | "DescriptorName" =>
+              currentSubject = new PMSubject
+              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
+            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
+            case _           =>
+          }
+        case EvElemEnd(_, label) =>
+          label match {
+            case "PubmedArticle" => return currentArticle // the first completed article ends the loop
+            case "Author"        => currentArticle.getAuthors.add(currentAuthor)
+            case "Journal"       => currentArticle.setJournal(currentJournal)
+            case "Grant"         => currentArticle.getGrants.add(currentGrant)
+            case "PubMedPubDate" =>
+              if (currentArticle.getDate == null) // only the first PubMedPubDate sets the article date
+                currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
+            case "PubDate"         => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") // NOTE(review): unlike PubMedPubDate, not passed through validate_Date
+            case "DescriptorName"  => currentArticle.getSubjects.add(currentSubject) // NOTE(review): needs getSubjects to return a non-null list -- verify PMArticle initializes it
+            case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
+            case _                 =>
+          }
+        case EvText(text) =>
+          if (currNode != null && text.trim.nonEmpty) // whitespace-only text is ignored
+            currNode match {
+              case "ArticleTitle" => { // successive text events are concatenated without a separator
+                if (currentArticle.getTitle == null)
+                  currentArticle.setTitle(text.trim)
+                else
+                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
+              }
+              case "AbstractText" => { // successive text events are concatenated without a separator
+                if (currentArticle.getDescription == null)
+                  currentArticle.setDescription(text.trim)
+                else
+                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
+              }
+              case "PMID" => currentArticle.setPmid(text.trim)
+              case "ArticleId" =>
+                if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
+                if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim)
+              case "Language"                           => currentArticle.setLanguage(text.trim)
+              case "ISSN"                               => currentJournal.setIssn(text.trim)
+              case "GrantID"                            => currentGrant.setGrantID(text.trim)
+              case "Agency"                             => currentGrant.setAgency(text.trim)
+              case "Country"                            => if (currentGrant != null) currentGrant.setCountry(text.trim)
+              case "Year"                               => currentYear = text.trim
+              case "Month"                              => currentMonth = text.trim
+              case "Day"                                => currentDay = text.trim
+              case "Volume"                             => currentJournal.setVolume(text.trim)
+              case "Issue"                              => currentJournal.setIssue(text.trim)
+              case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
+              case "LastName" => {
+                if (currentAuthor != null)
+                  currentAuthor.setLastName(text.trim)
+              }
+              case "ForeName" =>
+                if (currentAuthor != null)
+                  currentAuthor.setForeName(text.trim)
+              case "Title" =>
+                if (currentJournal.getTitle == null)
+                  currentJournal.setTitle(text.trim)
+                else
+                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
+              case _ =>
+
+            }
+        case _ =>
+      }
+
+    }
+    null // the reader was exhausted without a closing PubmedArticle tag
+  }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
new file mode 100644
index 000000000..4b4d860d7
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
@@ -0,0 +1,222 @@
+<PubmedArticle>
+    <MedlineCitation Status="MEDLINE" IndexingMethod="Automated" Owner="NLM">
+        <PMID Version="1">37885214</PMID>
+        <DateCompleted>
+            <Year>2024</Year>
+            <Month>02</Month>
+            <Day>14</Day>
+        </DateCompleted>
+        <DateRevised>
+            <Year>2024</Year>
+            <Month>02</Month>
+            <Day>14</Day>
+        </DateRevised>
+        <Article PubModel="Print-Electronic">
+            <Journal>
+                <ISSN IssnType="Electronic">2752-7549</ISSN>
+                <JournalIssue CitedMedium="Internet">
+                    <Volume>40</Volume>
+                    <Issue>5</Issue>
+                    <PubDate>
+                        <MedlineDate>2023 Sep-Oct</MedlineDate>
+                    </PubDate>
+                </JournalIssue>
+                <Title>Journal of pediatric hematology/oncology nursing</Title>
+                <ISOAbbreviation>J Pediatr Hematol Oncol Nurs</ISOAbbreviation>
+            </Journal>
+            <ArticleTitle>Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country.</ArticleTitle>
+            <Pagination>
+                <MedlinePgn>295-304</MedlinePgn>
+            </Pagination>
+            <ELocationID EIdType="doi" ValidYN="Y">10.1177/27527530231193972</ELocationID>
+            <Abstract>
+                <AbstractText><b>Background:</b> Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. <b>Method:</b> Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. <b>Results:</b> Filipino parents (<i>N</i> = 156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. <b>Conclusions:</b> Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience.</AbstractText>
+            </Abstract>
+            <AuthorList CompleteYN="Y">
+                <Author ValidYN="Y">
+                    <LastName>Banayat</LastName>
+                    <ForeName>Aprille Campos</ForeName>
+                    <Initials>AC</Initials>
+                    <Identifier Source="ORCID">0000-0001-9339-9871</Identifier>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Abad</LastName>
+                    <ForeName>Peter James B</ForeName>
+                    <Initials>PJB</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Bonito</LastName>
+                    <ForeName>Sheila R</ForeName>
+                    <Initials>SR</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Manahan</LastName>
+                    <ForeName>Lydia T</ForeName>
+                    <Initials>LT</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Peralta</LastName>
+                    <ForeName>Arnold B</ForeName>
+                    <Initials>AB</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+            </AuthorList>
+            <Language>eng</Language>
+            <PublicationTypeList>
+                <PublicationType UI="D016428">Journal Article</PublicationType>
+            </PublicationTypeList>
+            <ArticleDate DateType="Electronic">
+                <Year>2023</Year>
+                <Month>10</Month>
+                <Day>26</Day>
+            </ArticleDate>
+        </Article>
+        <MedlineJournalInfo>
+            <Country>United States</Country>
+            <MedlineTA>J Pediatr Hematol Oncol Nurs</MedlineTA>
+            <NlmUniqueID>9918282681506676</NlmUniqueID>
+            <ISSNLinking>2752-7530</ISSNLinking>
+        </MedlineJournalInfo>
+        <CitationSubset>IM</CitationSubset>
+        <MeshHeadingList>
+            <MeshHeading>
+                <DescriptorName UI="D002648" MajorTopicYN="N">Child</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D010290" MajorTopicYN="Y">Parents</DescriptorName>
+                <QualifierName UI="Q000523" MajorTopicYN="N">psychology</QualifierName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D012944" MajorTopicYN="N">Social Support</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D029181" MajorTopicYN="N">Spirituality</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D012067" MajorTopicYN="N">Religion</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D009369" MajorTopicYN="Y">Neoplasms</DescriptorName>
+                <QualifierName UI="Q000628" MajorTopicYN="N">therapy</QualifierName>
+            </MeshHeading>
+        </MeshHeadingList>
+        <KeywordList Owner="NOTNLM">
+            <Keyword MajorTopicYN="N">cancer</Keyword>
+            <Keyword MajorTopicYN="N">mixed methods</Keyword>
+            <Keyword MajorTopicYN="N">parent</Keyword>
+            <Keyword MajorTopicYN="N">pediatric</Keyword>
+            <Keyword MajorTopicYN="N">research</Keyword>
+            <Keyword MajorTopicYN="N">supportive care</Keyword>
+        </KeywordList>
+        <CoiStatement>Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article.</CoiStatement>
+    </MedlineCitation>
+    <PubmedData>
+        <History>
+            <PubMedPubDate PubStatus="medline">
+                <Year>2024</Year>
+                <Month>2</Month>
+                <Day>12</Day>
+                <Hour>18</Hour>
+                <Minute>42</Minute>
+            </PubMedPubDate>
+            <PubMedPubDate PubStatus="pubmed">
+                <Year>2023</Year>
+                <Month>10</Month>
+                <Day>27</Day>
+                <Hour>6</Hour>
+                <Minute>42</Minute>
+            </PubMedPubDate>
+            <PubMedPubDate PubStatus="entrez">
+                <Year>2023</Year>
+                <Month>10</Month>
+                <Day>27</Day>
+                <Hour>3</Hour>
+                <Minute>43</Minute>
+            </PubMedPubDate>
+        </History>
+        <PublicationStatus>ppublish</PublicationStatus>
+        <ArticleIdList>
+            <ArticleId IdType="pubmed">37885214</ArticleId>
+            <ArticleId IdType="doi">10.1177/27527530231193972</ArticleId>
+        </ArticleIdList>
+    </PubmedData>
+</PubmedArticle>
+<DeleteCitation>
+<PMID Version="1">30522158</PMID>
+<PMID Version="1">32769323</PMID>
+<PMID Version="1">34061701</PMID>
+<PMID Version="1">34661197</PMID>
+<PMID Version="1">34837091</PMID>
+<PMID Version="1">35035475</PMID>
+<PMID Version="1">35211699</PMID>
+<PMID Version="1">35557982</PMID>
+<PMID Version="1">35782783</PMID>
+<PMID Version="1">35795240</PMID>
+<PMID Version="1">35832688</PMID>
+<PMID Version="1">35847411</PMID>
+<PMID Version="1">36081602</PMID>
+<PMID Version="1">36081858</PMID>
+<PMID Version="1">36468085</PMID>
+<PMID Version="1">36468934</PMID>
+<PMID Version="1">36580086</PMID>
+<PMID Version="1">36589526</PMID>
+<PMID Version="1">36619609</PMID>
+<PMID Version="1">36649460</PMID>
+<PMID Version="1">36654909</PMID>
+<PMID Version="1">36655054</PMID>
+<PMID Version="1">36700856</PMID>
+<PMID Version="1">36705625</PMID>
+<PMID Version="1">36713939</PMID>
+<PMID Version="1">36714172</PMID>
+<PMID Version="1">36741203</PMID>
+<PMID Version="1">36741905</PMID>
+<PMID Version="1">36743825</PMID>
+<PMID Version="1">36788221</PMID>
+<PMID Version="1">36844926</PMID>
+<PMID Version="1">36846546</PMID>
+<PMID Version="1">36935776</PMID>
+<PMID Version="1">36946757</PMID>
+<PMID Version="1">36972191</PMID>
+<PMID Version="1">37034422</PMID>
+<PMID Version="1">37124311</PMID>
+<PMID Version="1">37152108</PMID>
+<PMID Version="1">37171968</PMID>
+<PMID Version="1">37273889</PMID>
+<PMID Version="1">37333905</PMID>
+<PMID Version="1">37387733</PMID>
+<PMID Version="1">37431449</PMID>
+<PMID Version="1">37576947</PMID>
+<PMID Version="1">37601162</PMID>
+<PMID Version="1">37711214</PMID>
+<PMID Version="1">37901290</PMID>
+<PMID Version="1">37981909</PMID>
+<PMID Version="1">37981945</PMID>
+<PMID Version="1">37982005</PMID>
+<PMID Version="1">38037601</PMID>
+<PMID Version="1">38037602</PMID>
+<PMID Version="1">38150730</PMID>
+<PMID Version="1">38274640</PMID>
+<PMID Version="1">38332671</PMID>
+<PMID Version="1">38334184</PMID>
+<PMID Version="1">38335456</PMID>
+<PMID Version="1">38349506</PMID>
+<PMID Version="1">38349576</PMID>
+<PMID Version="1">38353676</PMID>
+</DeleteCitation>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index c4af14c40..1374b741d 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -5,7 +5,10 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
 import eu.dnetlib.dhp.schema.oaf.utils.PidType
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
-import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
+import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf}
+import org.apache.commons.io.IOUtils
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
@@ -13,8 +16,9 @@ import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.extension.ExtendWith
 import org.junit.jupiter.api.{BeforeEach, Test}
 import org.mockito.junit.jupiter.MockitoExtension
+import org.slf4j.LoggerFactory
 
-import java.io.{BufferedReader, InputStream, InputStreamReader}
+import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
 import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
@@ -48,6 +52,17 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
+  @Test
+  def testParsingPubmed2(): Unit = {
+    val mapper = new ObjectMapper()
+    // Decode the fixture explicitly as UTF-8 rather than with the platform default charset.
+    val stream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml")
+    val xml = IOUtils.toString(stream, "UTF-8")
+    val parser = new PMParser2()
+    val article = parser.parse(xml)
+    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article))
+  }
+
   @Test
   def testEBIData() = {
     val inputFactory = XMLInputFactory.newInstance
@@ -124,6 +139,15 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
+  @Test
+  @org.junit.jupiter.api.Disabled("manual run only: paths below point to a developer's local PubMed dump")
+  def testPubmedSplitting(): Unit = {
+    // Runs the PubMed dump creation on a local Spark session; input and output are machine-specific.
+    val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()
+    new SparkCreatePubmedDump("", Array.empty, LoggerFactory.getLogger(getClass))
+      .createPubmedDump(spark, "/home/sandro/Downloads/pubmed", "/home/sandro/Downloads/pubmed_mapped", vocabularies)
+  }
+
   @Test
   def testPubmedOriginalID(): Unit = {
     val article: PMArticle = new PMArticle