From a8ed5a3b048b560a6c2b794834e27e4adcdcac97 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo <sandro.labruzzo@gmail.com>
Date: Mon, 4 Nov 2024 17:45:28 +0100
Subject: [PATCH 1/8] Organized getters and setters in the PMArticle class for
 better readability and maintainability.

---
 dhp-shade-package/pom.xml                     | 122 +++----
 .../personentity/ExtractPerson.java           |   8 +-
 .../dnetlib/dhp/sx/bio/pubmed/PMArticle.java  | 312 +++++-------------
 .../sx/bio/ebi/baseline_to_oaf_params.json    |   7 +-
 .../sx/bio/ebi/SparkCreatePubmedDump.scala    |  90 +++++
 .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala | 264 +++++++++++++++
 .../dhp/sx/graph/bio/single_pubmed.xml        | 222 +++++++++++++
 .../dnetlib/dhp/sx/bio/BioScholixTest.scala   |  28 +-
 8 files changed, 754 insertions(+), 299 deletions(-)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml
index d8e17ed465..c4f9b262e9 100644
--- a/dhp-shade-package/pom.xml
+++ b/dhp-shade-package/pom.xml
@@ -26,16 +26,16 @@
 
     <dependencies>
 
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-actionmanager</artifactId>
-            <version>${project.version}</version>
-        </dependency>
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
-<!--            <artifactId>dhp-aggregation</artifactId>-->
+<!--            <artifactId>dhp-actionmanager</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-aggregation</artifactId>
+            <version>${project.version}</version>
+        </dependency>
 <!--        <dependency>-->
 <!--            <groupId>eu.dnetlib.dhp</groupId>-->
 <!--            <artifactId>dhp-blacklist</artifactId>-->
@@ -56,61 +56,61 @@
 <!--            <artifactId>dhp-enrichment</artifactId>-->
 <!--            <version>${project.version}</version>-->
 <!--        </dependency>-->
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-graph-mapper</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-graph-provision</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-impact-indicators</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-actionsets</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-hist-snaps</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-monitor-irish</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-promote</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-stats-update</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-swh</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-usage-raw-data-update</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>dhp-usage-stats-build</artifactId>
-            <version>${project.version}</version>
-        </dependency>
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-graph-mapper</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-graph-provision</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-impact-indicators</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-actionsets</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-hist-snaps</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-monitor-irish</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-promote</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-stats-update</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-swh</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-usage-raw-data-update</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
+<!--        <dependency>-->
+<!--            <groupId>eu.dnetlib.dhp</groupId>-->
+<!--            <artifactId>dhp-usage-stats-build</artifactId>-->
+<!--            <version>${project.version}</version>-->
+<!--        </dependency>-->
     </dependencies>
 
 
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
index bf2c19c3da..db31bb43f0 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@@ -15,6 +15,7 @@ import java.util.stream.Collectors;
 
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -29,7 +30,6 @@ import org.apache.spark.sql.Dataset;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.spark_project.jetty.util.StringUtil;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
@@ -206,7 +206,7 @@ public class ExtractPerson implements Serializable {
 				null);
 		relation.setValidated(true);
 
-		if (StringUtil.isNotBlank(role)) {
+		if (StringUtils.isNotBlank(role)) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("role");
 			kv.setValue(role);
@@ -439,13 +439,13 @@ public class ExtractPerson implements Serializable {
 				null);
 		relation.setValidated(true);
 
-		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
+		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("startDate");
 			kv.setValue(row.getStartDate());
 			properties.add(kv);
 		}
-		if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
+		if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("endDate");
 			kv.setValue(row.getEndDate());
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
index 3fb814606a..6191f6446b 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java
@@ -8,259 +8,115 @@ import java.util.List;
 /**
  * This class represent an instance of Pubmed Article extracted from the native XML
  *
- * @author Sandro La Bruzzo
  */
-
 public class PMArticle implements Serializable {
 
-	/**
-	 * the Pubmed Identifier
-	 */
 	private String pmid;
-
 	private String pmcId;
-
-	/**
-	 * the DOI
-	 */
 	private String doi;
-	/**
-	 * the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
-	 * All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
-	 */
 	private String date;
-	/**
-	 * This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
-	 */
 	private PMJournal journal;
-	/**
-	 * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
-	 */
 	private String title;
-	/**
-	 * English-language abstracts are taken directly from the published article.
-	 * If the article does not have a published abstract, the National Library of Medicine does not create one,
-	 * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
-	 * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
-	 */
 	private String description;
-	/**
-	 * the language in which an article was published is recorded in <Language>.
-	 * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
-	 * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
-	 *  Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
-	 */
 	private String language;
-
-	/**
-	 * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations.	 *
-	 */
-	private final List<PMSubject> subjects = new ArrayList<>();
-	/**
-	 * This element is used to identify the type of article indexed for MEDLINE;
-	 * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
-	 * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
-	 */
-	private final List<PMSubject> publicationTypes = new ArrayList<>();
-	/**
-	 * Personal and collective (corporate) author names published with the article are found in <AuthorList>.
-	 */
+	private List<PMSubject> subjects = new ArrayList<>(); // never null: parsers append through getSubjects()
+	private List<PMSubject> publicationTypes = new ArrayList<>();
 	private List<PMAuthor> authors = new ArrayList<>();
+	private List<PMGrant> grants = new ArrayList<>();
 
-	/**
-	 * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
-	 * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
-	 */
-	private final List<PMGrant> grants = new ArrayList<>();
-
-	/**
-	 * get the DOI
-	 * @return a DOI
-	 */
-	public String getDoi() {
-		return doi;
-	}
-
-	/**
-	 * Set the DOI
-	 * @param doi a DOI
-	 */
-	public void setDoi(String doi) {
-		this.doi = doi;
-	}
-
-	/**
-	 * get the Pubmed Identifier
-	 * @return the PMID
-	 */
 	public String getPmid() {
 		return pmid;
 	}
 
-	/**
-	 * set the Pubmed Identifier
-	 * @param pmid the Pubmed Identifier
-	 */
 	public void setPmid(String pmid) {
 		this.pmid = pmid;
 	}
 
-	/**
-	 * the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
-	 * All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
-	 *
-	 * @return the Pubmed Date
-	 */
-	public String getDate() {
-		return date;
-	}
-
-	/**
-	 * Set the pubmed Date
-	 * @param date
-	 */
-	public void setDate(String date) {
-		this.date = date;
-	}
-
-	/**
-	 * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
-	 * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
-	 * The NLM journal title abbreviation is exported in the <MedlineTA> element.
-	 *
-	 * @return the pubmed Journal Extracted
-	 */
-	public PMJournal getJournal() {
-		return journal;
-	}
-
-	/**
-	 * Set the mapped pubmed Journal
-	 * @param journal
-	 */
-	public void setJournal(PMJournal journal) {
-		this.journal = journal;
-	}
-
-	/**
-	 * <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
-	 * those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
-	 * All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
-	 * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
-	 * Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
-	 *
-	 *  @return the extracted pubmed Title
-	 */
-	public String getTitle() {
-		return title;
-	}
-
-	/**
-	 * set the pubmed title
-	 * @param title
-	 */
-	public void setTitle(String title) {
-		this.title = title;
-	}
-
-	/**
-	 * English-language abstracts are taken directly from the published article.
-	 * If the article does not have a published abstract, the National Library of Medicine does not create one,
-	 * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
-	 * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
-	 *
-	 * @return the Mapped Pubmed Article Abstracts
-	 */
-	public String getDescription() {
-		return description;
-	}
-
-	/**
-	 * Set the Mapped Pubmed Article Abstracts
-	 * @param description
-	 */
-	public void setDescription(String description) {
-		this.description = description;
-	}
-
-	/**
-	 * Personal and collective (corporate) author names published with the article are found in <AuthorList>.
-	 *
-	 * @return get the Mapped Authors lists
-	 */
-	public List<PMAuthor> getAuthors() {
-		return authors;
-	}
-
-	/**
-	 * Set the Mapped Authors lists
-	 * @param authors
-	 */
-	public void setAuthors(List<PMAuthor> authors) {
-		this.authors = authors;
-	}
-
-	/**
-	 * This element is used to identify the type of article indexed for MEDLINE;
-	 * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
-	 * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
-	 *
-	 * @return the mapped Subjects
-	 */
-	public List<PMSubject> getSubjects() {
-		return subjects;
-	}
-
-	/**
-	 *
-	 * the language in which an article was published is recorded in <Language>.
-	 * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
-	 * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
-	 *  Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
-	 *
-	 * @return The mapped Language
-	 */
-	public String getLanguage() {
-		return language;
-	}
-
-	/**
-	 *
-	 *  Set The mapped Language
-	 *
-	 * @param language the mapped Language
-	 */
-	public void setLanguage(String language) {
-		this.language = language;
-	}
-
-	/**
-	 *  This element is used to identify the type of article indexed for MEDLINE;
-	 * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
-	 * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
-	 *
-	 * @return the mapped Publication Type
-	 */
-	public List<PMSubject> getPublicationTypes() {
-		return publicationTypes;
-	}
-
-	/**
-	 * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
-	 * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
-	 * @return the mapped grants
-	 */
-
-	public List<PMGrant> getGrants() {
-		return grants;
-	}
-
 	public String getPmcId() {
 		return pmcId;
 	}
 
-	public PMArticle setPmcId(String pmcId) {
+	public void setPmcId(String pmcId) {
 		this.pmcId = pmcId;
-		return this;
+	}
+
+	public String getDoi() {
+		return doi;
+	}
+
+	public void setDoi(String doi) {
+		this.doi = doi;
+	}
+
+	public String getDate() {
+		return date;
+	}
+
+	public void setDate(String date) {
+		this.date = date;
+	}
+
+	public PMJournal getJournal() {
+		return journal;
+	}
+
+	public void setJournal(PMJournal journal) {
+		this.journal = journal;
+	}
+
+	public String getTitle() {
+		return title;
+	}
+
+	public void setTitle(String title) {
+		this.title = title;
+	}
+
+	public String getDescription() {
+		return description;
+	}
+
+	public void setDescription(String description) {
+		this.description = description;
+	}
+
+	public String getLanguage() {
+		return language;
+	}
+
+	public void setLanguage(String language) {
+		this.language = language;
+	}
+
+	public List<PMSubject> getSubjects() {
+		return subjects;
+	}
+
+	public void setSubjects(List<PMSubject> subjects) {
+		this.subjects = subjects;
+	}
+
+	public List<PMSubject> getPublicationTypes() {
+		return publicationTypes;
+	}
+
+	public void setPublicationTypes(List<PMSubject> publicationTypes) {
+		this.publicationTypes = publicationTypes;
+	}
+
+	public List<PMAuthor> getAuthors() {
+		return authors;
+	}
+
+	public void setAuthors(List<PMAuthor> authors) {
+		this.authors = authors;
+	}
+
+	public List<PMGrant> getGrants() {
+		return grants;
+	}
+
+	public void setGrants(List<PMGrant> grants) {
+		this.grants = grants;
 	}
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
index 3ba83764df..8326fab0f3 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
@@ -1,8 +1,7 @@
 [
   {"paramName":"mt",  "paramLongName":"master",         "paramDescription": "should be local or yarn",                  "paramRequired": true},
   {"paramName":"i",   "paramLongName":"isLookupUrl",    "paramDescription": "isLookupUrl",                              "paramRequired": true},
-  {"paramName":"w",   "paramLongName":"workingPath",    "paramDescription": "the path of the sequencial file to read",  "paramRequired": true},
-  {"paramName":"mo",   "paramLongName":"mdstoreOutputVersion",     "paramDescription": "the oaf path ",                            "paramRequired": true},
-  {"paramName":"s",   "paramLongName":"skipUpdate",     "paramDescription": "skip update ",                             "paramRequired": false},
-  {"paramName":"h",   "paramLongName":"hdfsServerUri",  "paramDescription": "the  working path ",                       "paramRequired": true}
+  {"paramName":"s",   "paramLongName":"sourcePath",    "paramDescription": "the baseline path",  "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath",     "paramDescription": "the mdstore path to save",                 "paramRequired": true}
+
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
new file mode 100644
index 0000000000..c21bfd7c3b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
@@ -0,0 +1,90 @@
+package eu.dnetlib.dhp.sx.bio.ebi
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser}
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf}
+import eu.dnetlib.dhp.utils.ISLookupClientFactory
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.io.ByteArrayInputStream
+import javax.xml.stream.XMLInputFactory
+
+class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+  /** Entry point executed by every Spark application:
+    * the whole logic of the Spark node is defined here.
+    */
+  override def run(): Unit = {
+    val isLookupUrl: String = parser.get("isLookupUrl")
+    log.info("isLookupUrl: {}", isLookupUrl)
+    val sourcePath = parser.get("sourcePath")
+    log.info(s"SourcePath is '$sourcePath'")
+    val targetPath = parser.get("targetPath")
+    log.info(s"TargetPath is '$targetPath'")
+
+    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
+    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
+
+    createPubmedDump(spark, sourcePath, targetPath, vocabularies)
+
+  }
+
+  def createPubmedDump(
+    spark: SparkSession,
+    sourcePath: String,
+    targetPath: String,
+    vocabularies: VocabularyGroup
+  ): Unit = {
+    require(spark != null)
+    // Java-bean encoder required by the Dataset[PMArticle] stage below.
+    implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle])
+    import spark.implicits._
+    // Records are split on the closing tag, so each one carries at most one article.
+    val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath)
+    val mapper = new ObjectMapper()
+    df.as[String]
+      .map(s => {
+        val id = s.indexOf("<PubmedArticle>")
+        if (id >= 0) s"${s.substring(id)}</PubmedArticle>" else null
+      })
+      .filter(s => s != null)
+      .map { i =>
+        try {
+          new PMParser2().parse(i)
+        } catch {
+          case e: Exception => { // chain the cause: the message alone hides why parsing failed
+            throw new RuntimeException(s"Error parsing article: $i", e)
+          }
+        }
+      }
+      .dropDuplicates("pmid")
+      .map { a =>
+        val oaf = PubMedToOaf.convert(a, vocabularies)
+        if (oaf != null)
+          mapper.writeValueAsString(oaf)
+        else
+          null
+      }
+      .as[String]
+      .filter(s => s != null)
+      .write
+      .option("compression", "gzip")
+      .mode("overwrite")
+      .text(targetPath)
+  }
+}
+
+object SparkCreatePubmedDump {
+
+  def main(args: Array[String]): Unit = {
+    val log: Logger = LoggerFactory.getLogger(getClass)
+
+    new SparkCreatePubmedDump("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json", args, log).initialize().run()
+
+  }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
new file mode 100644
index 0000000000..c9e8681852
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
@@ -0,0 +1,264 @@
+package eu.dnetlib.dhp.sx.bio.pubmed
+
+import org.apache.commons.lang3.StringUtils
+
+import javax.xml.stream.XMLEventReader
+import scala.collection.JavaConverters._
+import scala.xml.{MetaData, NodeSeq}
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
+
+class PMParser2 {
+
+  /** Extracts the value of an attribute from a MetaData object.
+    * @param attrs the MetaData object
+    * @param key the key of the attribute
+    * @return the value of the attribute or null if the attribute is not found
+    */
+  private def extractAttributes(attrs: MetaData, key: String): String = {
+
+    val res = attrs.get(key)
+    if (res.isDefined) {
+      val s = res.get
+      if (s != null && s.nonEmpty)
+        s.head.text
+      else
+        null
+    } else null
+  }
+
+  /** Formats a date from its year, month, and day components given as strings.
+    *
+    * @param year  the year as a string
+    * @param month the month as a string
+    * @param day   the day as a string
+    * @return      "year-MM-DD" with month and day zero-padded, or null if a component cannot be parsed as an integer
+    */
+  private def validate_Date(year: String, month: String, day: String): String = {
+    try {
+      // Only integer parsing is checked here; calendar ranges are not validated.
+      f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
+    } catch {
+      case scala.util.control.NonFatal(_) => null // fatal JVM errors must still propagate
+    }
+  }
+
+  /** Extracts the grant information from a NodeSeq object.
+    *
+    * @param gNode the NodeSeq object
+    * @return the grant information or an empty list if the grant information is not found
+    */
+  private def extractGrant(gNode: NodeSeq): List[PMGrant] = {
+    gNode
+      .map(node => {
+        val grantId = (node \ "GrantID").text
+        val agency = (node \ "Agency").text
+        val country = (node \ "Country").text
+        new PMGrant(grantId, agency, country)
+      })
+      .toList
+  }
+
+  /** Extracts the journal information from a NodeSeq object.
+    *
+    * @param jNode the NodeSeq object
+    * @return the journal information or null if the journal information is not found
+    */
+  private def extractJournal(jNode: NodeSeq): PMJournal = {
+    val journal = new PMJournal
+    journal.setTitle((jNode \ "Title").text)
+    journal.setIssn((jNode \ "ISSN").text)
+    journal.setVolume((jNode \ "JournalIssue" \ "Volume").text)
+    journal.setIssue((jNode \ "JournalIssue" \ "Issue").text)
+    if (journal.getTitle != null && StringUtils.isNotEmpty(journal.getTitle))
+      journal
+    else
+      null
+  }
+
+  private def extractAuthors(aNode: NodeSeq): List[PMAuthor] = {
+    aNode
+      .map(author => {
+        val a = new PMAuthor
+        a.setLastName((author \ "LastName").text)
+        a.setForeName((author \ "ForeName").text)
+        a
+      })
+      .toList
+  }
+
+  def parse(input: String): PMArticle = {
+    val xml = scala.xml.XML.loadString(input)
+    val article = new PMArticle
+
+    val grantNodes = xml \ "MedlineCitation" \\ "Grant"
+    article.setGrants(extractGrant(grantNodes).asJava)
+
+    val journal = xml \ "MedlineCitation" \ "Article" \ "Journal"
+    article.setJournal(extractJournal(journal))
+
+    val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author"
+
+    article.setAuthors(
+      authors
+        .map(author => {
+          val a = new PMAuthor
+          a.setLastName((author \ "LastName").text)
+          a.setForeName((author \ "ForeName").text)
+          a
+        })
+        .toList
+        .asJava
+    )
+
+    val pmId = xml \ "MedlineCitation" \ "PMID"
+
+    val articleIds = xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId"
+    articleIds.foreach(articleId => {
+      val idType = (articleId \ "@IdType").text
+      val id = articleId.text
+      if ("doi".equalsIgnoreCase(idType)) article.setDoi(id)
+      if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id)
+    })
+    article.setPmid(pmId.text)
+
+    val pubMedPubDate = xml \ "MedlineCitation" \ "DateCompleted"
+    val currentDate =
+      validate_Date((pubMedPubDate \ "Year").text, (pubMedPubDate \ "Month").text, (pubMedPubDate \ "Day").text)
+    if (currentDate != null) article.setDate(currentDate)
+
+    val articleTitle = xml \ "MedlineCitation" \ "Article" \ "ArticleTitle"
+    article.setTitle(articleTitle.text)
+
+    val abstractText = xml \ "MedlineCitation" \ "Article" \ "Abstract" \ "AbstractText"
+    if (abstractText != null && abstractText.text != null && abstractText.text.nonEmpty)
+      article.setDescription(abstractText.text.split("\n").map(s => s.trim).mkString(" ").trim)
+
+    val language = xml \ "MedlineCitation" \ "Article" \ "Language"
+    article.setLanguage(language.text)
+
+    val subjects = xml \ "MedlineCitation" \ "MeshHeadingList" \ "MeshHeading"
+    article.setSubjects(
+      subjects
+        .take(20)
+        .map(subject => {
+          val descriptorName = (subject \ "DescriptorName").text
+          val ui = (subject \ "DescriptorName" \ "@UI").text
+          val s = new PMSubject
+          s.setValue(descriptorName)
+          s.setMeshId(ui)
+          s
+        })
+        .toList
+        .asJava
+    )
+    val publicationTypes = xml \ "MedlineCitation" \ "Article" \ "PublicationTypeList" \ "PublicationType"
+    article.setPublicationTypes(
+      publicationTypes
+        .map(pt => {
+          val s = new PMSubject
+          s.setValue(pt.text)
+          s
+        })
+        .toList
+        .asJava
+    )
+
+    article
+  }
+  /** Reads events up to the closing PubmedArticle tag and returns the article built; null if it never arrives. */
+  def parse2(xml: XMLEventReader): PMArticle = {
+    var currentArticle: PMArticle = null
+    var currentSubject: PMSubject = null
+    var currentAuthor: PMAuthor = null
+    var currentJournal: PMJournal = null
+    var currentGrant: PMGrant = null
+    var currNode: String = null // label of the most recently opened element
+    var currentYear = "0" // Year/Month/Day keep the last text seen and are never reset between dates
+    var currentMonth = "01"
+    var currentDay = "01"
+    var currentArticleType: String = null // result of extractAttributes(attrs, "IdType") for the last ArticleId
+    // Consume events one by one; the loop exits through the return on the closing PubmedArticle tag.
+    while (xml.hasNext) {
+      val ne = xml.next
+      ne match {
+        case EvElemStart(_, label, attrs, _) =>
+          currNode = label // NOTE(review): not restored when the element closes
+
+          label match {
+            case "PubmedArticle" => currentArticle = new PMArticle
+            case "Author"        => currentAuthor = new PMAuthor
+            case "Journal"       => currentJournal = new PMJournal
+            case "Grant"         => currentGrant = new PMGrant
+            case "PublicationType" | "DescriptorName" =>
+              currentSubject = new PMSubject // one holder shared by both element kinds
+              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
+            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
+            case _           =>
+          }
+        case EvElemEnd(_, label) =>
+          label match {
+            case "PubmedArticle" => return currentArticle // article finished: stop reading
+            case "Author"        => currentArticle.getAuthors.add(currentAuthor)
+            case "Journal"       => currentArticle.setJournal(currentJournal)
+            case "Grant"         => currentArticle.getGrants.add(currentGrant)
+            case "PubMedPubDate" =>
+              if (currentArticle.getDate == null) // only while no date has been stored yet
+                currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
+            case "PubDate"         => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
+            case "DescriptorName"  => currentArticle.getSubjects.add(currentSubject)
+            case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
+            case _                 =>
+          }
+        case EvText(text) =>
+          if (currNode != null && text.trim.nonEmpty) // whitespace-only text is ignored
+            currNode match {
+              case "ArticleTitle" => {
+                if (currentArticle.getTitle == null)
+                  currentArticle.setTitle(text.trim)
+                else // further text chunks are appended with no separator
+                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
+              }
+              case "AbstractText" => {
+                if (currentArticle.getDescription == null)
+                  currentArticle.setDescription(text.trim)
+                else // further text chunks are appended with no separator
+                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
+              }
+              case "PMID" => currentArticle.setPmid(text.trim)
+              case "ArticleId" =>
+                if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
+                if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim)
+              case "Language"                           => currentArticle.setLanguage(text.trim)
+              case "ISSN"                               => currentJournal.setIssn(text.trim)
+              case "GrantID"                            => currentGrant.setGrantID(text.trim)
+              case "Agency"                             => currentGrant.setAgency(text.trim)
+              case "Country"                            => if (currentGrant != null) currentGrant.setCountry(text.trim)
+              case "Year"                               => currentYear = text.trim
+              case "Month"                              => currentMonth = text.trim
+              case "Day"                                => currentDay = text.trim
+              case "Volume"                             => currentJournal.setVolume(text.trim)
+              case "Issue"                              => currentJournal.setIssue(text.trim)
+              case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
+              case "LastName" => {
+                if (currentAuthor != null)
+                  currentAuthor.setLastName(text.trim)
+              }
+              case "ForeName" =>
+                if (currentAuthor != null)
+                  currentAuthor.setForeName(text.trim)
+              case "Title" =>
+                if (currentJournal.getTitle == null)
+                  currentJournal.setTitle(text.trim)
+                else
+                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
+              case _ =>
+                // NOTE(review): text after a nested element is matched against that element's label, not its parent's
+            }
+        case _ =>
+      }
+
+    }
+    null // stream ended before the closing PubmedArticle tag
+  }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
new file mode 100644
index 0000000000..4b4d860d7e
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
@@ -0,0 +1,222 @@
+<PubmedArticle>
+    <MedlineCitation Status="MEDLINE" IndexingMethod="Automated" Owner="NLM">
+        <PMID Version="1">37885214</PMID>
+        <DateCompleted>
+            <Year>2024</Year>
+            <Month>02</Month>
+            <Day>14</Day>
+        </DateCompleted>
+        <DateRevised>
+            <Year>2024</Year>
+            <Month>02</Month>
+            <Day>14</Day>
+        </DateRevised>
+        <Article PubModel="Print-Electronic">
+            <Journal>
+                <ISSN IssnType="Electronic">2752-7549</ISSN>
+                <JournalIssue CitedMedium="Internet">
+                    <Volume>40</Volume>
+                    <Issue>5</Issue>
+                    <PubDate>
+                        <MedlineDate>2023 Sep-Oct</MedlineDate>
+                    </PubDate>
+                </JournalIssue>
+                <Title>Journal of pediatric hematology/oncology nursing</Title>
+                <ISOAbbreviation>J Pediatr Hematol Oncol Nurs</ISOAbbreviation>
+            </Journal>
+            <ArticleTitle>Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country.</ArticleTitle>
+            <Pagination>
+                <MedlinePgn>295-304</MedlinePgn>
+            </Pagination>
+            <ELocationID EIdType="doi" ValidYN="Y">10.1177/27527530231193972</ELocationID>
+            <Abstract>
+                <AbstractText><b>Background:</b> Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. <b>Method:</b> Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. <b>Results:</b> Filipino parents (<i>N</i> = 156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. <b>Conclusions:</b> Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience.</AbstractText>
+            </Abstract>
+            <AuthorList CompleteYN="Y">
+                <Author ValidYN="Y">
+                    <LastName>Banayat</LastName>
+                    <ForeName>Aprille Campos</ForeName>
+                    <Initials>AC</Initials>
+                    <Identifier Source="ORCID">0000-0001-9339-9871</Identifier>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Abad</LastName>
+                    <ForeName>Peter James B</ForeName>
+                    <Initials>PJB</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Bonito</LastName>
+                    <ForeName>Sheila R</ForeName>
+                    <Initials>SR</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Manahan</LastName>
+                    <ForeName>Lydia T</ForeName>
+                    <Initials>LT</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+                <Author ValidYN="Y">
+                    <LastName>Peralta</LastName>
+                    <ForeName>Arnold B</ForeName>
+                    <Initials>AB</Initials>
+                    <AffiliationInfo>
+                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                    </AffiliationInfo>
+                </Author>
+            </AuthorList>
+            <Language>eng</Language>
+            <PublicationTypeList>
+                <PublicationType UI="D016428">Journal Article</PublicationType>
+            </PublicationTypeList>
+            <ArticleDate DateType="Electronic">
+                <Year>2023</Year>
+                <Month>10</Month>
+                <Day>26</Day>
+            </ArticleDate>
+        </Article>
+        <MedlineJournalInfo>
+            <Country>United States</Country>
+            <MedlineTA>J Pediatr Hematol Oncol Nurs</MedlineTA>
+            <NlmUniqueID>9918282681506676</NlmUniqueID>
+            <ISSNLinking>2752-7530</ISSNLinking>
+        </MedlineJournalInfo>
+        <CitationSubset>IM</CitationSubset>
+        <MeshHeadingList>
+            <MeshHeading>
+                <DescriptorName UI="D002648" MajorTopicYN="N">Child</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D010290" MajorTopicYN="Y">Parents</DescriptorName>
+                <QualifierName UI="Q000523" MajorTopicYN="N">psychology</QualifierName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D012944" MajorTopicYN="N">Social Support</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D029181" MajorTopicYN="N">Spirituality</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D012067" MajorTopicYN="N">Religion</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D009369" MajorTopicYN="Y">Neoplasms</DescriptorName>
+                <QualifierName UI="Q000628" MajorTopicYN="N">therapy</QualifierName>
+            </MeshHeading>
+        </MeshHeadingList>
+        <KeywordList Owner="NOTNLM">
+            <Keyword MajorTopicYN="N">cancer</Keyword>
+            <Keyword MajorTopicYN="N">mixed methods</Keyword>
+            <Keyword MajorTopicYN="N">parent</Keyword>
+            <Keyword MajorTopicYN="N">pediatric</Keyword>
+            <Keyword MajorTopicYN="N">research</Keyword>
+            <Keyword MajorTopicYN="N">supportive care</Keyword>
+        </KeywordList>
+        <CoiStatement>Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article.</CoiStatement>
+    </MedlineCitation>
+    <PubmedData>
+        <History>
+            <PubMedPubDate PubStatus="medline">
+                <Year>2024</Year>
+                <Month>2</Month>
+                <Day>12</Day>
+                <Hour>18</Hour>
+                <Minute>42</Minute>
+            </PubMedPubDate>
+            <PubMedPubDate PubStatus="pubmed">
+                <Year>2023</Year>
+                <Month>10</Month>
+                <Day>27</Day>
+                <Hour>6</Hour>
+                <Minute>42</Minute>
+            </PubMedPubDate>
+            <PubMedPubDate PubStatus="entrez">
+                <Year>2023</Year>
+                <Month>10</Month>
+                <Day>27</Day>
+                <Hour>3</Hour>
+                <Minute>43</Minute>
+            </PubMedPubDate>
+        </History>
+        <PublicationStatus>ppublish</PublicationStatus>
+        <ArticleIdList>
+            <ArticleId IdType="pubmed">37885214</ArticleId>
+            <ArticleId IdType="doi">10.1177/27527530231193972</ArticleId>
+        </ArticleIdList>
+    </PubmedData>
+</PubmedArticle>
+<DeleteCitation>
+<PMID Version="1">30522158</PMID>
+<PMID Version="1">32769323</PMID>
+<PMID Version="1">34061701</PMID>
+<PMID Version="1">34661197</PMID>
+<PMID Version="1">34837091</PMID>
+<PMID Version="1">35035475</PMID>
+<PMID Version="1">35211699</PMID>
+<PMID Version="1">35557982</PMID>
+<PMID Version="1">35782783</PMID>
+<PMID Version="1">35795240</PMID>
+<PMID Version="1">35832688</PMID>
+<PMID Version="1">35847411</PMID>
+<PMID Version="1">36081602</PMID>
+<PMID Version="1">36081858</PMID>
+<PMID Version="1">36468085</PMID>
+<PMID Version="1">36468934</PMID>
+<PMID Version="1">36580086</PMID>
+<PMID Version="1">36589526</PMID>
+<PMID Version="1">36619609</PMID>
+<PMID Version="1">36649460</PMID>
+<PMID Version="1">36654909</PMID>
+<PMID Version="1">36655054</PMID>
+<PMID Version="1">36700856</PMID>
+<PMID Version="1">36705625</PMID>
+<PMID Version="1">36713939</PMID>
+<PMID Version="1">36714172</PMID>
+<PMID Version="1">36741203</PMID>
+<PMID Version="1">36741905</PMID>
+<PMID Version="1">36743825</PMID>
+<PMID Version="1">36788221</PMID>
+<PMID Version="1">36844926</PMID>
+<PMID Version="1">36846546</PMID>
+<PMID Version="1">36935776</PMID>
+<PMID Version="1">36946757</PMID>
+<PMID Version="1">36972191</PMID>
+<PMID Version="1">37034422</PMID>
+<PMID Version="1">37124311</PMID>
+<PMID Version="1">37152108</PMID>
+<PMID Version="1">37171968</PMID>
+<PMID Version="1">37273889</PMID>
+<PMID Version="1">37333905</PMID>
+<PMID Version="1">37387733</PMID>
+<PMID Version="1">37431449</PMID>
+<PMID Version="1">37576947</PMID>
+<PMID Version="1">37601162</PMID>
+<PMID Version="1">37711214</PMID>
+<PMID Version="1">37901290</PMID>
+<PMID Version="1">37981909</PMID>
+<PMID Version="1">37981945</PMID>
+<PMID Version="1">37982005</PMID>
+<PMID Version="1">38037601</PMID>
+<PMID Version="1">38037602</PMID>
+<PMID Version="1">38150730</PMID>
+<PMID Version="1">38274640</PMID>
+<PMID Version="1">38332671</PMID>
+<PMID Version="1">38334184</PMID>
+<PMID Version="1">38335456</PMID>
+<PMID Version="1">38349506</PMID>
+<PMID Version="1">38349576</PMID>
+<PMID Version="1">38353676</PMID>
+</DeleteCitation>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index c4af14c409..1374b741d7 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -5,7 +5,10 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
 import eu.dnetlib.dhp.schema.oaf.utils.PidType
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
-import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
+import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf}
+import org.apache.commons.io.IOUtils
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
@@ -13,8 +16,9 @@ import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.extension.ExtendWith
 import org.junit.jupiter.api.{BeforeEach, Test}
 import org.mockito.junit.jupiter.MockitoExtension
+import org.slf4j.LoggerFactory
 
-import java.io.{BufferedReader, InputStream, InputStreamReader}
+import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
 import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
@@ -48,6 +52,17 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
+  /** Parses the single_pubmed.xml fixture with PMParser2 and prints the article serialized by the ObjectMapper. */
+  @Test
+  def testParsingPubmed2(): Unit = {
+    val mapper = new ObjectMapper()
+    val xmlStream = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml")
+    // explicit charset: the one-argument IOUtils.toString decodes with the platform default
+    val xml = IOUtils.toString(xmlStream, "UTF-8")
+    val article = new PMParser2().parse(xml)
+    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article))
+  }
+
   @Test
   def testEBIData() = {
     val inputFactory = XMLInputFactory.newInstance
@@ -124,6 +139,15 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
+  @Test
+  def testPubmedSplitting(): Unit = {
+
+    val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()
+    new SparkCreatePubmedDump("", Array.empty, LoggerFactory.getLogger(getClass))
+      .createPubmedDump(spark, "/home/sandro/Downloads/pubmed", "/home/sandro/Downloads/pubmed_mapped", vocabularies)
+    // Runs the dump on a local session. NOTE(review): paths under /home/sandro are presumably machine-specific
+  }
+
   @Test
   def testPubmedOriginalID(): Unit = {
     val article: PMArticle = new PMArticle

From c1cef5d685373ad28dac341b56fc459cd43ff606 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo <sandro.labruzzo@gmail.com>
Date: Tue, 5 Nov 2024 10:38:40 +0100
Subject: [PATCH 2/8] replaced the old Joda-Time library with the standard
 java.time API introduced in Java 8

---
 .../plugin/gtr2/Gtr2PublicationsIterator.java | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
index 5b8f48680a..779c43712a 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
@@ -1,6 +1,8 @@
 
 package eu.dnetlib.dhp.collection.plugin.gtr2;
 
+import java.nio.charset.StandardCharsets;
+import java.time.LocalDate;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -16,9 +18,7 @@ import org.dom4j.Document;
 import org.dom4j.DocumentException;
 import org.dom4j.DocumentHelper;
 import org.dom4j.Element;
-import org.joda.time.DateTime;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,7 +33,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
 	private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class);
 
 	private final HttpConnector2 connector;
-	private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
+	private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
 
 	private static final int MAX_ATTEMPTS = 10;
 
@@ -41,7 +41,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
 	private int currPage;
 	private int endPage;
 	private boolean incremental = false;
-	private DateTime fromDate;
+	private LocalDate fromDate;
 
 	private final Map<String, String> cache = new HashMap<>();
 
@@ -188,28 +188,28 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
 
 	private Document loadURL(final String cleanUrl, final int attempt) {
 		try {
-			log.debug("  * Downloading Url: " + cleanUrl);
-			final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes("UTF-8");
+            log.debug("  * Downloading Url: {}", cleanUrl);
+			final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes(StandardCharsets.UTF_8);
 			return DocumentHelper.parseText(new String(bytes));
 		} catch (final Throwable e) {
-			log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e);
+            log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e);
 			if (attempt >= MAX_ATTEMPTS) {
-				throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
+				throw new RuntimeException("Error downloading url: " + cleanUrl, e);
 			}
 			try {
 				Thread.sleep(60000); // I wait for a minute
 			} catch (final InterruptedException e1) {
-				throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
+				throw new RuntimeException("Error downloading url: " + cleanUrl, e);
 			}
 			return loadURL(cleanUrl, attempt + 1);
 		}
 	}
 
-	private DateTime parseDate(final String s) {
-		return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
+	private LocalDate parseDate(final String s) {
+		return LocalDate.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
 	}
 
-	private boolean isAfter(final String d, final DateTime fromDate) {
+	private boolean isAfter(final String d, final LocalDate fromDate) {
 		return StringUtils.isNotBlank(d) && parseDate(d).isAfter(fromDate);
 	}
 }

From 0d0904f4ec2b1d4424d5d8d5d6d782049c0544cd Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo <sandro.labruzzo@gmail.com>
Date: Mon, 11 Nov 2024 10:27:23 +0100
Subject: [PATCH 3/8] updated workflow baseline to direct transform on OAF

---
 .../sx/bio/ebi/baseline_to_oaf_params.json    |  8 ++---
 .../dhp/sx/bio/pubmed/oozie_app/workflow.xml  | 17 ++++-------
 .../sx/bio/ebi/SparkCreatePubmedDump.scala    | 29 ++++++++++++-------
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
index 8326fab0f3..0fcc03266e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
@@ -1,7 +1,7 @@
 [
-  {"paramName":"mt",  "paramLongName":"master",         "paramDescription": "should be local or yarn",                  "paramRequired": true},
-  {"paramName":"i",   "paramLongName":"isLookupUrl",    "paramDescription": "isLookupUrl",                              "paramRequired": true},
-  {"paramName":"s",   "paramLongName":"sourcePath",    "paramDescription": "the baseline path",  "paramRequired": true},
-  {"paramName":"t",   "paramLongName":"targetPath",     "paramDescription": "the mdstore path to save",                 "paramRequired": true}
+  {"paramName":"mt",  "paramLongName":"master",               "paramDescription": "should be local or yarn",                  "paramRequired": true},
+  {"paramName":"i",   "paramLongName":"isLookupUrl",          "paramDescription": "isLookupUrl",                              "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath",           "paramDescription": "the baseline path",                        "paramRequired": true},
+  {"paramName":"mo",  "paramLongName":"mdstoreOutputVersion", "paramDescription": "the output MDStoreVersion as JSON",       "paramRequired": true}
 
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
index 30eb414698..0f4c5c2495 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
@@ -16,11 +16,6 @@
             <name>mdStoreManagerURI</name>
             <description>the path of the cleaned mdstore</description>
         </property>
-        <property>
-            <name>skipUpdate</name>
-            <value>false</value>
-            <description>The request block size</description>
-        </property>
     </parameters>
 
     <start to="StartTransaction"/>
@@ -44,16 +39,16 @@
             <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
             <capture-output/>
         </java>
-        <ok to="ConvertDataset"/>
+        <ok to="TransformPubMed"/>
         <error to="RollBack"/>
     </action>
 
-    <action name="ConvertDataset">
+    <action name="TransformPubMed">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Convert Baseline to OAF Dataset</name>
-            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
+            <name>Convert Baseline Pubmed to OAF Dataset</name>
+            <class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump</class>
             <jar>dhp-aggregation-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
@@ -65,12 +60,10 @@
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>
-            <arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
+            <arg>--sourcePath</arg><arg>${baselineWorkingPath}</arg>
             <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
             <arg>--master</arg><arg>yarn</arg>
             <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
-            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
-            <arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
         </spark>
         <ok to="CommitVersion"/>
         <error to="RollBack"/>
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
index c21bfd7c3b..1bdd2a4bcc 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala
@@ -1,18 +1,14 @@
 package eu.dnetlib.dhp.sx.bio.ebi
 
 import com.fasterxml.jackson.databind.ObjectMapper
-import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser}
+import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.oaf.Oaf
-import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2, PubMedToOaf}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 
-import java.io.ByteArrayInputStream
-import javax.xml.stream.XMLInputFactory
-
 class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
     extends AbstractScalaApplication(propertyPath, args, log: Logger) {
 
@@ -24,16 +20,26 @@ class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logg
     log.info("isLookupUrl: {}", isLookupUrl)
     val sourcePath = parser.get("sourcePath")
     log.info(s"SourcePath is '$sourcePath'")
-    val targetPath = parser.get("targetPath")
-    log.info(s"TargetPath is '$targetPath'")
+    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+    log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
+    val mapper = new ObjectMapper()
+    val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+    log.info(s"outputBasePath is '$outputBasePath'")
 
     val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
     val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
 
-    createPubmedDump(spark, sourcePath, targetPath, vocabularies)
+    createPubmedDump(spark, sourcePath, outputBasePath, vocabularies)
 
   }
 
+  /** This method creates a dump of the pubmed articles
+    * @param spark the spark session
+    * @param sourcePath the path of the source file
+    * @param targetPath the path of the target file
+    * @param vocabularies the vocabularies
+    */
   def createPubmedDump(
     spark: SparkSession,
     sourcePath: String,
@@ -54,6 +60,7 @@ class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logg
       })
       .filter(s => s != null)
       .map { i =>
+        //remove try catch
         try {
           new PMParser2().parse(i)
         } catch {

From 19ce783e58f4f9176f4dc9a98d9bb250dc615e0d Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo" <sandro.labruzzo@gmail.com>
Date: Mon, 11 Nov 2024 12:28:02 +0100
Subject: [PATCH 4/8] renamed workflow

---
 .../eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
index 0f4c5c2495..2a51b40509 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>baselineWorkingPath</name>

From 474f365286b7f83b8d58e4969277b3e67ebdd0cc Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo" <sandro.labruzzo@gmail.com>
Date: Mon, 11 Nov 2024 12:37:27 +0100
Subject: [PATCH 5/8] removed wrong test

---
 .../test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index 1374b741d7..c942ca1329 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -6,9 +6,9 @@ import eu.dnetlib.dhp.schema.oaf.utils.PidType
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
 import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
-import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf}
+import eu.dnetlib.dhp.sx.bio.pubmed._
 import org.apache.commons.io.IOUtils
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.apache.spark.sql.SparkSession
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
@@ -18,13 +18,12 @@ import org.junit.jupiter.api.{BeforeEach, Test}
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.LoggerFactory
 
-import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
+import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
 import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
-import scala.xml.pull.XMLEventReader
 
 @ExtendWith(Array(classOf[MockitoExtension]))
 class BioScholixTest extends AbstractVocabularyTest {
@@ -139,7 +138,6 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
-  @Test
   def testPubmedSplitting(): Unit = {
 
     val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()

From a1d5ad5c2609c91b60b97600163072d551dcc440 Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo" <sandro.labruzzo@gmail.com>
Date: Wed, 13 Nov 2024 09:51:13 +0100
Subject: [PATCH 6/8] code formatted

---
 .../dhp/schema/oaf/utils/MergeUtils.java      | 423 +++++++++---------
 .../personentity/ExtractPerson.java           |  22 +-
 .../plugin/gtr2/Gtr2PublicationsIterator.java |   6 +-
 3 files changed, 228 insertions(+), 223 deletions(-)

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
index dc76860f81..cd85065830 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@@ -16,8 +16,6 @@ import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
-import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
-import eu.dnetlib.dhp.schema.common.EntityType;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -25,8 +23,10 @@ import org.apache.commons.lang3.tuple.Pair;
 import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.base.Joiner;
 
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.common.AccessRightComparator;
+import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
@@ -46,7 +46,7 @@ public class MergeUtils {
 	}
 
 	public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
-											   boolean checkDelegateAuthority, VocabularyGroup vocs) {
+		boolean checkDelegateAuthority, VocabularyGroup vocs) {
 
 		ArrayList<T> sortedEntities = new ArrayList<>();
 		oafEntityIterator.forEachRemaining(sortedEntities::add);
@@ -74,14 +74,15 @@ public class MergeUtils {
 			if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 				return (T) mergedResult;
 			} else {
-				final Qualifier expectedResultType = vocs.lookupTermBySynonym(
+				final Qualifier expectedResultType = vocs
+					.lookupTermBySynonym(
 						ModelConstants.DNET_RESULT_TYPOLOGIES,
 						i.getInstancetype().getClassid());
 
 				if (Objects.isNull(expectedResultType)) {
 					throw new IllegalArgumentException(
-							"instance type not bound to any result type in dnet:result_typologies: " +
-									i.getInstancetype().getClassid());
+						"instance type not bound to any result type in dnet:result_typologies: " +
+							i.getInstancetype().getClassid());
 				}
 
 				// there is a clash among the result types
@@ -122,10 +123,10 @@ public class MergeUtils {
 			return mergeRelation((Relation) left, (Relation) right);
 		} else {
 			throw new RuntimeException(
-					String
-							.format(
-									"MERGE_FROM_AND_GET incompatible types: %s, %s",
-									left.getClass().getCanonicalName(), right.getClass().getCanonicalName()));
+				String
+					.format(
+						"MERGE_FROM_AND_GET incompatible types: %s, %s",
+						left.getClass().getCanonicalName(), right.getClass().getCanonicalName()));
 		}
 	}
 
@@ -164,10 +165,10 @@ public class MergeUtils {
 			return mergeProject((Project) left, (Project) right);
 		} else {
 			throw new RuntimeException(
-					String
-							.format(
-									"MERGE_FROM_AND_GET incompatible types: %s, %s",
-									left.getClass().getCanonicalName(), right.getClass().getCanonicalName()));
+				String
+					.format(
+						"MERGE_FROM_AND_GET incompatible types: %s, %s",
+						left.getClass().getCanonicalName(), right.getClass().getCanonicalName()));
 		}
 	}
 
@@ -258,7 +259,7 @@ public class MergeUtils {
 	}
 
 	private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
-											 Function<T, K> keyExtractor, BinaryOperator<T> merger) {
+		Function<T, K> keyExtractor, BinaryOperator<T> merger) {
 		if (left == null || left.isEmpty()) {
 			return right != null ? right : new ArrayList<>();
 		} else if (right == null || right.isEmpty()) {
@@ -269,11 +270,11 @@ public class MergeUtils {
 		List<T> l = trust >= 0 ? right : left;
 
 		return new ArrayList<>(Stream
-				.concat(h.stream(), l.stream())
-				.filter(Objects::nonNull)
-				.distinct()
-				.collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new))
-				.values());
+			.concat(h.stream(), l.stream())
+			.filter(Objects::nonNull)
+			.distinct()
+			.collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new))
+			.values());
 	}
 
 	private static <T, K> List<T> unionDistinctLists(final List<T> left, final List<T> right, int trust) {
@@ -287,10 +288,10 @@ public class MergeUtils {
 		List<T> l = trust >= 0 ? right : left;
 
 		return Stream
-				.concat(h.stream(), l.stream())
-				.filter(Objects::nonNull)
-				.distinct()
-				.collect(Collectors.toList());
+			.concat(h.stream(), l.stream())
+			.filter(Objects::nonNull)
+			.distinct()
+			.collect(Collectors.toList());
 	}
 
 	private static List<String> unionDistinctListOfString(final List<String> l, final List<String> r) {
@@ -301,10 +302,10 @@ public class MergeUtils {
 		}
 
 		return Stream
-				.concat(l.stream(), r.stream())
-				.filter(StringUtils::isNotBlank)
-				.distinct()
-				.collect(Collectors.toList());
+			.concat(l.stream(), r.stream())
+			.filter(StringUtils::isNotBlank)
+			.distinct()
+			.collect(Collectors.toList());
 	}
 
 	// TODO review
@@ -330,7 +331,7 @@ public class MergeUtils {
 	}
 
 	private static List<StructuredProperty> unionTitle(List<StructuredProperty> left, List<StructuredProperty> right,
-													   int trust) {
+		int trust) {
 		if (left == null) {
 			return right;
 		} else if (right == null) {
@@ -341,10 +342,10 @@ public class MergeUtils {
 		List<StructuredProperty> l = trust >= 0 ? right : left;
 
 		return Stream
-				.concat(h.stream(), l.stream())
-				.filter(Objects::isNull)
-				.distinct()
-				.collect(Collectors.toList());
+			.concat(h.stream(), l.stream())
+			.filter(Objects::isNull)
+			.distinct()
+			.collect(Collectors.toList());
 	}
 
 	/**
@@ -379,8 +380,8 @@ public class MergeUtils {
 		merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
 		merged.setDateofcollection(LocalDateTime.now().toString());
 		merged
-				.setDateoftransformation(
-						chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust));
+			.setDateoftransformation(
+				chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust));
 		merged.setExtraInfo(unionDistinctLists(merged.getExtraInfo(), enrich.getExtraInfo(), trust));
 		// When merging records OAI provenance becomes null
 		merged.setOaiprovenance(null);
@@ -397,7 +398,7 @@ public class MergeUtils {
 		checkArgument(Objects.equals(merge.getTarget(), enrich.getTarget()), "target ids must be equal");
 		checkArgument(Objects.equals(merge.getRelType(), enrich.getRelType()), "relType(s) must be equal");
 		checkArgument(
-				Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal");
+			Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal");
 		checkArgument(Objects.equals(merge.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal");
 
 		// merge.setProvenance(mergeLists(merge.getProvenance(), enrich.getProvenance()));
@@ -408,10 +409,10 @@ public class MergeUtils {
 			merge.setValidationDate(ModelSupport.oldest(merge.getValidationDate(), enrich.getValidationDate()));
 		} catch (ParseException e) {
 			throw new IllegalArgumentException(String
-					.format(
-							"invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(),
-							merge.getTarget(),
-							merge.getValidationDate()));
+				.format(
+					"invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(),
+					merge.getTarget(),
+					merge.getValidationDate()));
 		}
 
 		// TODO keyvalue merge
@@ -425,7 +426,7 @@ public class MergeUtils {
 		T merge = mergeOafEntityFields(original, enrich, trust);
 
 		if (merge.getProcessingchargeamount() == null
-				|| StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) {
+			|| StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) {
 			merge.setProcessingchargeamount(enrich.getProcessingchargeamount());
 			merge.setProcessingchargecurrency(enrich.getProcessingchargecurrency());
 		}
@@ -457,8 +458,8 @@ public class MergeUtils {
 		}
 
 		merge
-				.setDateofacceptance(
-						mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust));
+			.setDateofacceptance(
+				mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust));
 
 		merge.setPublisher(coalesce(merge.getPublisher(), enrich.getPublisher()));
 		merge.setEmbargoenddate(coalesce(merge.getEmbargoenddate(), enrich.getEmbargoenddate()));
@@ -473,7 +474,7 @@ public class MergeUtils {
 		merge.setCoverage(unionDistinctLists(merge.getCoverage(), enrich.getCoverage(), trust));
 
 		if (enrich.getBestaccessright() != null
-				&& new AccessRightComparator<>()
+			&& new AccessRightComparator<>()
 				.compare(enrich.getBestaccessright(), merge.getBestaccessright()) < 0) {
 			merge.setBestaccessright(enrich.getBestaccessright());
 		}
@@ -486,8 +487,8 @@ public class MergeUtils {
 
 		// ok
 		merge
-				.setExternalReference(
-						mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust));
+			.setExternalReference(
+				mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust));
 
 		// instance enrichment or union
 		// review instance equals => add pid to comparision
@@ -495,17 +496,17 @@ public class MergeUtils {
 			merge.setInstance(mergeInstances(merge.getInstance(), enrich.getInstance(), trust));
 		} else {
 			final List<Instance> enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance()
-					: enrich.getInstance();
+				: enrich.getInstance();
 			final List<Instance> enrichedInstances = isAnEnrichment(merge) ? enrich.getInstance()
-					: merge.getInstance();
+				: merge.getInstance();
 			if (isAnEnrichment(merge))
 				merge.setDataInfo(enrich.getDataInfo());
 			merge.setInstance(enrichInstances(enrichedInstances, enrichmentInstances));
 		}
 
 		merge
-				.setEoscifguidelines(
-						mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust));
+			.setEoscifguidelines(
+				mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust));
 		merge.setIsGreen(booleanOR(merge.getIsGreen(), enrich.getIsGreen()));
 		// OK but should be list of values
 		merge.setOpenAccessColor(coalesce(merge.getOpenAccessColor(), enrich.getOpenAccessColor()));
@@ -531,7 +532,7 @@ public class MergeUtils {
 						LocalDate enrich_date = LocalDate.parse(enrich.getValue(), DateTimeFormatter.ISO_DATE);
 
 						if (enrich_date.getYear() > 1300
-								&& (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) {
+							&& (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) {
 							return enrich;
 						}
 					} catch (NullPointerException | DateTimeParseException e) {
@@ -549,56 +550,56 @@ public class MergeUtils {
 
 	private static List<Instance> mergeInstances(List<Instance> v1, List<Instance> v2, int trust) {
 		return mergeLists(
-				v1, v2, trust,
-				MergeUtils::instanceKeyExtractor,
-				MergeUtils::instanceMerger);
+			v1, v2, trust,
+			MergeUtils::instanceKeyExtractor,
+			MergeUtils::instanceMerger);
 	}
 
 	private static List<EoscIfGuidelines> mergeEosciifguidelines(List<EoscIfGuidelines> v1, List<EoscIfGuidelines> v2,
-																 int trust) {
+		int trust) {
 		return mergeLists(
-				v1, v2, trust, er -> Joiner
-						.on("||")
-						.useForNull("")
-						.join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()),
-				(r, l) -> r);
+			v1, v2, trust, er -> Joiner
+				.on("||")
+				.useForNull("")
+				.join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()),
+			(r, l) -> r);
 
 	}
 
 	private static List<ExternalReference> mergeExternalReference(List<ExternalReference> v1,
-																  List<ExternalReference> v2, int trust) {
+		List<ExternalReference> v2, int trust) {
 		return mergeLists(
-				v1, v2, trust, er -> Joiner
-						.on(',')
-						.useForNull("")
-						.join(
-								er.getSitename(), er.getLabel(),
-								er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(),
-								er.getQuery(), toString(er.getDataInfo())),
-				(r, l) -> r);
+			v1, v2, trust, er -> Joiner
+				.on(',')
+				.useForNull("")
+				.join(
+					er.getSitename(), er.getLabel(),
+					er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(),
+					er.getQuery(), toString(er.getDataInfo())),
+			(r, l) -> r);
 	}
 
 	private static String toString(DataInfo di) {
 		return Joiner
-				.on(',')
-				.useForNull("")
-				.join(
-						di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(),
-						di.getInferenceprovenance(), toString(di.getProvenanceaction()));
+			.on(',')
+			.useForNull("")
+			.join(
+				di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(),
+				di.getInferenceprovenance(), toString(di.getProvenanceaction()));
 	}
 
 	private static String toString(Qualifier q) {
 		return Joiner
-				.on(',')
-				.useForNull("")
-				.join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename());
+			.on(',')
+			.useForNull("")
+			.join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename());
 	}
 
 	private static String toString(StructuredProperty sp) {
 		return Joiner
-				.on(',')
-				.useForNull("")
-				.join(toString(sp.getQualifier()), sp.getValue());
+			.on(',')
+			.useForNull("")
+			.join(toString(sp.getQualifier()), sp.getValue());
 	}
 
 	private static <T extends StructuredProperty> List<T> mergeStructuredProperties(List<T> v1, List<T> v2, int trust) {
@@ -637,17 +638,17 @@ public class MergeUtils {
 		// 2. @@
 		// 3. ||
 		return String
-				.join(
-						"::",
-						kvKeyExtractor(i.getHostedby()),
-						kvKeyExtractor(i.getCollectedfrom()),
-						qualifierKeyExtractor(i.getAccessright()),
-						qualifierKeyExtractor(i.getInstancetype()),
-						Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
-						Optional
-								.ofNullable(i.getPid())
-								.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
-								.orElse(null));
+			.join(
+				"::",
+				kvKeyExtractor(i.getHostedby()),
+				kvKeyExtractor(i.getCollectedfrom()),
+				qualifierKeyExtractor(i.getAccessright()),
+				qualifierKeyExtractor(i.getInstancetype()),
+				Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
+				Optional
+					.ofNullable(i.getPid())
+					.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
+					.orElse(null));
 	}
 
 	private static Instance instanceMerger(Instance i1, Instance i2) {
@@ -658,30 +659,30 @@ public class MergeUtils {
 		i.setInstancetype(i1.getInstancetype());
 		i.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1));
 		i
-				.setAlternateIdentifier(
-						mergeLists(
-								i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor,
-								(sp1, sp2) -> sp1));
+			.setAlternateIdentifier(
+				mergeLists(
+					i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor,
+					(sp1, sp2) -> sp1));
 
 		i
-				.setRefereed(
-						Collections
-								.min(
-										Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()),
-										new RefereedComparator()));
+			.setRefereed(
+				Collections
+					.min(
+						Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()),
+						new RefereedComparator()));
 		i
-				.setInstanceTypeMapping(
-						mergeLists(
-								i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0,
-								MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1));
+			.setInstanceTypeMapping(
+				mergeLists(
+					i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0,
+					MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1));
 		i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext()));
 		i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance()));
 		i.setLicense(coalesce(i1.getLicense(), i2.getLicense()));
 		i.setProcessingchargeamount(coalesce(i1.getProcessingchargeamount(), i2.getProcessingchargeamount()));
 		i.setProcessingchargecurrency(coalesce(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency()));
 		i
-				.setMeasures(
-						mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1));
+			.setMeasures(
+				mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1));
 
 		i.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl()));
 
@@ -690,14 +691,14 @@ public class MergeUtils {
 
 	private static String measureKeyExtractor(Measure m) {
 		return String
-				.join(
-						"::",
-						m.getId(),
-						m
-								.getUnit()
-								.stream()
-								.map(KeyValue::getKey)
-								.collect(Collectors.joining("::")));
+			.join(
+				"::",
+				m.getId(),
+				m
+					.getUnit()
+					.stream()
+					.map(KeyValue::getKey)
+					.collect(Collectors.joining("::")));
 	}
 
 	private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
@@ -708,16 +709,16 @@ public class MergeUtils {
 		}
 
 		return Stream
-				.of(d1, d2)
-				.min(
-						Comparator
-								.comparing(
-										f -> DateParserUtils
-												.parseDate(f.getValue())
-												.toInstant()
-												.atZone(ZoneId.systemDefault())
-												.toLocalDate()))
-				.orElse(d1);
+			.of(d1, d2)
+			.min(
+				Comparator
+					.comparing(
+						f -> DateParserUtils
+							.parseDate(f.getValue())
+							.toInstant()
+							.atZone(ZoneId.systemDefault())
+							.toLocalDate()))
+			.orElse(d1);
 	}
 
 	private static String selectFulltext(String ft1, String ft2) {
@@ -732,12 +733,12 @@ public class MergeUtils {
 
 	private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) {
 		return String
-				.join(
-						"::",
-						itm.getOriginalType(),
-						itm.getTypeCode(),
-						itm.getTypeLabel(),
-						itm.getVocabularyName());
+			.join(
+				"::",
+				itm.getOriginalType(),
+				itm.getTypeCode(),
+				itm.getTypeLabel(),
+				itm.getVocabularyName());
 	}
 
 	private static String kvKeyExtractor(KeyValue kv) {
@@ -754,13 +755,13 @@ public class MergeUtils {
 
 	private static String spKeyExtractor(StructuredProperty sp) {
 		return Optional
-				.ofNullable(sp)
-				.map(
-						s -> Joiner
-								.on("||")
-								.useForNull("")
-								.join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
-				.orElse(null);
+			.ofNullable(sp)
+			.map(
+				s -> Joiner
+					.on("||")
+					.useForNull("")
+					.join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
+			.orElse(null);
 	}
 
 	private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
@@ -782,8 +783,8 @@ public class MergeUtils {
 		merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
 		merge.setCodeRepositoryUrl(chooseReference(merge.getCodeRepositoryUrl(), enrich.getCodeRepositoryUrl(), trust));
 		merge
-				.setProgrammingLanguage(
-						chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust));
+			.setProgrammingLanguage(
+				chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust));
 
 		return merge;
 	}
@@ -797,11 +798,11 @@ public class MergeUtils {
 		merge.setSize(chooseReference(merge.getSize(), enrich.getSize(), trust));
 		merge.setVersion(chooseReference(merge.getVersion(), enrich.getVersion(), trust));
 		merge
-				.setLastmetadataupdate(
-						chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust));
+			.setLastmetadataupdate(
+				chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust));
 		merge
-				.setMetadataversionnumber(
-						chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), trust));
+			.setMetadataversionnumber(
+				chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), trust));
 		merge.setGeolocation(unionDistinctLists(merge.getGeolocation(), enrich.getGeolocation(), trust));
 
 		return merge;
@@ -823,26 +824,26 @@ public class MergeUtils {
 		merged.setLegalshortname(chooseReference(merged.getLegalshortname(), enrich.getLegalshortname(), trust));
 		merged.setLegalname(chooseReference(merged.getLegalname(), enrich.getLegalname(), trust));
 		merged
-				.setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust));
+			.setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust));
 		merged.setWebsiteurl(chooseReference(merged.getWebsiteurl(), enrich.getWebsiteurl(), trust));
 		merged.setLogourl(chooseReference(merged.getLogourl(), enrich.getLogourl(), trust));
 		merged.setEclegalbody(chooseReference(merged.getEclegalbody(), enrich.getEclegalbody(), trust));
 		merged.setEclegalperson(chooseReference(merged.getEclegalperson(), enrich.getEclegalperson(), trust));
 		merged.setEcnonprofit(chooseReference(merged.getEcnonprofit(), enrich.getEcnonprofit(), trust));
 		merged
-				.setEcresearchorganization(
-						chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust));
+			.setEcresearchorganization(
+				chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust));
 		merged
-				.setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust));
+			.setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust));
 		merged
-				.setEcinternationalorganizationeurinterests(
-						chooseReference(
-								merged.getEcinternationalorganizationeurinterests(),
-								enrich.getEcinternationalorganizationeurinterests(), trust));
+			.setEcinternationalorganizationeurinterests(
+				chooseReference(
+					merged.getEcinternationalorganizationeurinterests(),
+					enrich.getEcinternationalorganizationeurinterests(), trust));
 		merged
-				.setEcinternationalorganization(
-						chooseReference(
-								merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust));
+			.setEcinternationalorganization(
+				chooseReference(
+					merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust));
 		merged.setEcenterprise(chooseReference(merged.getEcenterprise(), enrich.getEcenterprise(), trust));
 		merged.setEcsmevalidated(chooseReference(merged.getEcsmevalidated(), enrich.getEcsmevalidated(), trust));
 		merged.setEcnutscode(chooseReference(merged.getEcnutscode(), enrich.getEcnutscode(), trust));
@@ -866,8 +867,8 @@ public class MergeUtils {
 		merged.setDuration(chooseReference(merged.getDuration(), enrich.getDuration(), trust));
 		merged.setEcsc39(chooseReference(merged.getEcsc39(), enrich.getEcsc39(), trust));
 		merged
-				.setOamandatepublications(
-						chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust));
+			.setOamandatepublications(
+				chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust));
 		merged.setEcarticle29_3(chooseReference(merged.getEcarticle29_3(), enrich.getEcarticle29_3(), trust));
 		merged.setSubjects(unionDistinctLists(merged.getSubjects(), enrich.getSubjects(), trust));
 		merged.setFundingtree(unionDistinctLists(merged.getFundingtree(), enrich.getFundingtree(), trust));
@@ -893,8 +894,8 @@ public class MergeUtils {
 		}
 
 		merged
-				.setH2020classification(
-						unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust));
+			.setH2020classification(
+				unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust));
 
 		return merged;
 	}
@@ -921,7 +922,7 @@ public class MergeUtils {
 	 * @return list of instances possibly enriched
 	 */
 	private static List<Instance> enrichInstances(final List<Instance> toEnrichInstances,
-												  final List<Instance> enrichmentInstances) {
+		final List<Instance> enrichmentInstances) {
 		final List<Instance> enrichmentResult = new ArrayList<>();
 
 		if (toEnrichInstances == null) {
@@ -959,42 +960,42 @@ public class MergeUtils {
 	 */
 	private static Map<String, Instance> toInstanceMap(final List<Instance> ri) {
 		return ri
-				.stream()
-				.filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null)
-				.flatMap(i -> {
-					final List<Pair<String, Instance>> result = new ArrayList<>();
-					if (i.getPid() != null)
-						i
-								.getPid()
-								.stream()
-								.filter(MergeUtils::validPid)
-								.forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i)));
-					if (i.getAlternateIdentifier() != null)
-						i
-								.getAlternateIdentifier()
-								.stream()
-								.filter(MergeUtils::validPid)
-								.forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i)));
-					return result.stream();
-				})
-				.collect(
-						Collectors
-								.toMap(
-										Pair::getLeft,
-										Pair::getRight,
-										(a, b) -> a));
+			.stream()
+			.filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null)
+			.flatMap(i -> {
+				final List<Pair<String, Instance>> result = new ArrayList<>();
+				if (i.getPid() != null)
+					i
+						.getPid()
+						.stream()
+						.filter(MergeUtils::validPid)
+						.forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i)));
+				if (i.getAlternateIdentifier() != null)
+					i
+						.getAlternateIdentifier()
+						.stream()
+						.filter(MergeUtils::validPid)
+						.forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i)));
+				return result.stream();
+			})
+			.collect(
+				Collectors
+					.toMap(
+						Pair::getLeft,
+						Pair::getRight,
+						(a, b) -> a));
 	}
 
 	private static boolean isFromDelegatedAuthority(Result r) {
 		return Optional
-				.ofNullable(r.getInstance())
-				.map(
-						instance -> instance
-								.stream()
-								.filter(i -> Objects.nonNull(i.getCollectedfrom()))
-								.map(i -> i.getCollectedfrom().getKey())
-								.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
-				.orElse(false);
+			.ofNullable(r.getInstance())
+			.map(
+				instance -> instance
+					.stream()
+					.filter(i -> Objects.nonNull(i.getCollectedfrom()))
+					.map(i -> i.getCollectedfrom().getKey())
+					.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
+			.orElse(false);
 	}
 
 	/**
@@ -1030,15 +1031,15 @@ public class MergeUtils {
 	 * @return the list
 	 */
 	private static List<Instance> findEnrichmentsByPID(final List<StructuredProperty> pids,
-													   final Map<String, Instance> enrichments) {
+		final Map<String, Instance> enrichments) {
 		if (pids == null || enrichments == null)
 			return null;
 		return pids
-				.stream()
-				.map(MergeUtils::extractKeyFromPid)
-				.map(enrichments::get)
-				.filter(Objects::nonNull)
-				.collect(Collectors.toList());
+			.stream()
+			.map(MergeUtils::extractKeyFromPid)
+			.map(enrichments::get)
+			.filter(Objects::nonNull)
+			.collect(Collectors.toList());
 	}
 
 	/**
@@ -1049,8 +1050,8 @@ public class MergeUtils {
 	 */
 	private static boolean isAnEnrichment(OafEntity e) {
 		return e.getDataInfo() != null &&
-				e.getDataInfo().getProvenanceaction() != null
-				&& ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid());
+			e.getDataInfo().getProvenanceaction() != null
+			&& ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid());
 	}
 
 	/**
@@ -1073,17 +1074,17 @@ public class MergeUtils {
 		merge.setHostedby(firstNonNull(merge.getHostedby(), enrichment.getHostedby()));
 		merge.setUrl(unionDistinctLists(merge.getUrl(), enrichment.getUrl(), 0));
 		merge
-				.setDistributionlocation(
-						firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation()));
+			.setDistributionlocation(
+				firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation()));
 		merge.setCollectedfrom(firstNonNull(merge.getCollectedfrom(), enrichment.getCollectedfrom()));
 		// pid and alternateId are used for matching
 		merge.setDateofacceptance(firstNonNull(merge.getDateofacceptance(), enrichment.getDateofacceptance()));
 		merge
-				.setProcessingchargeamount(
-						firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount()));
+			.setProcessingchargeamount(
+				firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount()));
 		merge
-				.setProcessingchargecurrency(
-						firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency()));
+			.setProcessingchargecurrency(
+				firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency()));
 		merge.setRefereed(firstNonNull(merge.getRefereed(), enrichment.getRefereed()));
 		merge.setMeasures(unionDistinctLists(merge.getMeasures(), enrichment.getMeasures(), 0));
 		merge.setFulltext(firstNonNull(merge.getFulltext(), enrichment.getFulltext()));
@@ -1091,14 +1092,14 @@ public class MergeUtils {
 
 	private static int compareTrust(Oaf a, Oaf b) {
 		String left = Optional
-				.ofNullable(a.getDataInfo())
-				.map(DataInfo::getTrust)
-				.orElse("0.0");
+			.ofNullable(a.getDataInfo())
+			.map(DataInfo::getTrust)
+			.orElse("0.0");
 
 		String right = Optional
-				.ofNullable(b.getDataInfo())
-				.map(DataInfo::getTrust)
-				.orElse("0.0");
+			.ofNullable(b.getDataInfo())
+			.map(DataInfo::getTrust)
+			.orElse("0.0");
 
 		return left.compareTo(right);
 	}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
index 06924f05af..05f0837402 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@@ -346,15 +346,19 @@ public class ExtractPerson implements Serializable {
 						.structuredProperty(
 							op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
 							ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
-								OafMapperUtils.dataInfo(false,
-										null,
-										false,
-										false,
-										OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
-												ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
-												ModelConstants.DNET_PID_TYPES,
-												ModelConstants.DNET_PID_TYPES),
-								"0.91")));
+							OafMapperUtils
+								.dataInfo(
+									false,
+									null,
+									false,
+									false,
+									OafMapperUtils
+										.qualifier(
+											ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+											ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+											ModelConstants.DNET_PID_TYPES,
+											ModelConstants.DNET_PID_TYPES),
+									"0.91")));
 			person.setDateofcollection(op.getLastModifiedDate());
 			person.setOriginalId(Arrays.asList(op.getOrcid()));
 			person.setDataInfo(ORCIDDATAINFO);
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
index 779c43712a..1b1ff8db43 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.gtr2;
 
 import java.nio.charset.StandardCharsets;
 import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -18,7 +19,6 @@ import org.dom4j.Document;
 import org.dom4j.DocumentException;
 import org.dom4j.DocumentHelper;
 import org.dom4j.Element;
-import java.time.format.DateTimeFormatter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -188,11 +188,11 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
 
 	private Document loadURL(final String cleanUrl, final int attempt) {
 		try {
-            log.debug("  * Downloading Url: {}", cleanUrl);
+			log.debug("  * Downloading Url: {}", cleanUrl);
 			final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes(StandardCharsets.UTF_8);
 			return DocumentHelper.parseText(new String(bytes));
 		} catch (final Throwable e) {
-            log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e);
+			log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e);
 			if (attempt >= MAX_ATTEMPTS) {
 				throw new RuntimeException("Error downloading url: " + cleanUrl, e);
 			}

From ac0a94d62d7c34fc3953d47a9a2263ac5cfadb82 Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo" <sandro.labruzzo@gmail.com>
Date: Wed, 13 Nov 2024 16:26:59 +0100
Subject: [PATCH 7/8] updated pubmed parser to also add ORCID id and
 affiliation string to authors

---
 .../dhp/sx/bio/pubmed/PMAffiliation.java      |  39 +++
 .../dnetlib/dhp/sx/bio/pubmed/PMAuthor.java   |  39 +++
 .../dhp/sx/bio/pubmed/PMIdentifier.java       |  53 +++++
 .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala |  30 ++-
 .../dhp/sx/bio/pubmed/PubMedToOaf.scala       |   6 +
 .../dhp/sx/graph/bio/single_pubmed.xml        | 223 +++++++-----------
 .../dnetlib/dhp/sx/bio/BioScholixTest.scala   |  64 ++++-
 7 files changed, 300 insertions(+), 154 deletions(-)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java
new file mode 100644
index 0000000000..54aba87151
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java
@@ -0,0 +1,80 @@
+package eu.dnetlib.dhp.sx.bio.pubmed;
+
+import java.io.Serializable;
+
+/**
+ * The affiliation declared by a PubMed author: the affiliation string and, optionally,
+ * an identifier of the affiliated organization.
+ * <p>
+ * The class is {@link Serializable} because instances are held by {@link PMAuthor}, which is
+ * itself serializable; a non-serializable field would make Java serialization of an author fail.
+ *
+ * @author Sandro La Bruzzo
+ */
+public class PMAffiliation implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** The affiliation string. */
+    private String name;
+
+    /** The identifier of the affiliation, or {@code null} when none is known. */
+    private PMIdentifier identifier;
+
+    /**
+     * Creates an affiliation with neither a name nor an identifier.
+     */
+    public PMAffiliation() {
+    }
+
+    /**
+     * Creates an affiliation.
+     *
+     * @param name the affiliation string
+     * @param identifier the identifier of the affiliation, may be {@code null}
+     */
+    public PMAffiliation(String name, PMIdentifier identifier) {
+        this.name = name;
+        this.identifier = identifier;
+    }
+
+    /**
+     * Gets the affiliation string.
+     *
+     * @return the name
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * Sets the affiliation string.
+     *
+     * @param name the name
+     * @return this instance, to allow call chaining
+     */
+    public PMAffiliation setName(String name) {
+        this.name = name;
+        return this;
+    }
+
+    /**
+     * Gets the identifier.
+     *
+     * @return the identifier, or {@code null} if none has been set
+     */
+    public PMIdentifier getIdentifier() {
+        return identifier;
+    }
+
+    /**
+     * Sets the identifier.
+     *
+     * @param identifier the identifier
+     * @return this instance, to allow call chaining
+     */
+    public PMAffiliation setIdentifier(PMIdentifier identifier) {
+        this.identifier = identifier;
+        return this;
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java
index 68ef6459e0..b0df256634 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java
@@ -12,6 +12,8 @@ public class PMAuthor implements Serializable {
 
 	private String lastName;
 	private String foreName;
+	private PMIdentifier identifier;
+	private PMAffiliation affiliation;
 
 	/**
 	 * Gets last name.
@@ -59,4 +61,41 @@ public class PMAuthor implements Serializable {
 			.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");
 	}
 
+	/**
+	 * Gets the persistent identifier of the author.
+	 *
+	 * @return the identifier, or {@code null} if none has been set
+	 */
+	public PMIdentifier getIdentifier() {
+		return identifier;
+	}
+
+	/**
+	 * Sets the persistent identifier of the author.
+	 *
+	 * @param identifier the identifier, may be {@code null}
+	 */
+	public void setIdentifier(PMIdentifier identifier) {
+		this.identifier = identifier;
+	}
+
+	/**
+	 * Gets the affiliation of the author.
+	 *
+	 * @return the affiliation, or {@code null} if none has been set
+	 */
+	public PMAffiliation getAffiliation() {
+		return affiliation;
+	}
+
+	/**
+	 * Sets the affiliation of the author.
+	 *
+	 * @param affiliation the affiliation, may be {@code null}
+	 */
+	public void setAffiliation(PMAffiliation affiliation) {
+		this.affiliation = affiliation;
+	}
+
+
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java
new file mode 100644
index 0000000000..0c8c55e40e
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java
@@ -0,0 +1,115 @@
+package eu.dnetlib.dhp.sx.bio.pubmed;
+
+import java.io.Serializable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A persistent identifier together with the scheme it belongs to (e.g. {@code ORCID},
+ * {@code ROR}).
+ * <p>
+ * ORCID iDs given in compact form ({@code 0000000163025705}) or as an http(s) URL are
+ * normalized to the hyphenated form ({@code 0000-0001-6302-5705}); any other value is kept as
+ * is, apart from surrounding whitespace which is removed.
+ * <p>
+ * The class is {@link Serializable} because instances are held by {@link PMAuthor}, which is
+ * itself serializable.
+ */
+public class PMIdentifier implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** An ORCID iD without hyphens; the last character may be the {@code X} checksum. */
+    private static final Pattern COMPACT_ORCID = Pattern.compile("[0-9]{15}[0-9X]");
+
+    /** An ORCID iD expressed as an http(s) URL; group 1 captures the hyphenated iD. */
+    private static final Pattern ORCID_URL = Pattern
+        .compile("https?://(?:www\\.)?orcid\\.org/([0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X])/?");
+
+    private String pid;
+    private String type;
+
+    /**
+     * Creates an identifier with neither a value nor a type.
+     */
+    public PMIdentifier() {
+    }
+
+    /**
+     * Creates an identifier, normalizing the given value.
+     *
+     * @param pid the identifier value, may be {@code null}
+     * @param type the identifier scheme
+     */
+    public PMIdentifier(String pid, String type) {
+        this.pid = cleanPid(pid);
+        this.type = type;
+    }
+
+    /**
+     * Normalizes an identifier value.
+     *
+     * @param pid the raw value, may be {@code null}
+     * @return {@code null} for a {@code null} input; the hyphenated ORCID iD when the trimmed
+     *         value is a compact or URL-form ORCID iD; the trimmed value otherwise
+     */
+    private static String cleanPid(String pid) {
+        if (pid == null) {
+            return null;
+        }
+        final String value = pid.trim();
+
+        // 0000000163025705 -> 0000-0001-6302-5705
+        if (COMPACT_ORCID.matcher(value).matches()) {
+            return String
+                .join("-", value.substring(0, 4), value.substring(4, 8), value.substring(8, 12), value.substring(12));
+        }
+
+        // http://orcid.org/0000-0001-8567-3543 or https://orcid.org/0000-0001-8494-732X -> bare iD
+        final Matcher url = ORCID_URL.matcher(value);
+        if (url.matches()) {
+            return url.group(1);
+        }
+        return value;
+    }
+
+    /**
+     * Gets the normalized identifier value.
+     *
+     * @return the pid
+     */
+    public String getPid() {
+        return pid;
+    }
+
+    /**
+     * Sets the identifier value, applying the same normalization as the constructor.
+     *
+     * @param pid the raw value
+     * @return this instance, to allow call chaining
+     */
+    public PMIdentifier setPid(String pid) {
+        this.pid = cleanPid(pid);
+        return this;
+    }
+
+    /**
+     * Gets the identifier scheme.
+     *
+     * @return the type
+     */
+    public String getType() {
+        return type;
+    }
+
+    /**
+     * Sets the identifier scheme.
+     *
+     * @param type the type
+     * @return this instance, to allow call chaining
+     */
+    public PMIdentifier setType(String type) {
+        this.type = type;
+        return this;
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
index c9e8681852..2eb4bea65c 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala
@@ -81,6 +81,26 @@ class PMParser2 {
         val a = new PMAuthor
         a.setLastName((author \ "LastName").text)
         a.setForeName((author \ "ForeName").text)
+        // NodeSeq.text concatenates the text of all matching nodes, so an author with several
+        // <Identifier> or <AffiliationInfo> children would get glued-together values: only the
+        // first occurrence of each is used. Identifier and affiliation are both optional, and an
+        // identifier is kept only when both its value and its Source attribute are non-empty.
+        for (idNode <- (author \ "Identifier").headOption) {
+          val id = idNode.text
+          val idType = (idNode \ "@Source").text
+          if (id.nonEmpty && idType.nonEmpty)
+            a.setIdentifier(new PMIdentifier(id, idType))
+        }
+        for (info <- (author \ "AffiliationInfo").headOption) {
+          val affiliation = (info \ "Affiliation").text
+          if (affiliation.nonEmpty) {
+            val aff = new PMAffiliation()
+            aff.setName(affiliation)
+            for (n <- (info \ "Identifier").headOption; src = (n \ "@Source").text)
+              if (n.text.nonEmpty && src.nonEmpty) aff.setIdentifier(new PMIdentifier(n.text, src))
+            a.setAffiliation(aff)
+          }
+        }
         a
       })
       .toList
@@ -99,15 +119,7 @@ class PMParser2 {
     val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author"
 
     article.setAuthors(
-      authors
-        .map(author => {
-          val a = new PMAuthor
-          a.setLastName((author \ "LastName").text)
-          a.setForeName((author \ "ForeName").text)
-          a
-        })
-        .toList
-        .asJava
+      extractAuthors(authors).asJava
     )
 
     val pmId = xml \ "MedlineCitation" \ "PMID"
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala
index d59d73bd05..5e14c731a6 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala
@@ -294,6 +294,12 @@ object PubMedToOaf {
       author.setName(a.getForeName)
       author.setSurname(a.getLastName)
       author.setFullname(a.getFullName)
+      if(a.getIdentifier != null) { // the author pid is optional. NOTE(review): the identifier type is passed as is (twice) to the qualifier; confirm values such as "ORCID" match the ids used in ModelConstants.DNET_PID_TYPES
+        author.setPid(List(OafMapperUtils.structuredProperty(a.getIdentifier.getPid,
+          OafMapperUtils.qualifier(a.getIdentifier.getType,a.getIdentifier.getType,ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES), dataInfo)).asJava)
+      }
+      if (a.getAffiliation!= null) // the affiliation is optional; only its name is propagated, as a single raw affiliation string
+        author.setRawAffiliationString(List(a.getAffiliation.getName).asJava)
       author.setRank(index + 1)
       author
     }(collection.breakOut)
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
index 4b4d860d7e..c2e503f57e 100644
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml
@@ -1,77 +1,56 @@
 <PubmedArticle>
-    <MedlineCitation Status="MEDLINE" IndexingMethod="Automated" Owner="NLM">
-        <PMID Version="1">37885214</PMID>
+    <MedlineCitation Status="MEDLINE" IndexingMethod="Curated" Owner="NLM">
+        <PMID Version="1">37318999</PMID>
         <DateCompleted>
             <Year>2024</Year>
             <Month>02</Month>
-            <Day>14</Day>
+            <Day>09</Day>
         </DateCompleted>
         <DateRevised>
             <Year>2024</Year>
             <Month>02</Month>
-            <Day>14</Day>
+            <Day>09</Day>
         </DateRevised>
         <Article PubModel="Print-Electronic">
             <Journal>
-                <ISSN IssnType="Electronic">2752-7549</ISSN>
+                <ISSN IssnType="Electronic">1522-1229</ISSN>
                 <JournalIssue CitedMedium="Internet">
-                    <Volume>40</Volume>
-                    <Issue>5</Issue>
+                    <Volume>47</Volume>
+                    <Issue>3</Issue>
                     <PubDate>
-                        <MedlineDate>2023 Sep-Oct</MedlineDate>
+                        <Year>2023</Year>
+                        <Month>Sep</Month>
+                        <Day>01</Day>
                     </PubDate>
                 </JournalIssue>
-                <Title>Journal of pediatric hematology/oncology nursing</Title>
-                <ISOAbbreviation>J Pediatr Hematol Oncol Nurs</ISOAbbreviation>
+                <Title>Advances in physiology education</Title>
+                <ISOAbbreviation>Adv Physiol Educ</ISOAbbreviation>
             </Journal>
-            <ArticleTitle>Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country.</ArticleTitle>
+            <ArticleTitle>Providing the choice of in-person or videoconference attendance in a clinical physiology course may harm learning outcomes for the entire cohort.</ArticleTitle>
             <Pagination>
-                <MedlinePgn>295-304</MedlinePgn>
+                <MedlinePgn>548-556</MedlinePgn>
             </Pagination>
-            <ELocationID EIdType="doi" ValidYN="Y">10.1177/27527530231193972</ELocationID>
+            <ELocationID EIdType="doi" ValidYN="Y">10.1152/advan.00160.2022</ELocationID>
             <Abstract>
-                <AbstractText><b>Background:</b> Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. <b>Method:</b> Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. <b>Results:</b> Filipino parents (<i>N</i> = 156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. <b>Conclusions:</b> Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience.</AbstractText>
+                <AbstractText>Clinical Physiology 1 and 2 are flipped classes in which students watch prerecorded videos before class. During the 3-h class, students take practice assessments, work in groups on critical thinking exercises, work through case studies, and engage in drawing exercises. Due to the COVID pandemic, these courses were transitioned from in-person classes to online classes. Despite the university's return-to-class policy, some students were reluctant to return to in-person classes; therefore during the 2021-2022 academic year, Clinical Physiology 1 and 2 were offered as flipped, hybrid courses. In a hybrid format, students either attended the synchronous class in person or online. Here we evaluate the learning outcomes and the perceptions of the learning experience for students who attended Clinical Physiology 1 and 2 either online (2020-2021) or in a hybrid format (2021-2022). In addition to exam scores, in-class surveys and end of course evaluations were compiled to describe the student experience in the flipped hybrid setting. Retrospective linear mixed-model regression analysis of exam scores revealed that a hybrid modality (2021-2022) was associated with lower exam scores when controlling for sex, graduate/undergraduate status, delivery method, and the order in which the courses were taken (<i>F</i> test: <i>F</i> = 8.65, df1 = 2, df2 = 179.28, <i>P</i> = 0.0003). In addition, being a Black Indigenous Person of Color (BIPOC) student is associated with a lower exam score, controlling for the same previous factors (<i>F</i> test: <i>F</i> = 4.23, df1 = 1, df2 = 130.28, <i>P</i> = 0.04), albeit with lower confidence; the BIPOC representation in this sample is small (BIPOC: <i>n</i> = 144; total: <i>n</i> = 504). There is no significant interaction between the hybrid modality and race, meaning that BIPOC and White students are both negatively affected in a hybrid flipped course. 
Instructors should consider carefully about offering hybrid courses and build in extra student support.<b>NEW &amp; NOTEWORTHY</b> The transition from online to in-person teaching has been as challenging as the original transition to remote teaching with the onset of the pandemic. Since not all students were ready to return to the classroom, students could choose to take this course in person or online. This arrangement provided flexibility and opportunities for innovative class activities for students but introduced tradeoffs in lower test scores from the hybrid modality than fully online or fully in-person modalities.</AbstractText>
             </Abstract>
             <AuthorList CompleteYN="Y">
                 <Author ValidYN="Y">
-                    <LastName>Banayat</LastName>
-                    <ForeName>Aprille Campos</ForeName>
-                    <Initials>AC</Initials>
-                    <Identifier Source="ORCID">0000-0001-9339-9871</Identifier>
+                    <LastName>Anderson</LastName>
+                    <ForeName>Lisa Carney</ForeName>
+                    <Initials>LC</Initials>
+                    <Identifier Source="ORCID">0000-0003-2261-1921</Identifier>
                     <AffiliationInfo>
-                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                        <Affiliation>Department of Integrative Biology and Physiology, University of Minnesota, Minneapolis, Minnesota, United States.</Affiliation>
+                        <Identifier Source="ROR">https://ror.org/017zqws13</Identifier>
                     </AffiliationInfo>
                 </Author>
                 <Author ValidYN="Y">
-                    <LastName>Abad</LastName>
-                    <ForeName>Peter James B</ForeName>
-                    <Initials>PJB</Initials>
+                    <LastName>Jacobson</LastName>
+                    <ForeName>Tate</ForeName>
+                    <Initials>T</Initials>
                     <AffiliationInfo>
-                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
-                    </AffiliationInfo>
-                </Author>
-                <Author ValidYN="Y">
-                    <LastName>Bonito</LastName>
-                    <ForeName>Sheila R</ForeName>
-                    <Initials>SR</Initials>
-                    <AffiliationInfo>
-                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
-                    </AffiliationInfo>
-                </Author>
-                <Author ValidYN="Y">
-                    <LastName>Manahan</LastName>
-                    <ForeName>Lydia T</ForeName>
-                    <Initials>LT</Initials>
-                    <AffiliationInfo>
-                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
-                    </AffiliationInfo>
-                </Author>
-                <Author ValidYN="Y">
-                    <LastName>Peralta</LastName>
-                    <ForeName>Arnold B</ForeName>
-                    <Initials>AB</Initials>
-                    <AffiliationInfo>
-                        <Affiliation>College of Nursing, University of the Philippines Manila, Manila, Philippines.</Affiliation>
+                        <Affiliation>Department of Statistics, University of Minnesota, Minneapolis, Minnesota, United States.</Affiliation>
                     </AffiliationInfo>
                 </Author>
             </AuthorList>
@@ -81,142 +60,98 @@
             </PublicationTypeList>
             <ArticleDate DateType="Electronic">
                 <Year>2023</Year>
-                <Month>10</Month>
-                <Day>26</Day>
+                <Month>06</Month>
+                <Day>15</Day>
             </ArticleDate>
         </Article>
         <MedlineJournalInfo>
             <Country>United States</Country>
-            <MedlineTA>J Pediatr Hematol Oncol Nurs</MedlineTA>
-            <NlmUniqueID>9918282681506676</NlmUniqueID>
-            <ISSNLinking>2752-7530</ISSNLinking>
+            <MedlineTA>Adv Physiol Educ</MedlineTA>
+            <NlmUniqueID>100913944</NlmUniqueID>
+            <ISSNLinking>1043-4046</ISSNLinking>
         </MedlineJournalInfo>
         <CitationSubset>IM</CitationSubset>
         <MeshHeadingList>
             <MeshHeading>
-                <DescriptorName UI="D002648" MajorTopicYN="N">Child</DescriptorName>
+                <DescriptorName UI="D010827" MajorTopicYN="Y">Physiology</DescriptorName>
+                <QualifierName UI="Q000193" MajorTopicYN="N">education</QualifierName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D012189" MajorTopicYN="N">Retrospective Studies</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D007858" MajorTopicYN="N">Learning</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D058873" MajorTopicYN="N">Pandemics</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D000086382" MajorTopicYN="N">COVID-19</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D012044" MajorTopicYN="N">Regression Analysis</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D013334" MajorTopicYN="N">Students</DescriptorName>
             </MeshHeading>
             <MeshHeading>
                 <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
             </MeshHeading>
             <MeshHeading>
-                <DescriptorName UI="D010290" MajorTopicYN="Y">Parents</DescriptorName>
-                <QualifierName UI="Q000523" MajorTopicYN="N">psychology</QualifierName>
+                <DescriptorName UI="D008297" MajorTopicYN="N">Male</DescriptorName>
             </MeshHeading>
             <MeshHeading>
-                <DescriptorName UI="D012944" MajorTopicYN="N">Social Support</DescriptorName>
+                <DescriptorName UI="D005260" MajorTopicYN="N">Female</DescriptorName>
             </MeshHeading>
             <MeshHeading>
-                <DescriptorName UI="D029181" MajorTopicYN="N">Spirituality</DescriptorName>
+                <DescriptorName UI="D044465" MajorTopicYN="N">White People</DescriptorName>
             </MeshHeading>
             <MeshHeading>
-                <DescriptorName UI="D012067" MajorTopicYN="N">Religion</DescriptorName>
+                <DescriptorName UI="D044383" MajorTopicYN="N">Black People</DescriptorName>
             </MeshHeading>
             <MeshHeading>
-                <DescriptorName UI="D009369" MajorTopicYN="Y">Neoplasms</DescriptorName>
-                <QualifierName UI="Q000628" MajorTopicYN="N">therapy</QualifierName>
+                <DescriptorName UI="D020375" MajorTopicYN="N">Education, Distance</DescriptorName>
+            </MeshHeading>
+            <MeshHeading>
+                <DescriptorName UI="D003479" MajorTopicYN="N">Curriculum</DescriptorName>
             </MeshHeading>
         </MeshHeadingList>
         <KeywordList Owner="NOTNLM">
-            <Keyword MajorTopicYN="N">cancer</Keyword>
-            <Keyword MajorTopicYN="N">mixed methods</Keyword>
-            <Keyword MajorTopicYN="N">parent</Keyword>
-            <Keyword MajorTopicYN="N">pediatric</Keyword>
-            <Keyword MajorTopicYN="N">research</Keyword>
-            <Keyword MajorTopicYN="N">supportive care</Keyword>
+            <Keyword MajorTopicYN="N">flipped teaching</Keyword>
+            <Keyword MajorTopicYN="N">hybrid teaching</Keyword>
+            <Keyword MajorTopicYN="N">inequity</Keyword>
+            <Keyword MajorTopicYN="N">learning outcomes</Keyword>
+            <Keyword MajorTopicYN="N">responsive teaching</Keyword>
         </KeywordList>
-        <CoiStatement>Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article.</CoiStatement>
     </MedlineCitation>
     <PubmedData>
         <History>
             <PubMedPubDate PubStatus="medline">
-                <Year>2024</Year>
-                <Month>2</Month>
-                <Day>12</Day>
-                <Hour>18</Hour>
-                <Minute>42</Minute>
+                <Year>2023</Year>
+                <Month>7</Month>
+                <Day>21</Day>
+                <Hour>6</Hour>
+                <Minute>44</Minute>
             </PubMedPubDate>
             <PubMedPubDate PubStatus="pubmed">
                 <Year>2023</Year>
-                <Month>10</Month>
-                <Day>27</Day>
-                <Hour>6</Hour>
-                <Minute>42</Minute>
+                <Month>6</Month>
+                <Day>15</Day>
+                <Hour>19</Hour>
+                <Minute>14</Minute>
             </PubMedPubDate>
             <PubMedPubDate PubStatus="entrez">
                 <Year>2023</Year>
-                <Month>10</Month>
-                <Day>27</Day>
-                <Hour>3</Hour>
-                <Minute>43</Minute>
+                <Month>6</Month>
+                <Day>15</Day>
+                <Hour>12</Hour>
+                <Minute>53</Minute>
             </PubMedPubDate>
         </History>
         <PublicationStatus>ppublish</PublicationStatus>
         <ArticleIdList>
-            <ArticleId IdType="pubmed">37885214</ArticleId>
-            <ArticleId IdType="doi">10.1177/27527530231193972</ArticleId>
+            <ArticleId IdType="pubmed">37318999</ArticleId>
+            <ArticleId IdType="doi">10.1152/advan.00160.2022</ArticleId>
         </ArticleIdList>
     </PubmedData>
 </PubmedArticle>
-<DeleteCitation>
-<PMID Version="1">30522158</PMID>
-<PMID Version="1">32769323</PMID>
-<PMID Version="1">34061701</PMID>
-<PMID Version="1">34661197</PMID>
-<PMID Version="1">34837091</PMID>
-<PMID Version="1">35035475</PMID>
-<PMID Version="1">35211699</PMID>
-<PMID Version="1">35557982</PMID>
-<PMID Version="1">35782783</PMID>
-<PMID Version="1">35795240</PMID>
-<PMID Version="1">35832688</PMID>
-<PMID Version="1">35847411</PMID>
-<PMID Version="1">36081602</PMID>
-<PMID Version="1">36081858</PMID>
-<PMID Version="1">36468085</PMID>
-<PMID Version="1">36468934</PMID>
-<PMID Version="1">36580086</PMID>
-<PMID Version="1">36589526</PMID>
-<PMID Version="1">36619609</PMID>
-<PMID Version="1">36649460</PMID>
-<PMID Version="1">36654909</PMID>
-<PMID Version="1">36655054</PMID>
-<PMID Version="1">36700856</PMID>
-<PMID Version="1">36705625</PMID>
-<PMID Version="1">36713939</PMID>
-<PMID Version="1">36714172</PMID>
-<PMID Version="1">36741203</PMID>
-<PMID Version="1">36741905</PMID>
-<PMID Version="1">36743825</PMID>
-<PMID Version="1">36788221</PMID>
-<PMID Version="1">36844926</PMID>
-<PMID Version="1">36846546</PMID>
-<PMID Version="1">36935776</PMID>
-<PMID Version="1">36946757</PMID>
-<PMID Version="1">36972191</PMID>
-<PMID Version="1">37034422</PMID>
-<PMID Version="1">37124311</PMID>
-<PMID Version="1">37152108</PMID>
-<PMID Version="1">37171968</PMID>
-<PMID Version="1">37273889</PMID>
-<PMID Version="1">37333905</PMID>
-<PMID Version="1">37387733</PMID>
-<PMID Version="1">37431449</PMID>
-<PMID Version="1">37576947</PMID>
-<PMID Version="1">37601162</PMID>
-<PMID Version="1">37711214</PMID>
-<PMID Version="1">37901290</PMID>
-<PMID Version="1">37981909</PMID>
-<PMID Version="1">37981945</PMID>
-<PMID Version="1">37982005</PMID>
-<PMID Version="1">38037601</PMID>
-<PMID Version="1">38037602</PMID>
-<PMID Version="1">38150730</PMID>
-<PMID Version="1">38274640</PMID>
-<PMID Version="1">38332671</PMID>
-<PMID Version="1">38334184</PMID>
-<PMID Version="1">38335456</PMID>
-<PMID Version="1">38349506</PMID>
-<PMID Version="1">38349576</PMID>
-<PMID Version="1">38353676</PMID>
-</DeleteCitation>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index c942ca1329..4a926df015 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -19,9 +19,11 @@ import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.LoggerFactory
 
 import java.io.{BufferedReader, InputStream, InputStreamReader}
+import java.util.regex.Pattern
 import java.util.zip.GZIPInputStream
 import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
 
@@ -51,6 +53,64 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
+  // Verifies that PMIdentifier.getPid yields the dashed ORCID form for every raw spelling listed below.
+  @Test
+  def testPid(): Unit = {
+    val pids = List(
+      "0000000163025705",
+      "000000018494732X",
+      "0000000308873343",
+      "0000000335964515",
+      "0000000333457333",
+      "0000000335964515",
+      "0000000302921949",
+      // URL-prefixed and already-dashed values, mixed with further bare 16-character ones
+      "http://orcid.org/0000-0001-8567-3543",
+      "http://orcid.org/0000-0001-7868-8528",
+      "0000-0001-9189-1440",
+      "0000-0003-3727-9247",
+      "0000-0001-7246-1058",
+      "000000033962389X",
+      "0000000330371470",
+      "0000000171236123",
+      "0000000272569752",
+      "0000000293231371",
+      "http://orcid.org/0000-0003-3345-7333",
+      "0000000340145688",
+      "http://orcid.org/0000-0003-4894-1689"
+    )
+    val orcidFormat = Pattern.compile("[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]") // compiled once, reused per pid
+    pids.foreach(pid => {
+      val pidCleaned = new PMIdentifier(pid, "ORCID").getPid
+      // Name the offending input on failure instead of printing every cleaned value to stdout.
+      assertTrue(orcidFormat.matcher(pidCleaned).matches(), s"'$pid' was cleaned to '$pidCleaned'")
+    })
+  }
+
+  // Returns the text of every <Affiliation>...</Affiliation> element found in s, in order of appearance.
+  def extractAffiliation(s: String): List[String] = {
+    // Reluctant (.*?) stops at the nearest closing tag, so several elements on one line stay separate.
+    val matcher = Pattern.compile("<Affiliation>(.*?)</Affiliation>").matcher(s)
+    val l: mutable.ListBuffer[String] = mutable.ListBuffer()
+    while (matcher.find()) {
+      l += matcher.group(1)
+    }
+    l.toList
+  }
+
+  case class AuthorPID(pidType: String, pid: String) // Source attribute and text of one <Identifier> element
+
+  // Returns one AuthorPID per <Identifier Source="...">...</Identifier> element found in s, in order.
+  def extractAuthorIdentifier(s: String): List[AuthorPID] = {
+    // [^"]* and the reluctant .*? keep each match inside a single element, even with several per line.
+    val matcher = Pattern.compile("<Identifier Source=\"([^\"]*)\">(.*?)</Identifier>").matcher(s)
+    val l: mutable.ListBuffer[AuthorPID] = mutable.ListBuffer()
+    while (matcher.find()) {
+      l += AuthorPID(pidType = matcher.group(1), pid = matcher.group(2))
+    }
+    l.toList
+  }
+
   @Test
   def testParsingPubmed2(): Unit = {
     val mapper = new ObjectMapper()
@@ -58,7 +118,9 @@ class BioScholixTest extends AbstractVocabularyTest {
     val parser = new PMParser2()
     val article = parser.parse(xml)
 
-    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article))
+//    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article))
+
+    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(PubMedToOaf.convert(article, vocabularies)))
 
   }
 

From 5d344323983e5aa6cff4cd02557bf40d0dc6c5dd Mon Sep 17 00:00:00 2001
From: Claudio Atzori <claudio.atzori@isti.cnr.it>
Date: Tue, 19 Nov 2024 15:12:04 +0100
Subject: [PATCH 8/8] align MergeUtils with beta branch

---
 .../dhp/schema/oaf/utils/MergeUtils.java      | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
index cd85065830..c092f60355 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@@ -74,29 +74,29 @@ public class MergeUtils {
 			if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
 				return (T) mergedResult;
 			} else {
-				final Qualifier expectedResultType = vocs
-					.lookupTermBySynonym(
-						ModelConstants.DNET_RESULT_TYPOLOGIES,
-						i.getInstancetype().getClassid());
-
-				if (Objects.isNull(expectedResultType)) {
-					throw new IllegalArgumentException(
-						"instance type not bound to any result type in dnet:result_typologies: " +
-							i.getInstancetype().getClassid());
-				}
+				final String expectedResultType = Optional
+					.ofNullable(
+						vocs
+							.lookupTermBySynonym(
+								ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()))
+					.orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE)
+					.getClassid();
 
 				// there is a clash among the result types
-				if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) {
-					try {
-						String resulttype = expectedResultType.getClassid();
-						if (EntityType.otherresearchproduct.toString().equals(resulttype)) {
-							resulttype = "other";
-						}
-						Result result = (Result) ModelSupport.oafTypes.get(resulttype).newInstance();
-						return (T) mergeResultFields(result, mergedResult);
-					} catch (InstantiationException | IllegalAccessException e) {
-						throw new IllegalStateException(e);
-					}
+				if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) {
+
+					Result result = (Result) Optional
+						.ofNullable(ModelSupport.oafTypes.get(expectedResultType))
+						.map(r -> {
+							try {
+								return r.newInstance();
+							} catch (InstantiationException | IllegalAccessException e) {
+								throw new IllegalStateException(e);
+							}
+						})
+						.orElse(new OtherResearchProduct());
+					result.setId(mergedResult.getId());
+					return (T) mergeResultFields(result, mergedResult);
 				} else {
 					return (T) mergedResult;
 				}