From 326c9dc08ceac7613c187f3d3c3609731823f8bc Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 2 Aug 2023 18:05:53 +0200 Subject: [PATCH 01/34] Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12 --- dhp-common/pom.xml | 19 ++- dhp-pace-core/pom.xml | 12 -- dhp-workflows/dhp-dedup-openaire/pom.xml | 24 --- dhp-workflows/dhp-graph-provision/pom.xml | 14 +- .../oa/provision/utils/TemplateFactory.java | 3 - pom.xml | 145 ++++++++++++------ 6 files changed, 116 insertions(+), 101 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 6198bd81ee..d64e7e7a09 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -62,16 +62,17 @@ + + edu.cmu + secondstring + + eu.dnetlib.dhp dhp-pace-core ${project.version} - - org.apache.hadoop - hadoop-common - com.github.sisyphsu dateparser @@ -118,10 +119,6 @@ net.sf.saxon Saxon-HE - - org.slf4j - jcl-over-slf4j - org.apache.cxf cxf-rt-transports-http @@ -129,6 +126,12 @@ eu.dnetlib cnr-rmi-api + + + log4j + log4j + + diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index fd7f44fc94..a6d2538f29 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -53,14 +53,6 @@ edu.cmu secondstring - - com.google.guava - guava - - - com.google.code.gson - gson - org.apache.commons commons-lang3 @@ -85,10 +77,6 @@ com.fasterxml.jackson.core jackson-databind - - org.apache.commons - commons-math3 - com.jayway.jsonpath json-path diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index a271efe8e4..2d40f44dae 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -54,24 +54,10 @@ dhp-pace-core ${project.version} - org.apache.commons commons-lang3 - - - org.scala-lang.modules - scala-java8-compat_${scala.binary.version} - 1.0.2 - - - - org.scala-lang.modules - scala-collection-compat_${scala.binary.version} - 2.11.0 - - org.apache.spark spark-core_${scala.binary.version} @@ -80,16 +66,10 @@ org.apache.spark spark-sql_${scala.binary.version} - org.apache.spark spark-graphx_${scala.binary.version} - - - com.arakelian - java-jq - dom4j dom4j @@ -102,10 +82,6 @@ com.fasterxml.jackson.core jackson-databind - - com.fasterxml.jackson.core - jackson-core - org.apache.httpcomponents httpclient diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index e62fcdf198..47b0566146 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -59,12 +59,6 @@ com.jayway.jsonpath json-path - - - org.slf4j - slf4j-api - - dom4j @@ -160,6 +154,14 @@ org.apache.zookeeper zookeeper + + ant + org.apache.ant + + + antlr4-runtime + org.antlr + diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 87c0261ac0..7046b4cf0a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Optional; import 
java.util.stream.Collectors; -import javax.swing.text.html.Option; - import org.apache.commons.lang3.StringUtils; import org.stringtemplate.v4.ST; diff --git a/pom.xml b/pom.xml index 3fd351c1db..fa4f16df39 100644 --- a/pom.xml +++ b/pom.xml @@ -204,10 +204,17 @@ test + + org.slf4j + slf4j-api + ${org.slf4j.version} + provided + + org.slf4j jcl-over-slf4j - 1.7.25 + ${org.slf4j.version} provided @@ -217,22 +224,29 @@ ${dhp.commons.lang.version} + + org.apache.commons + commons-beanutils + ${commons-beanutils.version} + + + commons-validator commons-validator - 1.7 + ${commons-validator.version} com.github.sisyphsu dateparser - 1.0.7 + ${dateparser.version} me.xuender unidecode - 0.0.7 + ${unidecode.version} @@ -245,13 +259,13 @@ commons-codec commons-codec - 1.9 + ${commons-codec.version} commons-io commons-io - 2.4 + ${commons-io.version} @@ -415,6 +429,7 @@ cxf-rt-transports-http 3.1.5 + javax.persistence javax.persistence-api @@ -504,16 +519,11 @@ commons-compress ${common.compress.version} - - org.apache.commons commons-csv ${common.csv.version} - - - org.apache.poi poi-ooxml @@ -568,14 +578,12 @@ provided - org.apache.commons commons-math3 3.6.1 - com.google.code.gson gson @@ -596,7 +604,7 @@ org.reflections reflections - 0.9.10 + ${reflections.version} @@ -610,6 +618,12 @@ icu4j 70.1 + + + org.javassist + javassist + ${javassist.version} + @@ -866,46 +880,62 @@ sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop UTF-8 UTF-8 - 3.6.0 1.8 1.8 - 2.22.2 - 2.0.1 - cdh5.9.2 - 2.6.0-${dhp.cdh.version} - 4.1.0-${dhp.cdh.version} - dhp-schemas - 3.6.0 - 2.4.0.cloudera2 - 2.9.6 - 3.5 - true - 11.0.2 + + 2.11.12 2.11 - 1.3.0 - 5.6.1 - 3.3.3 - 3.4.2 - [2.12,3.0) - [4.17.2] + + + 3.6.0 + 2.22.2 + 2.0.1 + 4.0.1 + + + dhp-schemas + + 4.1.2 + [2.6.1] + 1.20 + 1.8 + 1.8 + 1.9.4 + 1.9 + 3.2.1 + 2.4 + 1.1.3 + 1.7 + 1.0.7 + [3.17.1] + cdh5.9.2 + 3.5 + 11.0.2 + 2.6.0-${dhp.cdh.version} + 2.9.6 + 4.1.0-${dhp.cdh.version} + true + 2.4.0.cloudera2 [4.0.3] [6.0.5] [3.1.6] - [2.6.1] - 7.5.0 - 4.7.2 - 1.20 + 2.2.2 + 3.19.0-GA 3.5.3 4.13.0 - 1.8 - 4.1.2 - 1.8 + 5.6.1 + 3.3.3 + 3.4.2 + 4.7.2 4.5.3 - 4.0.1 - 2.2.2 - 1.1.3 - 3.2.1 + 1.7.25 + 0.9.10 + 1.3.0 + 7.5.0 + 3.6.0 + 0.0.7 + [2.12,3.0) @@ -915,21 +945,40 @@ 2.12 2.12.18 - + 1.3.0 + + 4.8.1 + + 1.22 + 1.8 + 1.10.0 + 1.9.4 + 1.15 + 3.2.2 + 2.11.0 + 1.1.3 + 1.7 + + 14.0.1 + 8.11.0 4.0.2 3.4.1 2.14.2 3.12.0 3.7.0-M11 - 4.8.1 - + 3.25.0-GA + 4.10.0 + 2.0.6 + 0.10.2 + + - \ No newline at end of file + From 2fa78f6071206415b08b00c20a97c6ae8441a0fe Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 7 Sep 2023 11:58:59 +0200 Subject: [PATCH 02/34] Changes requires to build and run tests with Java 17 --- .../WritePredefinedProjectPropertiesTest.java | 10 ++- .../java/eu/dnetlib/pace/util/UtilTest.java | 4 +- .../oa/dedup/graph/ConnectedComponent.java | 24 +++++--- .../doiboost/orcid/OrcidClientTest.java | 6 -- dhp-workflows/dhp-graph-provision/pom.xml | 61 ++++++++++++++++++- .../dhp/oa/provision/XmlIndexingJob.java | 10 +-- .../dnetlib/dhp/sparksolr/DHPSolrSupport.java | 12 ++++ .../dnetlib/dhp/sparksolr/DHPSolrSupport.java | 12 ++++ .../dhp-usage-raw-data-update/pom.xml | 12 +++- dhp-workflows/dhp-usage-stats-build/pom.xml | 18 +++++- pom.xml | 38 ++++++++++-- 11 files changed, 168 insertions(+), 39 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java diff 
--git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 84b962b4b8..19e9377afd 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest { mojo.outputFile = testFolder; // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + try { + mojo.execute(); + Assertions.assertTrue(false); // not reached + } catch (Exception e) { + Assertions + .assertTrue( + MojoExecutionException.class.isAssignableFrom(e.getClass()) || + IllegalArgumentException.class.isAssignableFrom(e.getClass())); + } } @Test diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index 6056c342dc..c5c5eaba7f 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -10,7 +10,6 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import eu.dnetlib.pace.model.Person; -import jdk.nashorn.internal.ir.annotations.Ignore; public class UtilTest { @@ -21,8 +20,7 @@ public class UtilTest { params = new HashMap<>(); } - @Test - @Ignore + // @Test public void paceResolverTest() { PaceResolver paceResolver = new PaceResolver(); paceResolver.getComparator("keywordMatch", params); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index 4a39a175d4..4fc0a25e81 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -3,6 +3,9 @@ package eu.dnetlib.dhp.oa.dedup.graph; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -16,14 +19,16 @@ import eu.dnetlib.pace.util.PaceException; public class ConnectedComponent implements Serializable { - private String ccId; - private Set ids; + private String ccId = ""; + private List ids = Collections.EMPTY_LIST; private static final String CONNECTED_COMPONENT_ID_PREFIX = "connect_comp"; - public ConnectedComponent(Set ids, final int cut) { - this.ids = ids; + public ConnectedComponent() { + } + public ConnectedComponent(Set ids, final int cut) { + this.ids = new ArrayList<>(ids); this.ccId = createDefaultID(); if (cut > 0 && ids.size() > cut) { @@ -31,14 +36,15 @@ public class ConnectedComponent implements Serializable { .stream() .filter(id -> !ccId.equalsIgnoreCase(id)) .limit(cut - 1) - .collect(Collectors.toSet()); + .distinct() + .collect(Collectors.toList()); // this.ids.add(ccId); ?? 
} } public ConnectedComponent(String ccId, Set ids) { this.ccId = ccId; - this.ids = ids; + this.ids = new ArrayList<>(ids); } public String createDefaultID() { @@ -82,12 +88,12 @@ public class ConnectedComponent implements Serializable { } } - public Set getIds() { + public List getIds() { return ids; } - public void setIds(Set ids) { - this.ids = ids; + public void setIds(List ids) { + this.ids =ids; } public String getCcId() { diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 70bbd066a0..8aebeda0b5 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -30,7 +30,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.util.DownloadsReport; import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector; -import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidClientTest { final int REQ_LIMIT = 24; @@ -152,7 +151,6 @@ public class OrcidClientTest { } // @Test - @Ignore private void testModifiedDate() throws ParseException { testDate(toRetrieveDate); testDate(toNotRetrieveDate); @@ -332,7 +330,6 @@ public class OrcidClientTest { } @Test - @Ignore void testUpdatedRecord() throws Exception { final String base64CompressedRecord = IOUtils .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64")); @@ -341,7 +338,6 @@ public class OrcidClientTest { } @Test - @Ignore void testUpdatedWork() throws Exception { final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA=="; final String work = 
ArgumentApplicationParser.decompressValue(base64CompressedWork); @@ -413,7 +409,6 @@ public class OrcidClientTest { } @Test - @Ignore void testDownloadedAuthor() throws Exception { final String base64CompressedWork = "H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA="; final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); @@ -421,7 +416,6 @@ public class OrcidClientTest { } @Test - @Ignore void testDownloadedWork() throws Exception { final String base64CompressedWork = "H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMz
uAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BSnn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2PmudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr896JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA=="; final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 47b0566146..60c925227b 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -18,7 +18,7 @@ scala-compile-first - initialize + process-resources add-source compile @@ -208,5 +208,64 @@ + + + scala-2.11 + + true + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/sparksolr-3 + + + + + + + + + + + scala-2.12 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/sparksolr-4 + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index cd401c6cbd..220eb4f536 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -27,12 +27,11 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.lucidworks.spark.util.SolrSupport; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.sparksolr.DHPSolrSupport; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @@ -156,12 +155,7 @@ public class XmlIndexingJob { switch (outputFormat) { case SOLR: final String collection = ProvisionConstants.getCollectionName(format); - - // SparkSolr >= 4 - // com.lucidworks.spark.BatchSizeType bt = com.lucidworks.spark.BatchSizeType.NUM_DOCS; - // SolrSupport.indexDocs(zkHost, collection, batchSize, bt, docs.rdd()); - // SparkSolr < 4 - SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); + DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); break; case HDFS: spark diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java 
b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java new file mode 100644 index 0000000000..295f0f54d7 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java @@ -0,0 +1,12 @@ +package eu.dnetlib.dhp.sparksolr; + +import com.lucidworks.spark.util.SolrSupport; +import org.apache.solr.common.SolrInputDocument; +import org.apache.spark.rdd.RDD; + +public class DHPSolrSupport { + + static public void indexDocs(String zkhost, String collection, int batchSize, RDD docs) { + SolrSupport.indexDocs(zkhost, collection, batchSize, docs); + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java new file mode 100644 index 0000000000..6b85176a3b --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java @@ -0,0 +1,12 @@ +package eu.dnetlib.dhp.sparksolr; + +import com.lucidworks.spark.util.SolrSupport; +import org.apache.solr.common.SolrInputDocument; +import org.apache.spark.rdd.RDD; + +public class DHPSolrSupport { + + static public void indexDocs(String zkhost, String collection, int batchSize, RDD docs) { + SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs); + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index a9dbb09ae1..8ce9826e2a 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -39,8 +39,8 @@ UTF-8 UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 + 1.1.0-cdh5.16.2 + 2.6.0-cdh5.16.2 @@ -72,7 +72,13 @@ org.apache.hadoop hadoop-common ${cdh.hadoop.version} - + + + jdk.tools + jdk.tools + + + eu.dnetlib.dhp dhp-common diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index 56aec73b78..4dd987f515 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -39,8 +39,8 @@ UTF-8 UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 + 1.1.0-cdh5.16.2 + 2.6.0-cdh5.16.2 @@ -67,11 +67,23 @@ org.apache.hive hive-jdbc ${cdh.hive.version} - + + + jdk.tools + jdk.tools + + + org.apache.hadoop hadoop-common ${cdh.hadoop.version} + + + jdk.tools + jdk.tools + + eu.dnetlib.dhp diff --git a/pom.xml b/pom.xml index fa4f16df39..78dda85131 100644 --- a/pom.xml +++ b/pom.xml @@ -120,11 +120,18 @@ conjars conjars - https://conjars.wensel.net/repo/ + https://conjars.wensel.net/repo/ + + + org.projectlombok + lombok + 1.18.28 + provided + org.junit.jupiter junit-jupiter @@ -812,7 +819,7 @@ org.jacoco jacoco-maven-plugin - 0.7.9 + 0.8.10 **/schemas/* @@ -963,8 +970,8 @@ 14.0.1 8.11.0 - 4.0.2 - 3.4.1 + 4.0.4 + 3.4.2-SNAPSHOT 2.14.2 3.12.0 3.7.0-M11 @@ -977,8 +984,29 @@ 3.17.2-SNAPSHOT --> + - + + java17 + + 17 + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + + --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED 
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED + true + + + + + From 8c3e9a09d38fbb9d09d1a72d7bde2183c4a65967 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 18 Sep 2023 12:51:18 +0200 Subject: [PATCH 03/34] added repository openaire-third-parties --- .../WritePredefinedProjectPropertiesTest.java | 2 +- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 18 + .../oa/dedup/graph/ConnectedComponent.java | 2 +- pom.xml | 2047 +++++++++-------- 4 files changed, 1058 insertions(+), 1011 deletions(-) diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 19e9377afd..eddcd88678 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -88,7 +88,7 @@ class WritePredefinedProjectPropertiesTest { .assertTrue( MojoExecutionException.class.isAssignableFrom(e.getClass()) || IllegalArgumentException.class.isAssignableFrom(e.getClass())); - } + } } @Test diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index cfd9acd702..154bac62c9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1,6 +1,24 @@ package eu.dnetlib.pace.util; +/* + * Diff Match and Patch + * Copyright 2018 The diff-match-patch Authors. + * https://github.com/google/diff-match-patch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Diff Match and Patch * Copyright 2018 The diff-match-patch Authors. 
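Context for the java17 profile and its long --add-opens argLine (introduced in the previous patch and carried forward here): Spark, Hadoop and some of the test utilities reach into JDK-internal packages via deep reflection, which JDK 17 denies unless the package is explicitly opened to the unnamed module. A minimal, hypothetical probe — not part of this patch series — showing the kind of failure those flags work around:

    import java.lang.reflect.Field;
    import java.nio.Buffer;
    import java.nio.ByteBuffer;

    public class AddOpensProbe {
        public static void main(String[] args) throws Exception {
            ByteBuffer buf = ByteBuffer.allocateDirect(16);
            // Deep reflection into java.nio internals, similar to what Spark's off-heap memory
            // code does. Without --add-opens=java.base/java.nio=ALL-UNNAMED this setAccessible
            // call throws InaccessibleObjectException on JDK 17, which is how Spark-based tests
            // typically break when run on a newer JVM.
            Field address = Buffer.class.getDeclaredField("address");
            address.setAccessible(true);
            System.out.println("direct buffer address = " + address.getLong(buf));
        }
    }

The remaining opens in the argLine (sun.nio.ch, sun.security.action, sun.util.calendar and the java.lang/java.util ones) presumably cover analogous reflective accesses in the Spark, Hadoop and Hive client code exercised by the tests.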
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index 4fc0a25e81..f4b3c441ae 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -93,7 +93,7 @@ public class ConnectedComponent implements Serializable { } public void setIds(List ids) { - this.ids =ids; + this.ids = ids; } public String getCcId() { diff --git a/pom.xml b/pom.xml index 78dda85131..1480af2a6e 100644 --- a/pom.xml +++ b/pom.xml @@ -1,1012 +1,1041 @@ - - - 4.0.0 - eu.dnetlib.dhp - dhp - 1.2.5-SNAPSHOT - pom - - - - GNU Affero General Public License v3.0 or later - https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText - repo - This program is free software: you can redistribute it and/or modify it under the terms of the - GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - - - - dhp-build - dhp-pace-core - dhp-common - dhp-workflows - - - - Redmine - https://support.openaire.eu/projects/openaire - - - - jenkins - https://jenkins-dnet.d4science.org/ - - - - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - https://code-repo.d4science.org/D-Net/dnet-hadoop/ - HEAD - - - This module is the root descriptor for the dnet-hadoop project - - - - - - - dnet45-releases - D-Net 45 releases - https://maven.d4science.org/nexus/content/repositories/dnet45-releases - default - - false - - - true - - - - dnet45-snapshots - D-Net 45 snapshots - https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots - default - - true - - - false - - - - dnet45-bootstrap-snapshot - D-Net 45 Bootstrap Snapshot - https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/ - - false - - - true - - default - - - dnet45-bootstrap-release - D-Net 45 Bootstrap Release - https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/ - - true - - - false - - default - - - cloudera - Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos - - true - - - false - - - - dnet-deps - dnet-dependencies - https://maven.d4science.org/nexus/content/repositories/dnet-deps - default - - - maven-restlet - Restlet repository - https://maven.restlet.talend.com - - - conjars - conjars - https://conjars.wensel.net/repo/ - - - - - - - org.projectlombok - lombok - 1.18.28 - provided - - - org.junit.jupiter - junit-jupiter - ${junit-jupiter.version} - test - - - - org.mockito - mockito-core - ${mockito-core.version} - test - - - - org.mockito - mockito-junit-jupiter - ${mockito-core.version} - test - - - - - - - - eu.dnetlib.dhp - ${dhp-schemas.artifact} - ${dhp-schemas.version} - - - org.apache.hadoop - hadoop-hdfs - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-common - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-distcp - ${dhp.hadoop.version} - provided - - - org.apache.spark - spark-core_${scala.binary.version} - ${dhp.spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${dhp.spark.version} - provided - - - org.apache.spark - 
spark-graphx_${scala.binary.version} - ${dhp.spark.version} - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - ${dhp.spark.version} - test - - - - org.slf4j - slf4j-api - ${org.slf4j.version} - provided - - - - org.slf4j - jcl-over-slf4j - ${org.slf4j.version} - provided - - - - org.apache.commons - commons-lang3 - ${dhp.commons.lang.version} - - - - org.apache.commons - commons-beanutils - ${commons-beanutils.version} - - - - - commons-validator - commons-validator - ${commons-validator.version} - - - - com.github.sisyphsu - dateparser - ${dateparser.version} - - - - me.xuender - unidecode - ${unidecode.version} - - - - com.google.guava - guava - ${dhp.guava.version} - - - - - commons-codec - commons-codec - ${commons-codec.version} - - - - commons-io - commons-io - ${commons-io.version} - - - - commons-cli - commons-cli - 1.2 - provided - - - - net.sf.saxon - Saxon-HE - 9.9.1-6 - - - - dom4j - dom4j - 1.6.1 - - - - xml-apis - xml-apis - 1.4.01 - - - - jaxen - jaxen - 1.1.6 - - - - com.mycila.xmltool - xmltool - 3.3 - - - - org.apache.solr - solr-solrj - ${solr.version} - - - * - * - - - - - com.lucidworks.spark - spark-solr - ${sparksolr.version} - - - * - * - - - - - org.apache.solr - solr-test-framework - ${solr.version} - test - - - io.dropwizard.metrics - metrics-core - 3.2.6 - test - - - - - org.apache.httpcomponents - httpclient - ${org.apache.httpcomponents.version} - - - org.apache.httpcomponents - httpmime - ${org.apache.httpcomponents.version} - - - org.noggit - noggit - 0.8 - - - org.apache.zookeeper - zookeeper - 3.4.11 - - - - net.schmizz - sshj - 0.10.0 - test - - - - com.fasterxml.jackson.core - jackson-core - ${dhp.jackson.version} - provided - - - - com.fasterxml.jackson.core - jackson-annotations - ${dhp.jackson.version} - provided - - - com.fasterxml.jackson.core - jackson-databind - ${dhp.jackson.version} - provided - - - - eu.dnetlib - dnet-actionmanager-common - ${dnet-actionmanager-common.version} - - - org.apache.hadoop - hadoop-common - - - - - eu.dnetlib - dnet-actionmanager-api - ${dnet-actionmanager-api.version} - - - eu.dnetlib - cnr-misc-utils - - - - - - eu.dnetlib - cnr-rmi-api - ${cnr-rmi-api.version} - - - - eu.dnetlib.dhp - dnet-openaire-broker-common - ${dnet-openaire-broker-common.version} - - - - org.apache.cxf - cxf-rt-transports-http - 3.1.5 - - - - javax.persistence - javax.persistence-api - 2.2 - provided - - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - - - org.mongodb - mongo-java-driver - ${mongodb.driver.version} - - - io.fares.junit.mongodb - mongodb-junit-test - 1.1.0 - - - org.postgresql - postgresql - 42.2.10 - - - - org.antlr - stringtemplate - 3.2.1 - - - - org.antlr - ST4 - 4.3.4 - - - - com.ximpleware - vtd-xml - ${vtd.version} - - - - org.elasticsearch - elasticsearch-hadoop - 7.6.0 - - - - - org.apache.oozie - oozie-client - ${dhp.oozie.version} - provided - - - - slf4j-simple - org.slf4j - - - - - - - com.squareup.okhttp3 - okhttp - ${okhttp.version} - - - - org.apache.commons - commons-compress - ${common.compress.version} - - - org.apache.commons - commons-csv - ${common.csv.version} - - - org.apache.poi - poi-ooxml - ${apache.poi.version} - - - - org.json - json - 20180813 - - - - org.json4s - json4s-jackson_${scala.binary.version} - ${json4s.version} - - - - com.github.victools - jsonschema-generator - ${jsonschemagenerator.version} - - - - org.apache.commons - commons-text - ${common.text.version} - - - - com.opencsv - opencsv - 
5.5 - - - io.github.classgraph - classgraph - 4.8.71 - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-xml - ${jackson.version} - provided - - - com.fasterxml.jackson.module - jackson-module-jsonSchema - ${jackson.version} - provided - - - - org.apache.commons - commons-math3 - 3.6.1 - - - - com.google.code.gson - gson - ${google.gson.version} - - - - commons-collections - commons-collections - ${commons.collections.version} - - - commons-logging - commons-logging - ${commons.logging.version} - - - - org.reflections - reflections - ${reflections.version} - - - - org.scala-lang - scala-library - ${scala.version} - - - - com.ibm.icu - icu4j - 70.1 - - - - org.javassist - javassist - ${javassist.version} - - - - - - target - target/classes - ${project.artifactId}-${project.version} - target/test-classes - - - - org.apache.maven.plugins - maven-plugin-plugin - 3.3 - - - org.apache.maven.plugins - maven-project-info-reports-plugin - 3.0.0 - - - org.apache.maven.plugins - maven-site-plugin - 3.9.1 - - ${dhp.site.skip} - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven.compiler.plugin.version} - - 1.8 - 1.8 - ${project.build.sourceEncoding} - - - - - org.apache.maven.plugins - maven-jar-plugin - 3.0.2 - - - - org.apache.maven.plugins - maven-source-plugin - 3.0.1 - - - attach-sources - verify - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - true - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.2.0 - - true - none - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.6.0 - - - - net.revelc.code.formatter - formatter-maven-plugin - 2.11.0 - - - eu.dnetlib.dhp - dhp-code-style - ${project.version} - - - - - org.antipathy - mvn-scalafmt_${scala.binary.version} - 1.0.1640073709.733712b - - - eu.dnetlib.dhp - dhp-code-style - ${project.version} - - - - - - - - org.apache.maven.plugins - maven-site-plugin - - - org.apache.maven.plugins - maven-project-info-reports-plugin - - - net.revelc.code.formatter - formatter-maven-plugin - - - - format - - - eclipse/formatter_dnet.xml - - - - - - net.revelc.code - impsort-maven-plugin - 1.4.1 - - java.,javax.,org.,com. 
- java,* - - **/thrift/*.java - - - - - sort-imports - - sort - - - - - - org.antipathy - mvn-scalafmt_${scala.binary.version} - - https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf - false - false - - ${project.basedir}/src/main/scala - - - ${project.basedir}/src/test/scala - - false - false - : git rev-parse --abbrev-ref HEAD - false - - - - validate - - format - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - - org.jacoco - jacoco-maven-plugin - 0.8.10 - - - **/schemas/* - **/com/cloudera/**/* - **/org/apache/avro/io/**/* - - - - - default-prepare-agent - - prepare-agent - - - - default-report - prepare-package - - report - - - - - - - - - - org.apache.maven.wagon - wagon-ssh - 2.10 - - - - - - dnet45-snapshots - DNet45 Snapshots - https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots - default - - - dnet45-releases - https://maven.d4science.org/nexus/content/repositories/dnet45-releases - - - DHPSite - ${dhp.site.stage.path}/ - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - true - none - - - - - - - sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop - UTF-8 - UTF-8 - 1.8 - 1.8 - - - 2.11.12 - 2.11 - - - 3.6.0 - 2.22.2 - 2.0.1 - 4.0.1 - - - dhp-schemas - - 4.1.2 - [2.6.1] - 1.20 - 1.8 - 1.8 - 1.9.4 - 1.9 - 3.2.1 - 2.4 - 1.1.3 - 1.7 - 1.0.7 - [3.17.1] - cdh5.9.2 - 3.5 - 11.0.2 - 2.6.0-${dhp.cdh.version} - 2.9.6 - 4.1.0-${dhp.cdh.version} - true - 2.4.0.cloudera2 - [4.0.3] - [6.0.5] - [3.1.6] - 2.2.2 - 3.19.0-GA - 3.5.3 - 4.13.0 - 5.6.1 - 3.3.3 - 3.4.2 - 4.7.2 - 4.5.3 - 1.7.25 - 0.9.10 - 1.3.0 - 7.5.0 - 3.6.0 - 0.0.7 - [2.12,3.0) - - - - - - scala-2.12 - - 2.12 - 2.12.18 - 1.3.0 - - - 4.8.1 - - - 1.22 - 1.8 - 1.10.0 - 1.9.4 - 1.15 - 3.2.2 - 2.11.0 - 1.1.3 - 1.7 - - 14.0.1 - 8.11.0 + + + 4.0.0 + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + pom + + + + GNU Affero General Public License v3.0 or later + https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText + repo + This program is free software: you can redistribute it and/or modify it under the terms of the + GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. 
+ + + + + + dhp-build + dhp-pace-core + dhp-common + dhp-workflows + + + + Redmine + https://support.openaire.eu/projects/openaire + + + + jenkins + https://jenkins-dnet.d4science.org/ + + + + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + https://code-repo.d4science.org/D-Net/dnet-hadoop/ + HEAD + + + This module is the root descriptor for the dnet-hadoop project + + + + + + + + Openaire-third-parties-snaphot + Openaire third parties Snapshot + https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/ + + false + + + true + + + + + dnet45-releases + D-Net 45 releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + default + + false + + + true + + + + dnet45-snapshots + D-Net 45 snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + true + + + false + + + + dnet45-bootstrap-snapshot + D-Net 45 Bootstrap Snapshot + https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/ + + false + + + true + + default + + + dnet45-bootstrap-release + D-Net 45 Bootstrap Release + https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/ + + true + + + false + + default + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + dnet-deps + dnet-dependencies + https://maven.d4science.org/nexus/content/repositories/dnet-deps + default + + + maven-restlet + Restlet repository + https://maven.restlet.talend.com + + + conjars + conjars + https://conjars.wensel.net/repo/ + + + + + + + org.projectlombok + lombok + 1.18.28 + provided + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + test + + + + org.mockito + mockito-core + ${mockito-core.version} + test + + + + org.mockito + mockito-junit-jupiter + ${mockito-core.version} + test + + + + + + + + eu.dnetlib.dhp + ${dhp-schemas.artifact} + ${dhp-schemas.version} + + + org.apache.hadoop + hadoop-hdfs + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-common + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-distcp + ${dhp.hadoop.version} + provided + + + org.apache.spark + spark-core_${scala.binary.version} + ${dhp.spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${dhp.spark.version} + provided + + + org.apache.spark + spark-graphx_${scala.binary.version} + ${dhp.spark.version} + provided + + + org.apache.spark + spark-hive_${scala.binary.version} + ${dhp.spark.version} + test + + + + org.slf4j + slf4j-api + ${org.slf4j.version} + provided + + + + org.slf4j + jcl-over-slf4j + ${org.slf4j.version} + provided + + + + org.apache.commons + commons-lang3 + ${dhp.commons.lang.version} + + + + org.apache.commons + commons-beanutils + ${commons-beanutils.version} + + + + + commons-validator + commons-validator + ${commons-validator.version} + + + + com.github.sisyphsu + dateparser + ${dateparser.version} + + + + me.xuender + unidecode + ${unidecode.version} + + + + com.google.guava + guava + ${dhp.guava.version} + + + + + commons-codec + commons-codec + ${commons-codec.version} + + + + commons-io + commons-io + ${commons-io.version} + + + + commons-cli + commons-cli + 1.2 + provided + + + + net.sf.saxon + Saxon-HE + 9.9.1-6 + + + + dom4j + dom4j + 1.6.1 + + + + xml-apis + xml-apis + 1.4.01 + + + + jaxen + jaxen + 1.1.6 + + + + 
com.mycila.xmltool + xmltool + 3.3 + + + + org.apache.solr + solr-solrj + ${solr.version} + + + * + * + + + + + com.lucidworks.spark + spark-solr + ${sparksolr.version} + + + * + * + + + + + org.apache.solr + solr-test-framework + ${solr.version} + test + + + io.dropwizard.metrics + metrics-core + 3.2.6 + test + + + + + org.apache.httpcomponents + httpclient + ${org.apache.httpcomponents.version} + + + org.apache.httpcomponents + httpmime + ${org.apache.httpcomponents.version} + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + + + + net.schmizz + sshj + 0.10.0 + test + + + + com.fasterxml.jackson.core + jackson-core + ${dhp.jackson.version} + provided + + + + com.fasterxml.jackson.core + jackson-annotations + ${dhp.jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + ${dhp.jackson.version} + provided + + + + eu.dnetlib + dnet-actionmanager-common + ${dnet-actionmanager-common.version} + + + org.apache.hadoop + hadoop-common + + + + + eu.dnetlib + dnet-actionmanager-api + ${dnet-actionmanager-api.version} + + + eu.dnetlib + cnr-misc-utils + + + + + + eu.dnetlib + cnr-rmi-api + ${cnr-rmi-api.version} + + + + eu.dnetlib.dhp + dnet-openaire-broker-common + ${dnet-openaire-broker-common.version} + + + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + + + + javax.persistence + javax.persistence-api + 2.2 + provided + + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + + + io.fares.junit.mongodb + mongodb-junit-test + 1.1.0 + + + org.postgresql + postgresql + 42.2.10 + + + + org.antlr + stringtemplate + 3.2.1 + + + + org.antlr + ST4 + 4.3.4 + + + + com.ximpleware + vtd-xml + ${vtd.version} + + + + org.elasticsearch + elasticsearch-hadoop + 7.6.0 + + + + + org.apache.oozie + oozie-client + ${dhp.oozie.version} + provided + + + + slf4j-simple + org.slf4j + + + + + + + com.squareup.okhttp3 + okhttp + ${okhttp.version} + + + + org.apache.commons + commons-compress + ${common.compress.version} + + + org.apache.commons + commons-csv + ${common.csv.version} + + + org.apache.poi + poi-ooxml + ${apache.poi.version} + + + + org.json + json + 20180813 + + + + org.json4s + json4s-jackson_${scala.binary.version} + ${json4s.version} + + + + com.github.victools + jsonschema-generator + ${jsonschemagenerator.version} + + + + org.apache.commons + commons-text + ${common.text.version} + + + + com.opencsv + opencsv + 5.5 + + + io.github.classgraph + classgraph + 4.8.71 + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + provided + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + provided + + + + org.apache.commons + commons-math3 + 3.6.1 + + + + com.google.code.gson + gson + ${google.gson.version} + + + + commons-collections + commons-collections + ${commons.collections.version} + + + commons-logging + commons-logging + ${commons.logging.version} + + + + org.reflections + reflections + ${reflections.version} + + + + org.scala-lang + scala-library + ${scala.version} + + + + com.ibm.icu + icu4j + 70.1 + + + + org.javassist + javassist + ${javassist.version} + + + + + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + org.apache.maven.plugins + maven-plugin-plugin + 3.3 + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.9.1 + 
+ ${dhp.site.skip} + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven.compiler.plugin.version} + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + true + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.2.0 + + true + none + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.0 + + + + net.revelc.code.formatter + formatter-maven-plugin + 2.11.0 + + + eu.dnetlib.dhp + dhp-code-style + ${project.version} + + + + + org.antipathy + mvn-scalafmt_${scala.binary.version} + 1.0.1640073709.733712b + + + eu.dnetlib.dhp + dhp-code-style + ${project.version} + + + + + + + + org.apache.maven.plugins + maven-site-plugin + + + org.apache.maven.plugins + maven-project-info-reports-plugin + + + net.revelc.code.formatter + formatter-maven-plugin + + + + format + + + eclipse/formatter_dnet.xml + + + + + + net.revelc.code + impsort-maven-plugin + 1.4.1 + + java.,javax.,org.,com. + java,* + + **/thrift/*.java + + + + + sort-imports + + sort + + + + + + org.antipathy + mvn-scalafmt_${scala.binary.version} + + + https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf + + false + false + + ${project.basedir}/src/main/scala + + + ${project.basedir}/src/test/scala + + false + false + : git rev-parse --abbrev-ref HEAD + false + + + + validate + + format + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + org.jacoco + jacoco-maven-plugin + 0.8.10 + + + **/schemas/* + **/com/cloudera/**/* + **/org/apache/avro/io/**/* + + + + + default-prepare-agent + + prepare-agent + + + + default-report + prepare-package + + report + + + + + + + + + + org.apache.maven.wagon + wagon-ssh + 2.10 + + + + + + dnet45-snapshots + DNet45 Snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + + dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + + + DHPSite + ${dhp.site.stage.path}/ + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + true + none + + + + + + + sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop + UTF-8 + UTF-8 + 1.8 + 1.8 + + + 2.11.12 + 2.11 + + + 3.6.0 + 2.22.2 + 2.0.1 + 4.0.1 + + + dhp-schemas + + 4.1.2 + [2.6.1] + 1.20 + 1.8 + 1.8 + 1.9.4 + 1.9 + 3.2.1 + 2.4 + 1.1.3 + 1.7 + 1.0.7 + [3.17.1] + cdh5.9.2 + 3.5 + 11.0.2 + 2.6.0-${dhp.cdh.version} + 2.9.6 + 4.1.0-${dhp.cdh.version} + true + 2.4.0.cloudera2 + [4.0.3] + [6.0.5] + [3.1.6] + 2.2.2 + 3.19.0-GA + 3.5.3 + 4.13.0 + 5.6.1 + 3.3.3 + 3.4.2 + 4.7.2 + 4.5.3 + 1.7.25 + 0.9.10 + 1.3.0 + 7.5.0 + 3.6.0 + 0.0.7 + [2.12,3.0) + + + + + + scala-2.12 + + 2.12 + 2.12.18 + 1.3.0 + + + 4.8.1 + + + 1.22 + 1.8 + 1.10.0 + 1.9.4 + 1.15 + 3.2.2 + 2.11.0 + 1.1.3 + 1.7 + + 14.0.1 + 8.11.0 4.0.4 - 3.4.2-SNAPSHOT - 2.14.2 - 3.12.0 - 3.7.0-M11 - 3.25.0-GA - 4.10.0 - 2.0.6 - 0.10.2 - - - + 3.4.2.openaire-SNAPSHOT + 2.14.2 + 3.12.0 + 3.7.0-M11 + 3.25.0-GA + 4.10.0 + 2.0.6 + 0.10.2 + + + - - java17 - - 17 - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - - --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED 
--add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED - true - - - - - - - + + java17 + + 17 + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + + --add-opens=java.base/java.lang=ALL-UNNAMED + --add-opens=java.base/java.lang.invoke=ALL-UNNAMED + --add-opens=java.base/java.lang.reflect=ALL-UNNAMED + --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED + --add-opens=java.base/java.nio=ALL-UNNAMED + --add-opens=java.base/java.util=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED + --add-opens=java.base/sun.nio.cs=ALL-UNNAMED + --add-opens=java.base/sun.security.action=ALL-UNNAMED + --add-opens=java.base/sun.util.calendar=ALL-UNNAMED + + true + + + + + + + From 52495f2cd2f7acc4b5e8ba0e6bc9b99e27a3ade4 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 18 Sep 2023 13:58:22 +0200 Subject: [PATCH 04/34] used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader --- .../ebi/SparkCreateBaselineDataFrame.scala | 17 ++++++++-------- .../dnetlib/dhp/sx/bio/pubmed/PMParser.scala | 3 ++- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 20 +++++++++---------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 8ac8b00bfa..6f5b7110fd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} +import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.commons.io.IOUtils @@ -14,13 +14,13 @@ import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.HttpClientBuilder import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.Aggregator import org.slf4j.{Logger, LoggerFactory} -import java.io.InputStream -import scala.io.Source -import scala.xml.pull.XMLEventReader +import java.io.{ByteArrayInputStream, InputStream} +import java.nio.charset.Charset +import javax.xml.stream.XMLInputFactory object SparkCreateBaselineDataFrame { @@ -83,7 +83,7 @@ object SparkCreateBaselineDataFrame { if (response.getStatusLine.getStatusCode > 400) { tries -= 1 } else - return IOUtils.toString(response.getEntity.getContent) + return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset()) } catch { case e: Throwable => println(s"Error on requesting ${r.getURI}") @@ 
-155,7 +155,7 @@ object SparkCreateBaselineDataFrame { IOUtils.toString( SparkEBILinksToOaf.getClass.getResourceAsStream( "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" - ) + ),Charset.defaultCharset() ) ) parser.parseArgument(args) @@ -194,10 +194,11 @@ object SparkCreateBaselineDataFrame { if (!"true".equalsIgnoreCase(skipUpdate)) { downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) + val inputFactory = XMLInputFactory.newInstance val ds: Dataset[PMArticle] = spark.createDataset( k.filter(i => i._1.endsWith(".gz")) .flatMap(i => { - val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) + val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) new PMParser(xml) }) ) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index 9102c12c43..fb941a461c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -1,7 +1,8 @@ package eu.dnetlib.dhp.sx.bio.pubmed import scala.xml.MetaData -import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} +import javax.xml.stream.XMLEventReader +import scala.xml.pull.{EvElemEnd, EvElemStart, EvText} /** @param xml */ diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index d1611300d2..c4af14c409 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension import java.io.{BufferedReader, InputStream, InputStreamReader} import java.util.zip.GZIPInputStream +import javax.xml.stream.XMLInputFactory import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.io.Source @@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest { @Test def testEBIData() = { - val inputXML = Source - .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) - .mkString - val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) + val inputFactory = XMLInputFactory.newInstance + val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) } @@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest { @Test def testParsingPubmedXML(): Unit = { - val xml = new XMLEventReader( - Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) - ) + val inputFactory = XMLInputFactory.newInstance + + val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + val parser = new PMParser(xml) parser.foreach(checkPMArticle) } @@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest { @Test def testPubmedMapping(): Unit = { - val xml = new XMLEventReader( - Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) - ) + val inputFactory = 
XMLInputFactory.newInstance + val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + val parser = new PMParser(xml) val results = ListBuffer[Oaf]() parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) From 613ec5ffceebb11740fc7ec29a406cbf7490ac14 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 21 Sep 2023 14:23:37 +0200 Subject: [PATCH 05/34] Add profiles for different spark versions: spark-24, spark-34, spark-35 --- .../eu/dnetlib/dhp/common/PacePerson.java | 2 +- dhp-pace-core/pom.xml | 88 ++++++++++++++++- .../eu/dnetlib/pace/model/SparkModel.scala | 9 +- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 1 - .../dnetlib/pace/util/SparkCompatUtils.scala | 12 +++ .../dnetlib/pace/util/SparkCompatUtils.scala | 12 +++ .../ebi/SparkCreateBaselineDataFrame.scala | 5 +- .../createunresolvedentities/ProduceTest.java | 5 +- .../opencitations/ReadCOCITest.java | 4 +- dhp-workflows/dhp-graph-provision/pom.xml | 42 ++++++++- .../dnetlib/dhp/swh/PrepareSWHActionsets.java | 3 +- pom.xml | 94 ++++++++++++++++--- 12 files changed, 245 insertions(+), 32 deletions(-) create mode 100644 dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala create mode 100644 dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java index fac9a75650..fbf586f8c5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java @@ -38,7 +38,7 @@ public class PacePerson { PacePerson.class .getResourceAsStream( "/eu/dnetlib/dhp/common/name_particles.txt"))); - } catch (IOException e) { + } catch (Exception e) { throw new RuntimeException(e); } } diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index a6d2538f29..6449b7ec89 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -24,7 +24,7 @@ scala-compile-first - initialize + process-resources add-source compile @@ -95,4 +95,90 @@ + + + spark-24 + + true + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/spark-2 + + + + + + + + + + + spark-34 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/spark-2 + + + + + + + + + + + spark-35 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/spark-35 + + + + + + + + + + diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index aa997c6e9f..63322738f7 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -2,11 +2,10 @@ package eu.dnetlib.pace.model import com.jayway.jsonpath.{Configuration, JsonPath} import eu.dnetlib.pace.config.{DedupConfig, Type} -import eu.dnetlib.pace.util.MapDocumentUtil -import org.apache.spark.sql.catalyst.encoders.RowEncoder +import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} -import org.apache.spark.sql.{Dataset, Row} import java.util.regex.Pattern 
import scala.collection.JavaConverters._ @@ -48,8 +47,8 @@ case class SparkModel(conf: DedupConfig) { val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) - val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { - df.map(r => rowFromJson(r))(RowEncoder(schema)) + val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { + df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) } def rowFromJson(json: String): Row = { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index 154bac62c9..ac37c5e5a8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -18,7 +18,6 @@ package eu.dnetlib.pace.util; * See the License for the specific language governing permissions and * limitations under the License. */ - /* * Diff Match and Patch * Copyright 2018 The diff-match-patch Authors. diff --git a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala new file mode 100644 index 0000000000..a426703d67 --- /dev/null +++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala @@ -0,0 +1,12 @@ +package eu.dnetlib.pace.util + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.types.StructType + +object SparkCompatUtils { + + def encoderFor(schema: StructType): ExpressionEncoder[Row] = { + RowEncoder(schema) + } +} \ No newline at end of file diff --git a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala new file mode 100644 index 0000000000..cbc454ae2c --- /dev/null +++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala @@ -0,0 +1,12 @@ +package eu.dnetlib.pace.util + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.StructType + +object SparkCompatUtils { + + def encoderFor(schema: StructType): ExpressionEncoder[Row] = { + ExpressionEncoder(schema) + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 6f5b7110fd..11d087583e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -155,7 +155,8 @@ object SparkCreateBaselineDataFrame { IOUtils.toString( SparkEBILinksToOaf.getClass.getResourceAsStream( "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" - ),Charset.defaultCharset() + ), + Charset.defaultCharset() ) ) parser.parseArgument(args) @@ -198,7 +199,7 @@ object SparkCreateBaselineDataFrame { val ds: Dataset[PMArticle] = spark.createDataset( k.filter(i => i._1.endsWith(".gz")) .flatMap(i => { - val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) + val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) new PMParser(xml) }) ) diff --git 
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java index ce116688a2..0a4dfc00bd 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java @@ -15,10 +15,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java index 3b416caf2c..ebde0ed0c3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java @@ -119,7 +119,9 @@ public class ReadCOCITest { workingDir.toString() + "/COCI", "-outputPath", workingDir.toString() + "/COCI_json/", - "-inputFile", "input1;input2;input3;input4;input5" + "-inputFile", "input1;input2;input3;input4;input5", + "-format", + "COCI" }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 60c925227b..4b4e6c1c4c 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -162,6 +162,18 @@ antlr4-runtime org.antlr + + woodstox-core + com.fasterxml.woodstox + + + log4j + * + + + org.apache.logging.log4j + * + @@ -210,7 +222,7 @@ - scala-2.11 + spark-24 true @@ -240,7 +252,7 @@ - scala-2.12 + spark-34 @@ -266,6 +278,32 @@ + + spark-35 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/sparksolr-4 + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java index 2691d4b7ec..230a077f7e 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java @@ -17,6 +17,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; @@ -117,7 +118,7 @@ public class PrepareSWHActionsets { .map( (MapFunction) t -> OBJECT_MAPPER.readValue(t, Software.class), Encoders.bean(Software.class)) - .filter(t -> t.getCodeRepositoryUrl() != null) + .filter((FilterFunction) t -> t.getCodeRepositoryUrl() != null) .select(col("id"), 
col("codeRepositoryUrl.value").as("repoUrl")); } diff --git a/pom.xml b/pom.xml index 1480af2a6e..8c6bcd3d13 100644 --- a/pom.xml +++ b/pom.xml @@ -174,7 +174,7 @@ eu.dnetlib.dhp - ${dhp-schemas.artifact} + dhp-schemas ${dhp-schemas.version} @@ -233,6 +233,13 @@ provided + + org.slf4j + slf4j-log4j12 + ${org.slf4j.version} + provided + + org.slf4j jcl-over-slf4j @@ -240,6 +247,28 @@ provided + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j.version} + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + + org.apache.logging.log4j + log4j-1.2-api + ${log4j.version} + + org.apache.commons commons-lang3 @@ -381,7 +410,7 @@ org.apache.zookeeper zookeeper - 3.4.11 + ${zookeeper.version} @@ -713,6 +742,7 @@ 3.0.0-M4 true + false @@ -782,7 +812,7 @@ net.revelc.code impsort-maven-plugin - 1.4.1 + 1.6.2 java.,javax.,org.,com. java,* @@ -918,8 +948,6 @@ 4.0.1 - dhp-schemas - 4.1.2 [2.6.1] 1.20 @@ -932,7 +960,7 @@ 1.1.3 1.7 1.0.7 - [3.17.1] + 4.17.2 cdh5.9.2 3.5 11.0.2 @@ -945,6 +973,7 @@ [6.0.5] [3.1.6] 2.2.2 + 1.2.17 3.19.0-GA 3.5.3 4.13.0 @@ -960,12 +989,13 @@ 3.6.0 0.0.7 [2.12,3.0) + 3.4.6 - scala-2.12 + spark-34 2.12 2.12.18 @@ -988,25 +1018,60 @@ 14.0.1 8.11.0 4.0.4 - 3.4.2.openaire-SNAPSHOT + 3.4.2.openaire 2.14.2 3.12.0 + 2.19.0 3.7.0-M11 3.25.0-GA 4.10.0 2.0.6 0.10.2 - + 3.6.3 - java17 + spark-35 + + 2.12 + 2.12.18 + 1.3.0 + + + 4.8.1 + + + 1.23.0 + 1.8 + 1.10.0 + 1.9.4 + 1.16.0 + 3.2.2 + 2.13.0 + 1.1.3 + 1.7 + + 14.0.1 + 8.11.0 + 4.0.4 + 3.5.1.openaire-SNAPSHOT + 2.15.2 + 3.12.0 + 2.20.0 + 3.7.0-M11 + 3.25.0-GA + 4.10.0 + 2.0.7 + 0.10.2 + 3.6.3 + + + + + java11 - 17 + [11 @@ -1031,6 +1096,7 @@ --add-opens=java.base/sun.util.calendar=ALL-UNNAMED true + false From 342cb6189bbbfe44dfae772fc5308f419a6f8d09 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Apr 2024 12:13:26 +0200 Subject: [PATCH 06/34] fixed problem on changed signature on RowEncoder removed property dhp.schema.artifact --- dhp-common/pom.xml | 2 +- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index d64e7e7a09..7c99ed527a 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -164,7 +164,7 @@ eu.dnetlib.dhp - ${dhp-schemas.artifact} + dhp-schemas diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index cb1c700599..bade4869fd 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; +import eu.dnetlib.pace.util.SparkCompatUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -147,7 +148,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { StructType idsSchema = StructType .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>"); - Dataset allIds = spark.emptyDataset(RowEncoder.apply(idsSchema)); + Dataset allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema)); for (EntityType entityType : ModelSupport.entityTypes.keySet()) { String entityPath = 
graphBasePath + '/' + entityType.name(); From 8dd9cf84e2ccbeec1db1d91193e813b35555bfb1 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Apr 2024 12:30:59 +0200 Subject: [PATCH 07/34] code formatted --- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index bade4869fd..c5cb299b12 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; -import eu.dnetlib.pace.util.SparkCompatUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.util.SparkCompatUtils; import scala.Tuple2; import scala.Tuple3; From 073f320c6a2735bda5d51e7bd7766f01f791651d Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 22 Apr 2024 11:32:31 +0200 Subject: [PATCH 08/34] Added module containing all the dependencies, useful for spark deploy on k8. --- .../eu/dnetlib/pace/model/SparkModel.scala | 4 +- dhp-shade-package/pom.xml | 169 ++++++++++++++++++ .../dhp/oa/dedup/SparkCreateMergeRels.java | 4 +- .../dhp/oa/dedup/SparkPropagateRelation.java | 1 - pom.xml | 1 + 5 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 dhp-shade-package/pom.xml diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index aa04188dae..e6a1c4ccc1 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -3,7 +3,7 @@ package eu.dnetlib.pace.model import com.jayway.jsonpath.{Configuration, JsonPath} import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.config.{DedupConfig, Type} -import eu.dnetlib.pace.util.MapDocumentUtil +import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) { val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { - df.map(r => rowFromJson(r))(RowEncoder(schema)) + df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) } def rowFromJson(json: String): Row = { diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml new file mode 100644 index 0000000000..128a571165 --- /dev/null +++ b/dhp-shade-package/pom.xml @@ -0,0 +1,169 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + ../pom.xml + + + + dhp-shade-package + jar + + + + DHPSite + ${dhp.site.stage.path}/dhp-common + + + + This module create a jar of all module dependencies + + + + + + eu.dnetlib.dhp + dhp-actionmanager + ${project.version} + + + 
eu.dnetlib.dhp + dhp-aggregation + ${project.version} + + + eu.dnetlib.dhp + dhp-blacklist + ${project.version} + + + eu.dnetlib.dhp + dhp-broker-events + ${project.version} + + + eu.dnetlib.dhp + dhp-dedup-openaire + ${project.version} + + + eu.dnetlib.dhp + dhp-enrichment + ${project.version} + + + eu.dnetlib.dhp + dhp-graph-mapper + ${project.version} + + + eu.dnetlib.dhp + dhp-graph-provision + ${project.version} + + + eu.dnetlib.dhp + dhp-impact-indicators + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-actionsets + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-hist-snaps + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-monitor-irish + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-promote + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-update + ${project.version} + + + eu.dnetlib.dhp + dhp-swh + ${project.version} + + + eu.dnetlib.dhp + dhp-usage-raw-data-update + ${project.version} + + + eu.dnetlib.dhp + dhp-usage-stats-build + ${project.version} + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + + + eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels + + + + + META-INF/cxf/bus-extensions.txt + + + + + *:* + + META-INF/maven/** + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + com + repackaged.com.google.common + + com.google.common.** + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 59626c1414..d48351c48a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.SparkCompatUtils; import scala.Tuple3; import scala.collection.JavaConversions; @@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction { Dataset pivotHistory = spark .createDataset( Collections.emptyList(), - RowEncoder - .apply(StructType.fromDDL("id STRING, lastUsage STRING"))); + SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING"))); if (StringUtils.isNotBlank(pivotHistoryDatabase)) { pivotHistory = spark diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index c64fbe4a4d..c7efce4d74 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -8,7 +8,6 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.sql.*; -import org.apache.spark.sql.catalyst.encoders.RowEncoder; import org.apache.spark.sql.types.StructType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/pom.xml b/pom.xml index 06e4ba9d48..fc68a666d1 100644 --- a/pom.xml +++ b/pom.xml @@ -23,6 +23,7 @@ dhp-pace-core dhp-common dhp-workflows + dhp-shade-package From 
9cd3bc0f10cc8104cd1dcde539f577ea1a3f3df9 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 26 Apr 2024 16:02:07 +0200 Subject: [PATCH 09/34] Added a new generation of the dump for scholexplorer tested with last version of spark, and strongly refactored --- .../scholexplorer/relation/relations.json | 8 + .../dhp/sx/graph/scholix/ScholixUtils.scala | 19 +- .../dhp/sx/create_scholix_dump_params.json | 5 + .../eu/dnetlib/dhp/sx/relation/relations.json | 166 ++++++++++++ .../dhp/sx/graph/ScholexplorerUtils.scala | 256 ++++++++++++++++++ .../graph/SparkCreateScholexplorerDump.scala | 130 +++++++++ .../graph/scholix/ScholixGenerationTest.scala | 17 ++ pom.xml | 2 +- 8 files changed, 597 insertions(+), 6 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala diff --git a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json index 98e8daa18c..4f0cee53d7 100644 --- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json +++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json @@ -154,5 +154,13 @@ "unknown":{ "original":"Unknown", "inverse":"Unknown" + }, + "isamongtopnsimilardocuments": { + "original": "IsAmongTopNSimilarDocuments", + "inverse": "HasAmongTopNSimilarDocuments" + }, + "hasamongtopnsimilardocuments": { + "original": "HasAmongTopNSimilarDocuments", + "inverse": "IsAmongTopNSimilarDocuments" } } \ No newline at end of file diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index a995016a8d..f256ca1a12 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -65,7 +65,11 @@ object ScholixUtils extends Serializable { } def generateScholixResourceFromResult(r: Result): ScholixResource = { - generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) + val sum = ScholixUtils.resultToSummary(r) + if (sum != null) + generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) + else + null } val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = @@ -153,6 +157,14 @@ object ScholixUtils extends Serializable { } + def invRel(rel: String): String = { + val semanticRelation = relations.getOrElse(rel.toLowerCase, null) + if (semanticRelation != null) + semanticRelation.inverse + else + null + } + def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => @@ -377,10 +389,7 @@ object ScholixUtils extends Serializable { if (persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - if 
(r.isInstanceOf[Publication]) - s.setTypology(Typology.publication) - else - s.setTypology(Typology.dataset) + s.setTypology(r.getResulttype.getClassid) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json new file mode 100644 index 0000000000..fead58ab1c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json new file mode 100644 index 0000000000..4f0cee53d7 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json @@ -0,0 +1,166 @@ +{ + "cites":{ + "original":"Cites", + "inverse":"IsCitedBy" + }, + "compiles":{ + "original":"Compiles", + "inverse":"IsCompiledBy" + }, + "continues":{ + "original":"Continues", + "inverse":"IsContinuedBy" + }, + "derives":{ + "original":"IsSourceOf", + "inverse":"IsDerivedFrom" + }, + "describes":{ + "original":"Describes", + "inverse":"IsDescribedBy" + }, + "documents":{ + "original":"Documents", + "inverse":"IsDocumentedBy" + }, + "hasmetadata":{ + "original":"HasMetadata", + "inverse":"IsMetadataOf" + }, + "hasassociationwith":{ + "original":"HasAssociationWith", + "inverse":"HasAssociationWith" + }, + "haspart":{ + "original":"HasPart", + "inverse":"IsPartOf" + }, + "hasversion":{ + "original":"HasVersion", + "inverse":"IsVersionOf" + }, + "iscitedby":{ + "original":"IsCitedBy", + "inverse":"Cites" + }, + "iscompiledby":{ + "original":"IsCompiledBy", + "inverse":"Compiles" + }, + "iscontinuedby":{ + "original":"IsContinuedBy", + "inverse":"Continues" + }, + "isderivedfrom":{ + "original":"IsDerivedFrom", + "inverse":"IsSourceOf" + }, + "isdescribedby":{ + "original":"IsDescribedBy", + "inverse":"Describes" + }, + "isdocumentedby":{ + "original":"IsDocumentedBy", + "inverse":"Documents" + }, + "isidenticalto":{ + "original":"IsIdenticalTo", + "inverse":"IsIdenticalTo" + }, + "ismetadatafor":{ + "original":"IsMetadataFor", + "inverse":"IsMetadataOf" + }, + "ismetadataof":{ + "original":"IsMetadataOf", + "inverse":"IsMetadataFor" + }, + "isnewversionof":{ + "original":"IsNewVersionOf", + "inverse":"IsPreviousVersionOf" + }, + "isobsoletedby":{ + "original":"IsObsoletedBy", + "inverse":"Obsoletes" + }, + "isoriginalformof":{ + "original":"IsOriginalFormOf", + "inverse":"IsVariantFormOf" + }, + "ispartof":{ + "original":"IsPartOf", + "inverse":"HasPart" + }, + "ispreviousversionof":{ + "original":"IsPreviousVersionOf", + "inverse":"IsNewVersionOf" + }, + "isreferencedby":{ + "original":"IsReferencedBy", + "inverse":"References" + }, + "isrelatedto":{ + "original":"IsRelatedTo", + "inverse":"IsRelatedTo" + }, + "isrequiredby":{ + "original":"IsRequiredBy", + "inverse":"Requires" + }, + "isreviewedby":{ + 
"original":"IsReviewedBy", + "inverse":"Reviews" + }, + "issourceof":{ + "original":"IsSourceOf", + "inverse":"IsDerivedFrom" + }, + "issupplementedby":{ + "original":"IsSupplementedBy", + "inverse":"IsSupplementTo" + }, + "issupplementto":{ + "original":"IsSupplementTo", + "inverse":"IsSupplementedBy" + }, + "isvariantformof":{ + "original":"IsVariantFormOf", + "inverse":"IsOriginalFormOf" + }, + "isversionof":{ + "original":"IsVersionOf", + "inverse":"HasVersion" + }, + "obsoletes":{ + "original":"Obsoletes", + "inverse":"IsObsoletedBy" + }, + "references":{ + "original":"References", + "inverse":"IsReferencedBy" + }, + "requires":{ + "original":"Requires", + "inverse":"IsRequiredBy" + }, + "related":{ + "original":"IsRelatedTo", + "inverse":"IsRelatedTo" + }, + "reviews":{ + "original":"Reviews", + "inverse":"IsReviewedBy" + }, + "unknown":{ + "original":"Unknown", + "inverse":"Unknown" + }, + "isamongtopnsimilardocuments": { + "original": "IsAmongTopNSimilarDocuments", + "inverse": "HasAmongTopNSimilarDocuments" + }, + "hasamongtopnsimilardocuments": { + "original": "HasAmongTopNSimilarDocuments", + "inverse": "IsAmongTopNSimilarDocuments" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala new file mode 100644 index 0000000000..95564d5236 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala @@ -0,0 +1,256 @@ +package eu.dnetlib.dhp.sx.graph + +import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} +import eu.dnetlib.dhp.schema.sx.scholix.{ + Scholix, + ScholixCollectedFrom, + ScholixEntityId, + ScholixIdentifier, + ScholixRelationship, + ScholixResource +} +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse + +import scala.collection.JavaConverters._ +import scala.io.Source + +case class RelationInfo( + source: String, + target: String, + relclass: String, + id: String, + collectedfrom: Seq[RelKeyValue] +) {} +case class RelKeyValue(key: String, value: String) {} + +object ScholexplorerUtils { + + val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" + + case class RelationVocabulary(original: String, inverse: String) {} + + val relations: Map[String, RelationVocabulary] = { + val input = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json") + ) + .mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + + lazy val json: json4s.JValue = parse(input) + + json.extract[Map[String, RelationVocabulary]] + } + + def invRel(rel: String): String = { + val semanticRelation = relations.getOrElse(rel.toLowerCase, null) + if (semanticRelation != null) + semanticRelation.inverse + else + null + } + + def generateDatasourceOpenAIREURLS(id: String): String = { + if (id != null && id.length > 12) + s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}" + else + null + } + + def findURLForPID( + pidValue: List[StructuredProperty], + urls: List[String] + ): List[(StructuredProperty, String)] = { + pidValue.map { p => + val pv = p.getValue + + val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase)) + (p, r.orNull) + } + } + + def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { + if (r.getInstance() == null || r.getInstance().isEmpty) + return 
List() + r.getInstance() + .asScala + .filter(i => i.getUrl != null && !i.getUrl.isEmpty) + .filter(i => i.getPid != null && i.getUrl != null) + .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) + .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)) + .distinct + .toList + } + + def generateScholixResourceFromResult(result: Result): ScholixResource = { + + if (result.getInstance() == null || result.getInstance().size() == 0) + return null + + if (result.getPid == null || result.getPid.isEmpty) + return null + + val r = new ScholixResource + r.setDnetIdentifier(result.getId) + + val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result) + if (persistentIdentifiers.isEmpty) + return null + + r.setIdentifier(persistentIdentifiers.asJava) + + r.setObjectType(result.getResulttype.getClassid) + + r.setObjectSubType( + result + .getInstance() + .asScala + .filter(i => i != null && i.getInstancetype != null) + .map(i => i.getInstancetype.getClassname) + .distinct + .head + ) + + if (result.getTitle != null && result.getTitle.asScala.nonEmpty) { + val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList + if (titles.nonEmpty) + r.setTitle(titles.head) + else + return null + } + if (result.getAuthor != null && !result.getAuthor.isEmpty) { + val authors: List[ScholixEntityId] = + result.getAuthor.asScala + .map(a => { + val entity = new ScholixEntityId() + entity.setName(a.getFullname) + if (a.getPid != null && a.getPid.size() > 0) + entity.setIdentifiers( + a.getPid.asScala + .map(sp => { + val id = new ScholixIdentifier() + id.setIdentifier(sp.getValue) + id.setSchema(sp.getQualifier.getClassid) + id + }) + .take(3) + .toList + .asJava + ) + entity + }) + .toList + if (authors.nonEmpty) + r.setCreator(authors.asJava) + + } + + val dt: List[String] = result + .getInstance() + .asScala + .filter(i => i.getDateofacceptance != null) + .map(i => i.getDateofacceptance.getValue) + .toList + if (dt.nonEmpty) + r.setPublicationDate(dt.distinct.head) + + r.setPublisher( + result + .getInstance() + .asScala + .map(i => i.getHostedby) + .filter(h => !"unknown".equalsIgnoreCase(h.getValue)) + .map(h => { + val eid = new ScholixEntityId() + eid.setName(h.getValue) + val id = new ScholixIdentifier() + id.setIdentifier(h.getKey) + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(h.getKey)) + eid.setIdentifiers(List(id).asJava) + eid + }) + .distinct + .asJava + ) + + r.setCollectedFrom( + result.getCollectedfrom.asScala + .map(cf => { + val scf = new ScholixCollectedFrom() + scf.setProvisionMode("collected") + scf.setCompletionStatus("complete") + val eid = new ScholixEntityId() + eid.setName(cf.getValue) + val id = new ScholixIdentifier() + id.setIdentifier(cf.getKey) + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey)) + eid.setIdentifiers(List(id).asJava) + scf.setProvider(eid) + scf + }) + .asJava + ) + + r + } + + def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = { + val s: Scholix = new Scholix + s.setSource(source) + if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty) + s.setLinkprovider( + relation.collectedfrom + .map(cf => { + val eid = new ScholixEntityId() + eid.setName(cf.value) + val id = new ScholixIdentifier() + id.setIdentifier(cf.key) + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(cf.key)) + 
eid.setIdentifiers(List(id).asJava) + eid + }) + .toList + .asJava + ) + else { + val eid = new ScholixEntityId() + eid.setName("OpenAIRE") + val id = new ScholixIdentifier() + id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556") + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier)) + eid.setIdentifiers(List(id).asJava) + s.setLinkprovider(List(eid).asJava) + } + s.setIdentifier(relation.id) + val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null) + if (semanticRelation == null) + return null + s.setRelationship( + new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse) + ) + s.setPublicationDate(source.getPublicationDate) + s.setPublisher(source.getPublisher) + val mockTarget = new ScholixResource + mockTarget.setDnetIdentifier(relation.target) + s.setTarget(mockTarget) + s + } + + def updateTarget(s: Scholix, t: ScholixResource): Scholix = { + + s.setTarget(t) + val spublishers: Seq[ScholixEntityId] = + if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List() + val tpublishers: Seq[ScholixEntityId] = + if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List() + val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList + s.setPublisher(mergedPublishers.asJava) + s + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala new file mode 100644 index 0000000000..9334fc6e03 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -0,0 +1,130 @@ +package eu.dnetlib.dhp.sx.graph + +import eu.dnetlib.dhp.application.AbstractScalaApplication +import eu.dnetlib.dhp.schema.oaf.{ + KeyValue, + OtherResearchProduct, + Publication, + Relation, + Result, + Software, + Dataset => OafDataset +} +import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource} +import org.apache.spark.sql.functions.{col, concat, expr, md5} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql._ +import org.slf4j.{Logger, LoggerFactory} + +class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger) + extends AbstractScalaApplication(propertyPath, args, log: Logger) { + + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ + override def run(): Unit = { + val sourcePath = parser.get("sourcePath") + log.info("sourcePath: {}", sourcePath) + val targetPath = parser.get("targetPath") + log.info("targetPath: {}", targetPath) + generateBidirectionalRelations(sourcePath, targetPath, spark) + generateScholixResource(sourcePath, targetPath, spark) + generateScholix(targetPath, spark) + } + + def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = { + val entityMap: Map[String, StructType] = Map( + "publication" -> Encoders.bean(classOf[Publication]).schema, + "dataset" -> Encoders.bean(classOf[OafDataset]).schema, + "software" -> Encoders.bean(classOf[Software]).schema, + "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema + ) + + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) + implicit val resultEncoder: Encoder[Result] = 
Encoders.bean(classOf[Result]) + + val resDs = spark.emptyDataset[ScholixResource] + val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => { + println(s"adding ${item._1}") + res.union( + spark.read + .schema(item._2) + .json(s"$inputPath/${item._1}") + .as[Result] + .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r)) + .filter(s => s != null) + ) + }) + scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource") + } + + def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = { + val relSchema = Encoders.bean(classOf[Relation]).schema + + val relDF = spark.read + .schema(relSchema) + .json(s"$inputPath/relation") + .where( + "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " + + "and relClass <> 'merges' and relClass <> 'isMergedIn'" + ) + .select("source", "target", "collectedfrom", "relClass") + + def invRel: String => String = { s => + ScholexplorerUtils.invRel(s) + } + + import org.apache.spark.sql.functions.udf + val inverseRelationUDF = udf(invRel) + val inverseRelation = relDF.select( + col("target").alias("source"), + col("source").alias("target"), + col("collectedfrom"), + inverseRelationUDF(col("relClass")).alias("relClass") + ) + + val bidRel = inverseRelation + .union(relDF) + .withColumn("id", md5(concat(col("source"), col("relClass"), col("target")))) + .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))")) + .drop("collectedfrom") + .withColumnRenamed("cf", "collectedfrom") + .distinct() + + bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation") + + } + + def generateScholix(outputPath: String, spark: SparkSession): Unit = { + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) + implicit val scholixEncoder: Encoder[Scholix] = Encoders.bean(classOf[Scholix]) + + import spark.implicits._ + val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] + val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource] + + val scholix_one_verse = relations + .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") + .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) + + scholix_one_verse + .joinWith(resource, scholix_one_verse("target.dnetIdentifier") === resource("dnetIdentifier"), "inner") + .map(k => ScholexplorerUtils.updateTarget(k._1, k._2)) + .write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(s"$outputPath/scholix") + } +} + +object SparkCreateScholexplorerDump { + val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass) + + def main(args: Array[String]): Unit = { + new SparkCreateScholexplorerDump( + log = logger, + args = args, + propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json" + ).initialize().run() + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala new file mode 100644 index 0000000000..0a2872cb48 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala @@ -0,0 +1,17 @@ +package eu.dnetlib.dhp.sx.graph.scholix + +import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump +import org.apache.spark.sql.SparkSession +import org.junit.jupiter.api.Test + 
+class ScholixGenerationTest { + + @Test + def generateScholix(): Unit = { + val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate() + val app = new SparkCreateScholexplorerDump(null, null, null) +// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) +// app.generateBidirectionalRelations("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) + app.generateScholix("/home/sandro/Downloads/scholix/", spark) + } +} diff --git a/pom.xml b/pom.xml index d3db1d3d47..9f6f1f2a90 100644 --- a/pom.xml +++ b/pom.xml @@ -960,7 +960,7 @@ 1.1.3 1.7 1.0.7 - [6.1.1] + [6.1.2-SNAPSHOT] cdh5.9.2 3.5 11.0.2 From 052c6aac9d2dd96d37d75120890aa4dc4647a19b Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 26 Apr 2024 16:03:04 +0200 Subject: [PATCH 10/34] formatted code --- .../dhp/collection/crossref/Crossref2Oaf.scala | 1 + .../dnetlib/dhp/collection/crossref/issn_pub.json | 4 ---- .../collection/crossref/CrossrefMappingTest.scala | 13 ++++++++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index 44c82e256b..c4aa64fd49 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -1025,6 +1025,7 @@ case object Crossref2Oaf { tp._1 match { case "electronic" => journal.setIssnOnline(tp._2) case "print" => journal.setIssnPrinted(tp._2) + case _ => } }) } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json index 2a9e391df8..2f1af2a6e3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json @@ -789,10 +789,6 @@ "value": "2227-9717", "type": "electronic" }, - { - "value": "VALUE", - "type": "PIPPO" - }, { "value": "1063-4584", "type": "pu" diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala index ed43bb1a19..c3ea884eb3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala @@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest -import org.junit.jupiter.api.BeforeEach +import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType +import org.apache.commons.io.IOUtils +import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.api.extension.ExtendWith import org.mockito.junit.jupiter.MockitoExtension import org.slf4j.{Logger, LoggerFactory} @@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest { super.setUpVocabulary() } + @Test + def mappingRecord(): Unit = { + val input = + 
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8") + + println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All)) + + } + } From 133ead1e3ef86be422783eddf9fd3e46738b6e02 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 29 Apr 2024 09:00:30 +0200 Subject: [PATCH 11/34] updated new version of scholexplorer Generation --- .../dhp/sx/graph/SparkCreateScholexplorerDump.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala index 9334fc6e03..1211dcc786 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -107,9 +107,13 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) + val resourceTarget = relations + .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") + .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) + scholix_one_verse - .joinWith(resource, scholix_one_verse("target.dnetIdentifier") === resource("dnetIdentifier"), "inner") - .map(k => ScholexplorerUtils.updateTarget(k._1, k._2)) + .joinWith(resourceTarget, scholix_one_verse("identifier") === resourceTarget("_1"), "inner") + .map(k => ScholexplorerUtils.updateTarget(k._1, k._2._2)) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") From 0646d0d0645341020ee12c284e0872e6e450cc11 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 May 2024 15:15:03 +0200 Subject: [PATCH 12/34] Updated main sparkApplication to avoid to require master variable --- .../eu/dnetlib/dhp/application/SparkScalaApplication.scala | 7 ++++--- .../eu/dnetlib/dhp/sx/create_scholix_dump_params.json | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala index a14c258379..526bbd2953 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala @@ -65,12 +65,13 @@ abstract class AbstractScalaApplication( val conf: SparkConf = new SparkConf() val master = parser.get("master") log.info(s"Creating Spark session: Master: $master") - SparkSession + val b = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(master) - .getOrCreate() + if (master != null) + b.master(master) + b.getOrCreate() } def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json index fead58ab1c..53fe95895f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json @@ -1,5 +1,5 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true} ] \ No newline at end of file From a860c57bbc2c6ae788c91c103873dc942e7ff473 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 May 2024 15:16:00 +0200 Subject: [PATCH 13/34] updated .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 14cd4d3450..6fafc70555 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ spark-warehouse /**/.factorypath /**/.scalafmt.conf /.java-version +/dhp-shade-package/dependency-reduced-pom.xml From db358ad0d2ffb63cd7215ec89e693274982b78e1 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 May 2024 15:25:57 +0200 Subject: [PATCH 14/34] code formatted --- .../eu/dnetlib/pace/common/PaceCommonUtils.java | 15 ++++++++------- .../main/java/eu/dnetlib/pace/model/Person.java | 11 ++++++----- .../java/eu/dnetlib/pace/util/Capitalise.java | 3 ++- .../pace/common/AbstractPaceFunctions.java | 13 +++++++------ 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java index a279271b55..61fbc24708 100644 --- a/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java @@ -1,19 +1,20 @@ package eu.dnetlib.pace.common; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; - import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; + /** * Set of common functions for the framework * diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java index c95c9d823b..6a1957183c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java @@ -1,20 +1,21 @@ package eu.dnetlib.pace.model; +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; + import eu.dnetlib.pace.common.PaceCommonUtils; import eu.dnetlib.pace.util.Capitalise; import eu.dnetlib.pace.util.DotAbbreviations; -import java.nio.charset.Charset; -import java.text.Normalizer; -import java.util.List; 
-import java.util.Set; - public class Person { private static final String UTF8 = "UTF-8"; diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java b/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java index 0153864234..671320c71c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java @@ -1,9 +1,10 @@ package eu.dnetlib.pace.util; -import com.google.common.base.Function; import org.apache.commons.lang3.text.WordUtils; +import com.google.common.base.Function; + public class Capitalise implements Function { private final char[] DELIM = { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 6bfb8b3f4b..b055077d89 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -1,12 +1,6 @@ package eu.dnetlib.pace.common; -import com.google.common.base.Joiner; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; - import java.io.IOException; import java.io.StringWriter; import java.nio.charset.StandardCharsets; @@ -15,6 +9,13 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; + /** * Set of common functions for the framework * From 6efab4d88e7ce481896e5569e1801daf81c96777 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 16 May 2024 16:19:18 +0200 Subject: [PATCH 15/34] fixed scholexplorer bug --- .../dhp/sx/graph/scholix/ScholixUtils.scala | 2 +- dhp-shade-package/pom.xml | 150 +++++++++--------- .../dhp/sx/graph/ScholexplorerUtils.scala | 15 +- .../graph/SparkCreateScholexplorerDump.scala | 23 ++- .../graph/scholix/ScholixGenerationTest.scala | 17 +- 5 files changed, 112 insertions(+), 95 deletions(-) diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index f256ca1a12..72a17777e9 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -389,7 +389,7 @@ object ScholixUtils extends Serializable { if (persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - s.setTypology(r.getResulttype.getClassid) +// s.setTypology(r.getResulttype.getClassid) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml index 128a571165..fd9c040660 100644 --- a/dhp-shade-package/pom.xml +++ b/dhp-shade-package/pom.xml @@ -31,86 +31,86 @@ dhp-actionmanager ${project.version} - - eu.dnetlib.dhp - dhp-aggregation - ${project.version} - - - eu.dnetlib.dhp - dhp-blacklist - ${project.version} - - - eu.dnetlib.dhp - dhp-broker-events - ${project.version} - - - eu.dnetlib.dhp - dhp-dedup-openaire - ${project.version} - - - eu.dnetlib.dhp - dhp-enrichment - ${project.version} - + + + + + + + + + + + + + + + + + + + + + + + + + eu.dnetlib.dhp 
dhp-graph-mapper ${project.version} - - eu.dnetlib.dhp - dhp-graph-provision - ${project.version} - - - eu.dnetlib.dhp - dhp-impact-indicators - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-actionsets - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-hist-snaps - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-monitor-irish - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-promote - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-update - ${project.version} - - - eu.dnetlib.dhp - dhp-swh - ${project.version} - - - eu.dnetlib.dhp - dhp-usage-raw-data-update - ${project.version} - - - eu.dnetlib.dhp - dhp-usage-stats-build - ${project.version} - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala index 95564d5236..f62f271e30 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala @@ -1,14 +1,8 @@ package eu.dnetlib.dhp.sx.graph +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} -import eu.dnetlib.dhp.schema.sx.scholix.{ - Scholix, - ScholixCollectedFrom, - ScholixEntityId, - ScholixIdentifier, - ScholixRelationship, - ScholixResource -} +import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixCollectedFrom, ScholixEntityId, ScholixIdentifier, ScholixRelationship, ScholixResource} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -28,6 +22,7 @@ case class RelKeyValue(key: String, value: String) {} object ScholexplorerUtils { val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" + val mapper= new ObjectMapper() case class RelationVocabulary(original: String, inverse: String) {} @@ -242,7 +237,7 @@ object ScholexplorerUtils { s } - def updateTarget(s: Scholix, t: ScholixResource): Scholix = { + def updateTarget(s: Scholix, t: ScholixResource): String = { s.setTarget(t) val spublishers: Seq[ScholixEntityId] = @@ -251,6 +246,6 @@ object ScholexplorerUtils { if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List() val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList s.setPublisher(mergedPublishers.asJava) - s + mapper.writeValueAsString(s) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala index 1211dcc786..32aa686659 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -11,7 +11,7 @@ import eu.dnetlib.dhp.schema.oaf.{ Dataset => OafDataset } import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource} -import org.apache.spark.sql.functions.{col, concat, expr, md5} +import org.apache.spark.sql.functions.{col, concat, expr, first, md5} import org.apache.spark.sql.types.StructType import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} @@ -89,7 +89,13 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: 
Array[String], lo .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))")) .drop("collectedfrom") .withColumnRenamed("cf", "collectedfrom") - .distinct() + .groupBy(col("id")) + .agg( + first("source").alias("source"), + first("target").alias("target"), + first("relClass").alias("relClass"), + first("collectedfrom").alias("collectedfrom") + ) bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation") @@ -97,27 +103,32 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo def generateScholix(outputPath: String, spark: SparkSession): Unit = { implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) - implicit val scholixEncoder: Encoder[Scholix] = Encoders.bean(classOf[Scholix]) + implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix]) import spark.implicits._ val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource] + + val scholix_one_verse = relations .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) + .map(s=> (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))) + val resourceTarget = relations .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) + scholix_one_verse - .joinWith(resourceTarget, scholix_one_verse("identifier") === resourceTarget("_1"), "inner") - .map(k => ScholexplorerUtils.updateTarget(k._1, k._2._2)) + .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner") + .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2)) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(s"$outputPath/scholix") + .text(s"$outputPath/scholix") } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala index 0a2872cb48..67d40dcf12 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala @@ -1,17 +1,28 @@ package eu.dnetlib.dhp.sx.graph.scholix +import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump -import org.apache.spark.sql.SparkSession +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.junit.jupiter.api.Test +import org.objenesis.strategy.StdInstantiatorStrategy class ScholixGenerationTest { @Test def generateScholix(): Unit = { + val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate() val app = new SparkCreateScholexplorerDump(null, null, null) -// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) -// app.generateBidirectionalRelations("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) +// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) +// app.generateBidirectionalRelations( +// 
"/home/sandro/Downloads/scholix_sample/", +// "/home/sandro/Downloads/scholix/", +// spark +// ) app.generateScholix("/home/sandro/Downloads/scholix/", spark) + + + } } From a87f9ea64317dff7afac5045a4c64bb9c8a26954 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 17 May 2024 14:16:43 +0200 Subject: [PATCH 16/34] fixed scholexplorer bug --- .../eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala | 11 +++++++++-- .../dhp/sx/graph/SparkCreateScholexplorerDump.scala | 6 +----- .../dhp/sx/graph/scholix/ScholixGenerationTest.scala | 2 -- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala index f62f271e30..d171d96d99 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala @@ -2,7 +2,14 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} -import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixCollectedFrom, ScholixEntityId, ScholixIdentifier, ScholixRelationship, ScholixResource} +import eu.dnetlib.dhp.schema.sx.scholix.{ + Scholix, + ScholixCollectedFrom, + ScholixEntityId, + ScholixIdentifier, + ScholixRelationship, + ScholixResource +} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -22,7 +29,7 @@ case class RelKeyValue(key: String, value: String) {} object ScholexplorerUtils { val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" - val mapper= new ObjectMapper() + val mapper = new ObjectMapper() case class RelationVocabulary(original: String, inverse: String) {} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala index 32aa686659..dd420ab956 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -109,19 +109,15 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource] - - val scholix_one_verse = relations .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) - .map(s=> (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))) - + .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))) val resourceTarget = relations .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) - scholix_one_verse .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner") .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2)) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala 
b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala index 67d40dcf12..204fe97941 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala @@ -22,7 +22,5 @@ class ScholixGenerationTest { // ) app.generateScholix("/home/sandro/Downloads/scholix/", spark) - - } } From 032bcc8279849cfa498bc8227f8a96c4e1a48525 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 20 May 2024 09:24:15 +0200 Subject: [PATCH 17/34] since last beta workflow we decide to introduce in the graph only MAG item with DOI and set them invisible ( this should be the same behaviour of the previous DOIBoost mapping). This commit apply this type of mapping --- .../dhp/collection/mag/MagUtility.scala | 41 +++++-------------- .../dhp/collection/mag/SparkMAGtoOAF.scala | 3 ++ .../dhp/collection/mag/MAGMappingTest.scala | 12 ++++-- 3 files changed, 22 insertions(+), 34 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala index df22a6b845..c415dd9a43 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala @@ -79,23 +79,6 @@ object MagUtility extends Serializable { private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME) private val MAGDataInfo: DataInfo = { - val di = new DataInfo - di.setDeletedbyinference(false) - di.setInferred(false) - di.setInvisible(false) - di.setTrust("0.9") - di.setProvenanceaction( - OafMapperUtils.qualifier( - ModelConstants.SYSIMPORT_ACTIONSET, - ModelConstants.SYSIMPORT_ACTIONSET, - ModelConstants.DNET_PROVENANCE_ACTIONS, - ModelConstants.DNET_PROVENANCE_ACTIONS - ) - ) - di - } - - private val MAGDataInfoInvisible: DataInfo = { val di = new DataInfo di.setDeletedbyinference(false) di.setInferred(false) @@ -111,8 +94,7 @@ object MagUtility extends Serializable { ) di } - - val datatypedict = Map( +val datatypedict = Map( "bool" -> BooleanType, "int" -> IntegerType, "uint" -> IntegerType, @@ -453,7 +435,6 @@ object MagUtility extends Serializable { case "repository" => result = new Publication() - result.setDataInfo(MAGDataInfoInvisible) qualifier( "0038", "Other literature type", @@ -488,8 +469,7 @@ object MagUtility extends Serializable { } if (result != null) { - if (result.getDataInfo == null) - result.setDataInfo(MAGDataInfo) + result.setDataInfo(MAGDataInfo) val i = new Instance i.setInstancetype(tp) i.setInstanceTypeMapping( @@ -512,7 +492,7 @@ object MagUtility extends Serializable { return null result.setCollectedfrom(List(MAGCollectedFrom).asJava) - val pidList = List( + var pidList = List( structuredProperty( paper.paperId.get.toString, qualifier( @@ -525,7 +505,7 @@ object MagUtility extends Serializable { ) ) - result.setPid(pidList.asJava) + result.setOriginalId(pidList.map(s => s.getValue).asJava) @@ -618,10 +598,9 @@ object MagUtility extends Serializable { } val instance = result.getInstance().get(0) - instance.setPid(pidList.asJava) - if (paper.doi.orNull != null) - instance.setAlternateIdentifier( - List( + + if (paper.doi.orNull != null) { + pidList = pidList ::: List( structuredProperty( paper.doi.get, qualifier( @@ 
-632,8 +611,10 @@ object MagUtility extends Serializable { ), null ) - ).asJava - ) + ) + } + instance.setPid(pidList.asJava) + result.setPid(pidList.asJava) instance.setUrl(paper.urls.get.asJava) instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) instance.setCollectedfrom(MAGCollectedFrom) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala index 5dd38970de..123d8e0f8d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala @@ -35,9 +35,12 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger) def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = { import spark.implicits._ + + spark.read .load(s"$magBasePath/mag_denormalized") .as[MAGPaper] + .filter(col("doi").isNotNull) .map(s => MagUtility.convertMAGtoOAF(s)) .filter(s => s != null) .write diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala index 59b91d66b1..3ae25decbe 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala @@ -3,13 +3,17 @@ package eu.dnetlib.dhp.collection.mag import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result} import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.col import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test + + class MAGMappingTest { val mapper = new ObjectMapper() + def mappingTest(): Unit = { val spark = SparkSession @@ -18,12 +22,12 @@ class MAGMappingTest { .master("local[*]") .getOrCreate() - val s = new SparkMagOrganizationAS(null, null, null) - - s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS") - + val s = new SparkMAGtoOAF(null, null, null) + s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF") } + + @Test def mappingMagType(): Unit = { From 834461ba26a92b98ac18e2c0206ba0cb15a2c598 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 21 May 2024 13:47:05 +0200 Subject: [PATCH 18/34] [graph provision]fixed wf definition, revised serialization of the usage counts measures --- .../utils/XmlSerializationUtils.java | 27 ++++++------------- .../dhp/oa/provision/oozie_app/workflow.xml | 2 +- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index b4d021b683..fbd647ae4d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -170,30 +170,19 @@ public class XmlSerializationUtils { return sb.toString(); } - // infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE + // + // public static String usageMeasureAsXmlElement(String name, Measure measure) { 
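		// For example, with the revised body below one element is emitted per unit of the measure:
		// assuming the element name "measure", a measure id "downloads" and attr() rendering
		// key="value" pairs, the output would look like
		//   <measure downloads="42" datasource="opendoar____::1234" />
		// (element name, measure id and datasource value here are illustrative, not taken from this patch).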
- HashSet dsIds = Optional - .ofNullable(measure.getUnit()) - .map( - m -> m - .stream() - .map(KeyValue::getKey) - .collect(Collectors.toCollection(HashSet::new))) - .orElse(new HashSet<>()); - StringBuilder sb = new StringBuilder(); - dsIds.forEach(dsId -> { + for (KeyValue kv : measure.getUnit()) { sb .append("<") - .append(name); - for (KeyValue kv : measure.getUnit()) { - sb.append(" ").append(attr(measure.getId(), kv.getValue())); - } - sb + .append(name) .append(" ") - .append(attr("datasource", dsId)) - .append("/>"); - }); + .append(attr(measure.getId(), kv.getValue())) + .append(attr("datasource", kv.getKey())) + .append(" />"); + } return sb.toString(); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 50acb4526f..a754c7a5da 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -15,8 +15,8 @@ validateXML - should the payload converter validate the XMLs false + should the payload converter validate the XMLs relPartitions From c7b32bbacc43b89a652fa9cbf8c2982150b64b36 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 23 May 2024 13:00:19 +0300 Subject: [PATCH 19/34] Update CopyDataToImpalaCluster: Update the code of acquiring the entities from Ocean cluster, through hive, in order to optimize the process and account for additional reserved keywords in Impala. Co-authored-by: Antonis Lempesis --- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++---------------- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++---------------- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++---------------- .../oozie_app/copyDataToImpalaCluster.sh | 25 +++--------------- 4 files changed, 16 insertions(+), 87 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 059fb90894..f0ea50cbd8 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -39,23 +39,9 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" - # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. 
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - function copydb() { db=$1 @@ -109,17 +95,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 1130a684da..8d32e11fb1 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -38,23 +38,9 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' 
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" - # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. -DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - function copydb() { db=$1 @@ -108,17 +94,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
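      # For reference, the same extraction can be reproduced by hand for a single view; the database
      # and view names below are illustrative examples, not values used by the workflow:
      #   db=stats_db
      #   stmt=`hive --database ${db} -e "show create table some_view;" | sed 's/WARN:.*//g'`
      #   echo -e "$stmt" | grep 'CREATE VIEW' && echo -e "$stmt" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"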
if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index de275145b3..ece71a6341 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -38,23 +38,9 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" - # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. -DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - function copydb() { db=$1 @@ -108,17 +94,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 6fc0aa7456..109f9111c1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -40,19 +40,6 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. -DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" @@ -110,17 +97,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. 
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" From 68322843e2a1fd352ac372838a1da99d2bcb0a44 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 23 May 2024 15:07:49 +0300 Subject: [PATCH 20/34] Small updates to the copy-operation to Impala Cluster: - Add a configuration-"switch" to control whether the script exits upon an error or not. - Allow the script to exit when a table could not be created. - Show the elapsed time for processing each database. --- .../oozie_app/copyDataToImpalaCluster.sh | 44 ++++++++++++++--- .../oozie_app/copyDataToImpalaCluster.sh | 46 +++++++++++++++--- .../oozie_app/copyDataToImpalaCluster.sh | 45 +++++++++++++++--- .../oozie_app/copyDataToImpalaCluster.sh | 47 +++++++++++++++---- 4 files changed, 153 insertions(+), 29 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index f0ea50cbd8..f829cecc16 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -8,6 +8,7 @@ fi export HADOOP_USER_NAME=$2 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 # Set the active HDFS node of OCEAN and IMPALA cluster. 
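# Note on the new switch: the guards added below use the form [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]];
# inside [[ ]] the -eq operator evaluates both operands arithmetically, so the bare variable name is
# dereferenced to its numeric value and writing $SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR there would behave the same way.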
OCEAN_HDFS_NODE='hdfs://nameservice1' @@ -30,7 +31,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -43,8 +46,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -53,7 +69,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -77,7 +95,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. @@ -109,12 +129,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -158,7 +183,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -186,11 +213,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 8d32e11fb1..0af44a2cce 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -8,6 +8,9 @@ fi export HADOOP_USER_NAME=$2 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 + + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" @@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -42,8 +47,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -52,7 +70,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -76,7 +96,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
@@ -108,12 +130,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -157,7 +184,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -185,11 +214,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index ece71a6341..46d4955780 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -8,6 +8,8 @@ fi export HADOOP_USER_NAME=$2 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" @@ -29,7 +31,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! 
| AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -42,8 +46,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -52,7 +69,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -76,7 +95,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. @@ -108,12 +129,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -157,7 +183,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -185,11 +213,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 109f9111c1..cd9019746c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,6 +6,8 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' @@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -45,8 +49,21 @@ export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -55,7 +72,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -79,7 +98,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. @@ -111,12 +132,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. 
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -160,7 +186,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -188,11 +216,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } STATS_DB=$1 @@ -216,6 +247,6 @@ copydb $MONITOR_DB'_ris_tail' contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" for i in ${contexts} do - tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` + tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` copydb ${MONITOR_DB}'_'${tmp} done \ No newline at end of file From b48ed6e617038aaaf9677f7eb9143ab3a464f82d Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 23 May 2024 16:58:12 +0300 Subject: [PATCH 21/34] Change configuration in the copy-operation to Impala Cluster: Set the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" parameter to "false". --- .../stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh | 3 ++- .../stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh | 2 +- .../graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh | 3 ++- .../dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index f829cecc16..26760d650f 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -8,7 +8,8 @@ fi export HADOOP_USER_NAME=$2 -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 + # Set the active HDFS node of OCEAN and IMPALA cluster. 
OCEAN_HDFS_NODE='hdfs://nameservice1' diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 0af44a2cce..26760d650f 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -8,7 +8,7 @@ fi export HADOOP_USER_NAME=$2 -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 # Set the active HDFS node of OCEAN and IMPALA cluster. diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 46d4955780..1ab3e417a0 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -8,7 +8,8 @@ fi export HADOOP_USER_NAME=$2 -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index cd9019746c..7957a659c9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,7 +6,7 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 # Set the active HDFS node of OCEAN and IMPALA cluster. 
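Taken together, the changes above leave copydb() with a single knob for error handling: every exit is
now guarded by SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR, which this patch sets to 0. A minimal, self-contained
sketch of the resulting behaviour (function and database names are illustrative, not the ones used by the
workflows): with the switch at 0 a failing database is only logged and the loop moves on to the next one,
while with the switch at 1 the first failure aborts the whole run.

    #!/usr/bin/env bash
    SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0

    copy_one_db() {            # stand-in for the real copydb() logic
      echo "copying $1"
      [ "$1" != "db_bad" ]     # pretend that db_bad fails to copy
    }

    for db in db_a db_bad db_c; do
      if ! copy_one_db "$db"; then
        echo "ERROR: copy of $db failed"
        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
          exit 1
        fi
      fi
    done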
From 15b54a345aa329fe0e256aa1e8c84050d30308f2 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 24 May 2024 13:21:28 +0300 Subject: [PATCH 22/34] added fos lvl4 --- .../dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index eb16a161e9..c0993ef0b3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as with lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), - lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') -select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 + lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'), + lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification') +select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) - join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) + join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); + DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; From 1af4224d3dc5ab9eac7f197b58afa4e5d06af87a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sun, 26 May 2024 15:43:24 +0200 Subject: [PATCH 23/34] [org dedup] avoid NPEs in SparkPrepareOrgRels --- .../main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java index 61325ab502..4fea61c18a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java @@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction { final Organization o = r._2()._2(); return new OrgSimRel( r._1()._1(), - o.getOriginalId().get(0), + Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null), Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""), Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""), Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""), From 3a7a6ecc32fd443ed1a23b8f9ffaadee285db2c1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori 
Date: Sun, 26 May 2024 16:48:11 +0200 Subject: [PATCH 24/34] [org dedup] avoid NPEs in SparkPrepareOrgRels --- .../java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java index 4fea61c18a..83ec7e5222 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java @@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction { .map( (MapFunction, Tuple2>, OrgSimRel>) r -> { OrgSimRel orgSimRel = r._1()._2(); - orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0)); + orgSimRel + .setLocal_id( + Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null)); return orgSimRel; }, Encoders.bean(OrgSimRel.class)); From 107d958b8949e756271083b94182e42667be3d3b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sun, 26 May 2024 21:23:30 +0200 Subject: [PATCH 25/34] [org dedup] avoid NPEs in SparkPrepareNewOrgs --- .../dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java index d12048b028..0507b7b9af 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java @@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction { .map( (MapFunction, Tuple2>, OrgSimRel>) r -> new OrgSimRel( "", - r._1()._2().getOriginalId().get(0), - r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "", - r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "", - r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "", - r._1()._2().getWebsiteurl() != null ? 
r._1()._2().getWebsiteurl().getValue() : "", - r._1()._2().getCollectedfrom().get(0).getValue(), + Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null), + Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""), + Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""), + Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""), + Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""), + Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null), "", structuredPropertyListToString(r._1()._2().getPid()), parseECField(r._1()._2().getEclegalbody()), From b55fed09f8a0e88b432b93da9d265cfd9533ae59 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 24 May 2024 12:28:24 +0200 Subject: [PATCH 26/34] Update to include a blackList that filters out the results we know are wrongly associated to IE --- .../CreateActionSetFromWebEntries.java | 56 ++++++++++++------- .../actionmanager/webcrawl/as_parameters.json | 7 ++- .../actionmanager/webcrawl/CreateASTest.java | 28 +++++++--- .../webcrawl/{ => input}/part-00000 | 0 .../webcrawl/{ => input}/part-00001 | 0 .../webcrawl/{ => input}/part-00002 | 0 6 files changed, 62 insertions(+), 29 deletions(-) rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/{ => input}/part-00000 (100%) rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/{ => input}/part-00001 (100%) rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/{ => input}/part-00002 (100%) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java index eb370e981a..541ed8e103 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java @@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.*; import org.apache.spark.sql.types.StructType; @@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable { final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + final String blackListInputPath = parser.get("blackListPath"); + log.info("blackListInputPath: {}", blackListInputPath); + SparkConf conf = new SparkConf(); runWithSparkSession( @@ -77,35 +81,40 @@ public class CreateActionSetFromWebEntries implements Serializable { isSparkSessionManaged, spark -> { - createActionSet(spark, inputPath, outputPath); + createActionSet(spark, inputPath, outputPath, blackListInputPath); }); } public static void createActionSet(SparkSession spark, String inputPath, - String outputPath) { + String outputPath, String blackListInputPath) { final Dataset dataset = readWebCrawl(spark, inputPath) - .filter("publication_year <= 2020 or country_code=='IE'") + .filter("country_code=='IE'") .drop("publication_year"); - 
dataset.flatMap((FlatMapFunction) row -> { - List ret = new ArrayList<>(); - final String ror = ROR_PREFIX - + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); - ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); - ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror)); - ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror)); + final Dataset blackList = readBlackList(spark, blackListInputPath); - return ret - .iterator(); - }, Encoders.bean(Relation.class)) - .toJavaRDD() - .map(p -> new AtomicAction(p.getClass(), p)) - .mapToPair( - aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + dataset.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left") + .filter((FilterFunction) r -> r.getAs("OpenAlexId") == null) + .drop("OpenAlexId") + .flatMap((FlatMapFunction) row -> { + List ret = new ArrayList<>(); + final String ror = ROR_PREFIX + + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); + ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); + ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror)); + ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror)); + + return ret + .iterator(); + }, Encoders.bean(Relation.class)) + .toJavaRDD() + .map(p -> new AtomicAction(p.getClass(), p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);//, GzipCodec.class); } @@ -136,6 +145,15 @@ public class CreateActionSetFromWebEntries implements Serializable { } + private static Dataset readBlackList(SparkSession spark, String inputPath){ + + return spark + .read() + .option("header", true) + .csv(inputPath) + .select("OpenAlexId"); + } + private static List createAffiliationRelationPairPMCID(String pmcid, String ror) { if (pmcid == null) return new ArrayList<>(); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json index 3f056edf77..b79140b3a7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json @@ -16,5 +16,10 @@ "paramLongName": "isSparkSessionManaged", "paramDescription": "the hdfs name node", "paramRequired": false - } + },{ + "paramName": "bl", + "paramLongName": "blackListPath", + "paramDescription": "the working path", + "paramRequired": true +} ] diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java index 402f07d4d7..c574a58129 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java @@ -75,8 +75,12 @@ public class CreateASTest { String inputPath = getClass() .getResource( - "/eu/dnetlib/dhp/actionmanager/webcrawl/") + 
"/eu/dnetlib/dhp/actionmanager/webcrawl/input/") .getPath(); + String blackListPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/") + .getPath(); CreateActionSetFromWebEntries .main( @@ -86,7 +90,8 @@ public class CreateASTest { "-sourcePath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet1" + workingDir.toString() + "/actionSet1", + "-blackListPath", blackListPath }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -96,7 +101,7 @@ public class CreateASTest { .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); - Assertions.assertEquals(64, tmp.count()); + Assertions.assertEquals(58, tmp.count()); } @@ -109,6 +114,10 @@ public class CreateASTest { .getResource( "/eu/dnetlib/dhp/actionmanager/webcrawl/") .getPath(); + String blackListPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/") + .getPath(); CreateActionSetFromWebEntries .main( @@ -118,7 +127,8 @@ public class CreateASTest { "-sourcePath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet1" + workingDir.toString() + "/actionSet1", + "-blackListPath", blackListPath }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -184,7 +194,7 @@ public class CreateASTest { Assertions .assertEquals( - 5, tmp + 2, tmp .filter( r -> r .getSource() @@ -197,7 +207,7 @@ public class CreateASTest { Assertions .assertEquals( - 5, tmp + 2, tmp .filter( r -> r .getTarget() @@ -210,7 +220,7 @@ public class CreateASTest { Assertions .assertEquals( - 2, tmp + 1, tmp .filter( r -> r .getTarget() @@ -224,7 +234,7 @@ public class CreateASTest { Assertions .assertEquals( - 2, tmp + 1, tmp .filter( r -> r .getTarget() @@ -238,7 +248,7 @@ public class CreateASTest { Assertions .assertEquals( - 1, tmp + 0, tmp .filter( r -> r .getTarget() diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00000 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000 similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00000 rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000 diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00001 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001 similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00001 rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001 diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00002 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002 similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00002 rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002 From 87c9c61b414657777eb5d65323851d4ec70bd18d Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 24 May 2024 15:23:42 +0200 Subject: [PATCH 27/34] Update to include a blackList that filters out the results we know are wrongly associated to 
IE - refactoring --- .../CreateActionSetFromWebEntries.java | 49 ++++++++++--------- .../actionmanager/webcrawl/CreateASTest.java | 16 +++--- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java index 541ed8e103..27970f2c34 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java @@ -95,26 +95,27 @@ public class CreateActionSetFromWebEntries implements Serializable { final Dataset blackList = readBlackList(spark, blackListInputPath); - dataset.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left") - .filter((FilterFunction) r -> r.getAs("OpenAlexId") == null) - .drop("OpenAlexId") - .flatMap((FlatMapFunction) row -> { - List ret = new ArrayList<>(); - final String ror = ROR_PREFIX - + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); - ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); - ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror)); - ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror)); + dataset + .join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left") + .filter((FilterFunction) r -> r.getAs("OpenAlexId") == null) + .drop("OpenAlexId") + .flatMap((FlatMapFunction) row -> { + List ret = new ArrayList<>(); + final String ror = ROR_PREFIX + + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); + ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); + ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror)); + ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror)); - return ret - .iterator(); - }, Encoders.bean(Relation.class)) - .toJavaRDD() - .map(p -> new AtomicAction(p.getClass(), p)) - .mapToPair( - aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);//, GzipCodec.class); + return ret + .iterator(); + }, Encoders.bean(Relation.class)) + .toJavaRDD() + .map(p -> new AtomicAction(p.getClass(), p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); } @@ -145,13 +146,13 @@ public class CreateActionSetFromWebEntries implements Serializable { } - private static Dataset readBlackList(SparkSession spark, String inputPath){ + private static Dataset readBlackList(SparkSession spark, String inputPath) { return spark - .read() - .option("header", true) - .csv(inputPath) - .select("OpenAlexId"); + .read() + .option("header", true) + .csv(inputPath) + .select("OpenAlexId"); } private static List createAffiliationRelationPairPMCID(String pmcid, String ror) { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java index c574a58129..e9291f93c5 100644 --- 
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java @@ -78,9 +78,9 @@ public class CreateASTest { "/eu/dnetlib/dhp/actionmanager/webcrawl/input/") .getPath(); String blackListPath = getClass() - .getResource( - "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/") + .getPath(); CreateActionSetFromWebEntries .main( @@ -91,7 +91,7 @@ public class CreateASTest { inputPath, "-outputPath", workingDir.toString() + "/actionSet1", - "-blackListPath", blackListPath + "-blackListPath", blackListPath }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -115,9 +115,9 @@ public class CreateASTest { "/eu/dnetlib/dhp/actionmanager/webcrawl/") .getPath(); String blackListPath = getClass() - .getResource( - "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/") + .getPath(); CreateActionSetFromWebEntries .main( @@ -128,7 +128,7 @@ public class CreateASTest { inputPath, "-outputPath", workingDir.toString() + "/actionSet1", - "-blackListPath", blackListPath + "-blackListPath", blackListPath }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); From 75d5ddb999fe9a061b8211f828ffa35e6b8c763d Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 24 May 2024 16:01:19 +0200 Subject: [PATCH 28/34] Update to include a blackList that filters out the results we know are wrongly associated to IE - update workflow definition - the blacklist parameter --- .../eu/dnetlib/dhp/actionmanager/webcrawl/job.properties | 1 + .../eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml | 1 + 2 files changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties index f616baea70..d7bd709fca 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties @@ -1,2 +1,3 @@ sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/ outputPath=/tmp/miriam/webcrawlComplete/ +blackListPath=/user/miriam.baglioni/openalex-blackList diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml index 653a7d3842..b9394c7e69 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml @@ -45,6 +45,7 @@ --sourcePath${sourcePath} --outputPath${outputPath} + --blackListPath${blackListPath} From 73316d8c8353408670ef07c837cb626fdc508e90 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 28 May 2024 14:14:51 +0200 Subject: [PATCH 29/34] Add jaxb and jaxws dependencies when compiling with spark-34 profile as they are required to run with jdk > 8 --- dhp-common/pom.xml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 
2c7a0ef8c2..bfec019af6 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -169,4 +169,23 @@ + + + + spark-34 + + + javax.xml.bind + jaxb-api + 2.2.11 + + + com.sun.xml.ws + jaxws-ri + 2.3.3 + pom + + + + From e3f28338c147571f54c81fa9996b0c03f8f95455 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 28 May 2024 17:51:45 +0300 Subject: [PATCH 30/34] Miscellaneous updates to the copying operation to Impala Cluster: - Assign the WRITE and EXECUTE permissions to the DBs' HDFS-directories, in order to be able to create tables on top of them, in the Impala Cluster. - Make sure the "copydb" function returns early, when it encounters a fatal error, while respecting the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config. --- .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ .../oozie_app/copyDataToImpalaCluster.sh | 31 +++++++++++++------ 4 files changed, 88 insertions(+), 36 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 26760d650f..ca0f7a6433 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -72,6 +72,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -90,19 +92,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. - #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. + hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -131,7 +144,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. 
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -139,7 +152,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -185,7 +198,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -215,7 +228,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 26760d650f..ca0f7a6433 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -72,6 +72,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -90,19 +92,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. - #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. + hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. 
GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -131,7 +144,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -139,7 +152,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -185,7 +198,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -215,7 +228,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 1ab3e417a0..dd2203eef3 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -72,6 +72,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -90,19 +92,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. - #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. 
+ hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -131,7 +144,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -139,7 +152,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -185,7 +198,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -215,7 +228,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 7957a659c9..918775f495 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -74,6 +74,8 @@ function copydb() { rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 + else + return 2 fi fi @@ -92,19 +94,30 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # Check the exit status of the "hadoop distcp" command. - if [ $? -eq 0 ]; then - echo -e "\nSuccessfully copied the files of '${db}'.\n" + if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 3 + else + return 3 fi fi - # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
- #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db + # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB. + hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param. + if [ $? -ne 0 ]; then # Check the exit status.. + echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n" + rm -f error.log + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + else + return 4 + fi + fi echo -e "\nCreating schema for db: '${db}'\n" @@ -133,7 +146,7 @@ function copydb() { if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 4 + exit 5 fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log @@ -141,7 +154,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 5 + exit 6 fi fi fi @@ -187,7 +200,7 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 6 + exit 7 fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -217,7 +230,7 @@ function copydb() { echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 7 + exit 8 fi fi From 5d85b70e1fbd724ab34648e3ace3282ce40bae58 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Wed, 29 May 2024 11:55:00 +0200 Subject: [PATCH 31/34] [NOAMI] removed Ireland funder id 501100011103. 
ticket 9635 --- .../eu/dnetlib/dhp/collection/crossref/irish_funder.json | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json index f0275e06bc..e4f491e5c6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json @@ -625,12 +625,6 @@ "name": "Alimentary Health", "synonym": [] }, - { - "id": "501100011103", - "uri": "http://dx.doi.org/10.13039/501100011103", - "name": "Rann\u00eds", - "synonym": [] - }, { "id": "501100012354", "uri": "http://dx.doi.org/10.13039/501100012354", From e996787be232c0a4f214712d6fffc0884ab2c400 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 29 May 2024 15:05:17 +0200 Subject: [PATCH 32/34] OSF test --- .../collection/plugin/rest/OsfPreprintCollectorTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java index 90f4c7f25b..0e64f8bab2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest { private final String resumptionType = "page"; private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; - private final String resultSizeParam = ""; - private final String resultSizeValue = ""; + private final String resultSizeParam = "page[size]"; + private final String resultSizeValue = "100"; private final String resultFormatParam = "format"; private final String resultFormatValue = "json"; @@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest { final AtomicInteger i = new AtomicInteger(0); final Stream stream = this.rcp.collect(this.api, new AggregatorReport()); - stream.limit(200).forEach(s -> { + stream.limit(2000).forEach(s -> { Assertions.assertTrue(s.length() > 0); i.incrementAndGet(); log.info(s); From 1b165a14a09394adda40aca0eb3df238d471a448 Mon Sep 17 00:00:00 2001 From: Alessia Date: Wed, 29 May 2024 15:41:36 +0200 Subject: [PATCH 33/34] Rest collector plugin on hadoop supports a new param to pass request headers --- .../plugin/rest/RestCollectorPlugin.java | 11 ++-- .../collection/plugin/rest/RestIterator.java | 62 +++++++++++++------ .../plugin/rest/RestCollectorPluginTest.java | 42 ++++++++++--- .../plugin/rest/RestIteratorTest.java | 2 +- 4 files changed, 83 insertions(+), 34 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 997948687b..8445e49e0f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -1,12 +1,14 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.util.Map; import 
java.util.Optional; import java.util.Spliterator; import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.gson.Gson; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.collection.ApiDescriptor; @@ -47,6 +49,9 @@ public class RestCollectorPlugin implements CollectorPlugin { final String entityXpath = api.getParams().get("entityXpath"); final String authMethod = api.getParams().get("authMethod"); final String authToken = api.getParams().get("authToken"); + final String requestHeaderMap = api.getParams().get("requestHeaderMap"); + Gson gson = new Gson(); + Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class); final String resultSizeValue = Optional .ofNullable(api.getParams().get("resultSizeValue")) .filter(StringUtils::isNotBlank) @@ -64,9 +69,6 @@ public class RestCollectorPlugin implements CollectorPlugin { if (StringUtils.isBlank(resultFormatValue)) { throw new CollectorException("Param 'resultFormatValue' is null or empty"); } - if (StringUtils.isBlank(queryParams)) { - throw new CollectorException("Param 'queryParams' is null or empty"); - } if (StringUtils.isBlank(entityXpath)) { throw new CollectorException("Param 'entityXpath' is null or empty"); } @@ -92,7 +94,8 @@ public class RestCollectorPlugin implements CollectorPlugin { entityXpath, authMethod, authToken, - resultOutputFormat); + resultOutputFormat, + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 76af6cff1a..e51c9eb1b8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -9,6 +9,7 @@ import java.net.URL; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.Iterator; +import java.util.Map; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; @@ -24,6 +25,7 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; +import com.google.common.collect.Maps; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; @@ -49,13 +51,14 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; */ public class RestIterator implements Iterator { + private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; private final HttpClientParams clientParams; - private final String BASIC = "basic"; + private final String AUTHBASIC = "basic"; private final String baseUrl; private final String resumptionType; @@ -89,6 +92,12 @@ public class RestIterator implements Iterator { */ private final String resultOutputFormat; + /* + Can be used to set additional request headers, like for content negotiation + */ + private Map requestHeaders; + + /** * RestIterator class compatible to version 1.3.33 */ @@ -107,7 +116,8 @@ public class RestIterator implements Iterator { final String entityXpath, final String authMethod, final String authToken, - final String resultOutputFormat) { + final String resultOutputFormat, + final Map requestHeaders) { this.clientParams = clientParams; this.baseUrl = 
baseUrl; @@ -119,6 +129,7 @@ public class RestIterator implements Iterator { this.authMethod = authMethod; this.authToken = authToken; this.resultOutputFormat = resultOutputFormat; + this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap(); this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; @@ -231,25 +242,20 @@ public class RestIterator implements Iterator { final URL qUrl = new URL(query); log.debug("authMethod: {}", this.authMethod); - if ("bearer".equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken); - conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else if (this.BASIC.equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken); - conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else { - theHttpInputStream = qUrl.openStream(); + if (this.authMethod == "bearer") { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); + requestHeaders.put("Authorization", "Bearer " + authToken); + //requestHeaders.put("Content-Type", "application/json"); + } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); + requestHeaders.put("Authorization", "Basic " + authToken); + //requestHeaders.put("accept", "application/xml"); } + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestMethod("GET"); + this.setRequestHeader(conn); + resultStream = conn.getInputStream(); - this.resultStream = theHttpInputStream; if ("json".equals(this.resultOutputFormat)) { resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8); resultXml = JsonUtils.convertToXML(resultJson); @@ -380,7 +386,7 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); @@ -433,6 +439,22 @@ public class RestIterator implements Iterator { } } + /** + * setRequestHeader + * + * setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value. 
+ * @param conn + */ + private void setRequestHeader(HttpURLConnection conn) { + if (requestHeaders != null) { + for (String key : requestHeaders.keySet()) { + conn.setRequestProperty(key, requestHeaders.get(key)); + } + log.debug("Set Request Header with: " + requestHeaders); + } + + } + public String getResultFormatValue() { return this.resultFormatValue; } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index f708c367b3..a9fc325c36 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -4,10 +4,16 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; +import com.google.gson.Gson; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,18 +31,18 @@ class RestCollectorPluginTest { private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); - private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; - private final String resumptionType = "count"; - private final String resumptionParam = "from"; - private final String entityXpath = "//hits/hits"; - private final String resumptionXpath = "//hits"; - private final String resultTotalXpath = "//hits/total"; - private final String resultFormatParam = "format"; + private final String baseUrl = "https://ddh-openapi.worldbank.org/search"; + private final String resumptionType = "discover"; + private final String resumptionParam = "skip"; + private final String entityXpath = "//*[local-name()='data']"; + private final String resumptionXpath = ""; + private final String resultTotalXpath = "//*[local-name()='count']"; + private final String resultFormatParam = ""; private final String resultFormatValue = "json"; - private final String resultSizeParam = "size"; + private final String resultSizeParam = "top"; private final String resultSizeValue = "10"; // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; - private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + private final String query = ""; // private String query = "=(sources:engrXiv AND type:preprint)"; private final String protocolDescriptor = "rest_json2xml"; @@ -56,10 +62,12 @@ class RestCollectorPluginTest { params.put("resultSizeValue", resultSizeValue); params.put("queryParams", query); params.put("entityXpath", entityXpath); + params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}"); api.setBaseUrl(baseUrl); api.setParams(params); + rcp = new RestCollectorPlugin(new HttpClientParams()); } @@ -78,4 +86,20 @@ class RestCollectorPluginTest { log.info("{}", i.intValue()); Assertions.assertTrue(i.intValue() > 0); } + + @Disabled + @Test + void testUrl() throws IOException { + String url_s = "https://ddh-openapi.worldbank.org/search?&top=10"; + URL url = new URL(url_s); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + 
conn.setRequestProperty("User-Agent", "OpenAIRE"); + Gson gson = new Gson(); + System.out.println("Request header"); + System.out.println(gson.toJson(conn.getHeaderFields())); + InputStream inputStream = conn.getInputStream(); + + + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java index e2d6ad3e7f..ed31c2b7ec 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -44,7 +44,7 @@ public class RestIteratorTest { final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, - query, entityXpath, authMethod, authToken, resultOffsetParam); + query, entityXpath, authMethod, authToken, resultOffsetParam, null); int i = 20; while (iterator.hasNext() && i > 0) { String result = iterator.next(); From c272c4ad68255820fe6d9fd3d4aac182da1f5678 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 29 May 2024 15:50:07 +0200 Subject: [PATCH 34/34] code formatting --- .../plugin/rest/RestCollectorPlugin.java | 5 +++-- .../dhp/collection/plugin/rest/RestIterator.java | 16 ++++++++-------- .../plugin/rest/RestCollectorPluginTest.java | 5 ++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 8445e49e0f..f4ba09f72b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -8,9 +8,10 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import com.google.gson.Gson; import org.apache.commons.lang3.StringUtils; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; @@ -95,7 +96,7 @@ public class RestCollectorPlugin implements CollectorPlugin { authMethod, authToken, resultOutputFormat, - requestHeaders); + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index e51c9eb1b8..2518fd92fe 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -25,7 +25,6 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import com.google.common.collect.Maps; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; @@ -36,6 +35,8 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; +import 
com.google.common.collect.Maps; + import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; @@ -51,7 +52,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; */ public class RestIterator implements Iterator { - private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; @@ -93,11 +93,10 @@ public class RestIterator implements Iterator { private final String resultOutputFormat; /* - Can be used to set additional request headers, like for content negotiation - */ + * Can be used to set additional request headers, like for content negotiation + */ private Map requestHeaders; - /** * RestIterator class compatible to version 1.3.33 */ @@ -245,11 +244,11 @@ public class RestIterator implements Iterator { if (this.authMethod == "bearer") { log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); requestHeaders.put("Authorization", "Bearer " + authToken); - //requestHeaders.put("Content-Type", "application/json"); + // requestHeaders.put("Content-Type", "application/json"); } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); requestHeaders.put("Authorization", "Basic " + authToken); - //requestHeaders.put("accept", "application/xml"); + // requestHeaders.put("accept", "application/xml"); } HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); conn.setRequestMethod("GET"); @@ -386,7 +385,8 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) + && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index a9fc325c36..99b95d9e38 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -13,11 +13,12 @@ import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; -import com.google.gson.Gson; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; @@ -67,7 +68,6 @@ class RestCollectorPluginTest { api.setBaseUrl(baseUrl); api.setParams(params); - rcp = new RestCollectorPlugin(new HttpClientParams()); } @@ -100,6 +100,5 @@ class RestCollectorPluginTest { System.out.println(gson.toJson(conn.getHeaderFields())); InputStream inputStream = conn.getInputStream(); - } }
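
The request-header support introduced in the last two patches comes down to three steps: the plugin reads an optional requestHeaderMap parameter, Gson deserializes that JSON object into a String-to-String map, and RestIterator applies each entry to the outgoing HttpURLConnection before opening the stream. The following is a minimal, self-contained sketch of that pattern only; the class name and sample header values are illustrative and are not part of the patch series, and the patch itself deserializes with the raw Map.class rather than a TypeToken.

    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.util.Map;

    import com.google.gson.Gson;
    import com.google.gson.reflect.TypeToken;

    // Hypothetical standalone sketch of the requestHeaderMap handling described in patches 33-34.
    public class RequestHeaderSketch {

        public static void main(String[] args) throws Exception {
            // JSON map as it would arrive via the "requestHeaderMap" API parameter
            String requestHeaderMap = "{\"User-Agent\": \"OpenAIRE DEV\", \"Accept\": \"application/json\"}";

            // Gson turns the JSON object into a String-to-String map
            // (the patched plugin uses gson.fromJson(requestHeaderMap, Map.class); TypeToken keeps this sketch type-safe)
            Map<String, String> headers = new Gson()
                .fromJson(requestHeaderMap, new TypeToken<Map<String, String>>() {}.getType());

            // Endpoint borrowed from RestCollectorPluginTest
            HttpURLConnection conn = (HttpURLConnection) new URL("https://ddh-openapi.worldbank.org/search?&top=10")
                .openConnection();
            conn.setRequestMethod("GET");

            // Apply every configured header before opening the stream, mirroring RestIterator.setRequestHeader
            if (headers != null) {
                headers.forEach(conn::setRequestProperty);
            }

            System.out.println("HTTP " + conn.getResponseCode());
        }
    }

Run against the World Bank endpoint used in RestCollectorPluginTest, this should send the configured headers and print the resulting HTTP status code.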