This commit is contained in:
Claudio Atzori 2024-05-27 11:59:02 +02:00
commit 0d5bdb2db0
56 changed files with 2435 additions and 1229 deletions

1
.gitignore vendored
View File

@ -27,3 +27,4 @@ spark-warehouse
/**/.factorypath /**/.factorypath
/**/.scalafmt.conf /**/.scalafmt.conf
/.java-version /.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder; mojo.outputFile = testFolder;
// execute // execute
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); try {
mojo.execute();
Assertions.assertTrue(false); // not reached
} catch (Exception e) {
Assertions
.assertTrue(
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
}
} }
@Test @Test

View File

@ -70,10 +70,7 @@
<groupId>com.ibm.icu</groupId> <groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId> <artifactId>icu4j</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
@ -163,7 +160,7 @@
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>${dhp-schemas.artifact}</artifactId> <artifactId>dhp-schemas</artifactId>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class PacePerson.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt"))); "/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) { } catch (Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }

View File

@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930 * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930 * concept_rec_id = 656930
* @return response code * @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/ */
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException { public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1); setDepositionId(concept_rec_id, 1);

View File

@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders; import org.apache.http.HttpHeaders;
import org.joda.time.Instant;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -154,5 +154,13 @@
"unknown":{ "unknown":{
"original":"Unknown", "original":"Unknown",
"inverse":"Unknown" "inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
} }
} }

View File

@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val master = parser.get("master") val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master") log.info(s"Creating Spark session: Master: $master")
SparkSession val b = SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(master) if (master != null)
.getOrCreate() b.master(master)
b.getOrCreate()
} }
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
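The builder above now applies .master() only when a master was actually parsed from the arguments, so a master configured externally (for example by spark-submit on YARN) is no longer overridden by a null value. A minimal sketch of the same pattern, with the argument parser replaced by a plain Option purely for illustration:

// Illustrative sketch only: conf and the app name are placeholders; the point
// is that master() is set conditionally instead of unconditionally.
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

def createSession(conf: SparkConf, master: Option[String]): SparkSession = {
  val builder = SparkSession.builder().config(conf).appName("example")
  // only force the master when one was provided on the command line
  master.fold(builder)(m => builder.master(m)).getOrCreate()
}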

View File

@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
} }
def generateScholixResourceFromResult(r: Result): ScholixResource = { def generateScholixResourceFromResult(r: Result): ScholixResource = {
val sum = ScholixUtils.resultToSummary(r)
if (sum != null)
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
else
null
} }
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
} }
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty) if (persistentIdentifiers.isEmpty)
return null return null
s.setLocalIdentifier(persistentIdentifiers.asJava) s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication]) // s.setTypology(r.getResulttype.getClassid)
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
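The invRel helper added above resolves the inverse of a relation label through the module's relation vocabulary; the lookup key is lower-cased, and unknown labels yield null rather than an exception. A small illustrative sketch (the expected values assume the vocabulary pairs shown later in this commit, such as IsPartOf/HasPart):

// REPL-style illustration; ScholixUtils is assumed to be in scope.
val inverse = ScholixUtils.invRel("IsPartOf")     // expected: "HasPart"
val missing = ScholixUtils.invRel("NotARelation") // expected: null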

View File

@ -24,7 +24,7 @@
<executions> <executions>
<execution> <execution>
<id>scala-compile-first</id> <id>scala-compile-first</id>
<phase>initialize</phase> <phase>process-resources</phase>
<goals> <goals>
<goal>add-source</goal> <goal>add-source</goal>
<goal>compile</goal> <goal>compile</goal>
@ -59,14 +59,6 @@
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
@ -91,10 +83,6 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
@ -113,4 +101,90 @@
</dependency> </dependency>
</dependencies> </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-35</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project> </project>

View File

@ -1,12 +1,6 @@
package eu.dnetlib.pace.common; package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
@ -15,6 +9,13 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
/** /**
* Set of common functions for the framework * Set of common functions for the framework
* *

View File

@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath} import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.MapDocumentUtil import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
df.map(r => rowFromJson(r))(RowEncoder(schema)) df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
} }
def rowFromJson(json: String): Row = { def rowFromJson(json: String): Row = {

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
RowEncoder(schema)
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
ExpressionEncoder(schema)
}
}
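The two objects above share the same package and signature, so calling code compiles unchanged whichever source directory the active Maven profile (spark-24/spark-34 versus spark-35) adds to the build: up to Spark 3.4 the row encoder is obtained through RowEncoder(schema), while under Spark 3.5 it is built with ExpressionEncoder(schema). A hedged sketch of a caller, mirroring how SparkModel and the dedup jobs use the shim:

// Illustration only: an empty Dataset[Row] built through the compatibility shim.
import eu.dnetlib.pace.util.SparkCompatUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val schema = StructType.fromDDL("id STRING, lastUsage STRING")
val empty = spark.emptyDataset(SparkCompatUtils.encoderFor(schema)) // Dataset[Row]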

View File

@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest { public class UtilTest {

169
dhp-shade-package/pom.xml Normal file
View File

@ -0,0 +1,169 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-shade-package</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module creates a jar of all module dependencies</description>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-broker-events</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-impact-indicators</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-actionsets</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-hist-snaps</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-monitor-irish</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-promote</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-swh</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-raw-data-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-stats-build</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -9,6 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
.union(openAPCRelations) .union(openAPCRelations)
.union(dataciteRelations) .union(dataciteRelations)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD resultsRDD
.union(projectsRDD) .union(projectsRDD)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }
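Both hunks above switch the action-set output from GzipCodec to BZip2Codec. In Hadoop, bzip2-compressed files are splittable while gzip files are not, so downstream jobs can process a single large sequence file in parallel. A hedged sketch of the call shape (the pair RDD and output path are placeholders):

// Sketch, not the project's actual job: write a pair RDD as a sequence file
// compressed with the splittable bzip2 codec.
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD

def save(pairs: RDD[(Text, Text)], outputPath: String): Unit =
  pairs.saveAsHadoopFile(
    outputPath,
    classOf[Text],
    classOf[Text],
    classOf[SequenceFileOutputFormat[Text, Text]],
    classOf[BZip2Codec])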

View File

@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
tp._1 match { tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2) case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2) case "print" => journal.setIssnPrinted(tp._2)
case _ =>
} }
}) })
} }
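The added case _ => gives the ISSN-type match a no-op default branch. A Scala match that receives a value not covered by any case throws scala.MatchError at runtime, so an unexpected ISSN type coming from Crossref would otherwise abort the whole mapping. A minimal illustration of the failure mode, independent of the Crossref model:

// Standalone sketch of why the default branch matters.
def handleIssnType(issnType: String): Unit = issnType match {
  case "electronic" => println("set online ISSN")
  case "print"      => println("set printed ISSN")
  // without the following line, handleIssnType("pu") throws scala.MatchError
  case _            => // ignore unknown types
}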

View File

@ -94,7 +94,8 @@ object MagUtility extends Serializable {
) )
di di
} }
val datatypedict = Map(
val datatypedict = Map(
"bool" -> BooleanType, "bool" -> BooleanType,
"int" -> IntegerType, "int" -> IntegerType,
"uint" -> IntegerType, "uint" -> IntegerType,
@ -505,8 +506,6 @@ val datatypedict = Map(
) )
) )
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}") result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")

View File

@ -35,8 +35,6 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = { def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
import spark.implicits._ import spark.implicits._
spark.read spark.read
.load(s"$magBasePath/mag_denormalized") .load(s"$magBasePath/mag_denormalized")
.as[MAGPaper] .as[MAGPaper]

View File

@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.io.InputStream import java.io.{ByteArrayInputStream, InputStream}
import scala.io.Source import java.nio.charset.Charset
import scala.xml.pull.XMLEventReader import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} else } else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch { } catch {
case e: Throwable => case e: Throwable =>
println(s"Error on requesting ${r.getURI}") println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString( IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream( SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
) ),
Charset.defaultCharset()
) )
) )
parser.parseArgument(args) parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath) log.info("workingPath: {}", workingPath)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion") val targetPath = parser.get("targetPath")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion) log.info("targetPath: {}", targetPath)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val hdfsServerUri = parser.get("hdfsServerUri") val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", hdfsServerUri) log.info("hdfsServerUri: {}", targetPath)
val skipUpdate = parser.get("skipUpdate") val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate) log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) { if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset( val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz")) k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => { .flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
}) })
) )
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies)) .map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf] .as[Oaf]
.filter(p => p != null), .filter(p => p != null),
s"$outputBasePath/$MDSTORE_DATA_PATH" targetPath
) )
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
} }
} }
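The changes above replace scala.xml.pull.XMLEventReader, which is gone from current scala-xml releases, with the StAX event reader from javax.xml.stream, and pass an explicit Charset to IOUtils.toString, whose charset-less overload is deprecated. A hedged sketch of the new parsing entry point (the XML payload is a placeholder):

// Sketch of the javax.xml.stream replacement for scala.xml.pull.XMLEventReader.
import java.io.ByteArrayInputStream
import javax.xml.stream.XMLInputFactory

val xmlPayload = "<PubmedArticleSet></PubmedArticleSet>" // placeholder record
val factory = XMLInputFactory.newInstance
val events = factory.createXMLEventReader(new ByteArrayInputStream(xmlPayload.getBytes))
while (events.hasNext) {
  val event = events.nextEvent() // javax.xml.stream.events.XMLEvent
  // PMParser consumes these events to build PMArticle records
}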

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} import javax.xml.stream.XMLEventReader
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml /** @param xml
*/ */

View File

@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -119,7 +119,9 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI", workingDir.toString() + "/COCI",
"-outputPath", "-outputPath",
workingDir.toString() + "/COCI_json/", workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4;input5" "-inputFile", "input1;input2;input3;input4;input5",
"-format",
"COCI"
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -789,10 +789,6 @@
"value": "2227-9717", "value": "2227-9717",
"type": "electronic" "type": "electronic"
}, },
{
"value": "VALUE",
"type": "PIPPO"
},
{ {
"value": "1063-4584", "value": "1063-4584",
"type": "pu" "type": "pu"

View File

@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import org.junit.jupiter.api.BeforeEach import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
import org.apache.commons.io.IOUtils
import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
super.setUpVocabulary() super.setUpVocabulary()
} }
@Test
def mappingRecord(): Unit = {
val input =
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
}
} }

View File

@ -7,13 +7,10 @@ import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
class MAGMappingTest { class MAGMappingTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
def mappingTest(): Unit = { def mappingTest(): Unit = {
val spark = SparkSession val spark = SparkSession
@ -26,8 +23,6 @@ class MAGMappingTest {
s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF") s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
} }
@Test @Test
def mappingMagType(): Unit = { def mappingMagType(): Unit = {

View File

@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer import scala.collection.mutable.ListBuffer
import scala.io.Source import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputXML = Source val inputFactory = XMLInputFactory.newInstance
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
} }
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testParsingPubmedXML(): Unit = { def testParsingPubmedXML(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml) val parser = new PMParser(xml)
parser.foreach(checkPMArticle) parser.foreach(checkPMArticle)
} }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testPubmedMapping(): Unit = { def testPubmedMapping(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val parser = new PMParser(xml) val parser = new PMParser(xml)
val results = ListBuffer[Oaf]() val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))

View File

@ -53,24 +53,10 @@
<artifactId>dhp-pace-core</artifactId> <artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
<version>2.11.0</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
@ -79,16 +65,10 @@
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId> <artifactId>spark-graphx_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
<artifactId>dom4j</artifactId> <artifactId>dom4j</artifactId>
@ -101,10 +81,6 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>

View File

@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple3; import scala.Tuple3;
import scala.collection.JavaConversions; import scala.collection.JavaConversions;
@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
Dataset<Row> pivotHistory = spark Dataset<Row> pivotHistory = spark
.createDataset( .createDataset(
Collections.emptyList(), Collections.emptyList(),
RowEncoder SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
if (StringUtils.isNotBlank(pivotHistoryDatabase)) { if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
pivotHistory = spark pivotHistory = spark

View File

@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils; import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple2; import scala.Tuple2;
import scala.Tuple3; import scala.Tuple3;
@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
StructType idsSchema = StructType StructType idsSchema = StructType
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>"); .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema)); Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) { for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
String entityPath = graphBasePath + '/' + entityType.name(); String entityPath = graphBasePath + '/' + entityType.name();

View File

@ -61,7 +61,8 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
subject.getQualifier().setClassname(vocabulary.getName()); subject.getQualifier().setClassname(vocabulary.getName());
} }
} else { } else {
final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo()) final String provenanceActionClassId = Optional
.ofNullable(subject.getDataInfo())
.map(DataInfo::getProvenanceaction) .map(DataInfo::getProvenanceaction)
.map(Qualifier::getClassid) .map(Qualifier::getClassid)
.orElse(null); .orElse(null);

View File

@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
]

View File

@ -0,0 +1,166 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
}
}

View File

@ -0,0 +1,258 @@
package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix.{
Scholix,
ScholixCollectedFrom,
ScholixEntityId,
ScholixIdentifier,
ScholixRelationship,
ScholixResource
}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
case class RelationInfo(
source: String,
target: String,
relclass: String,
id: String,
collectedfrom: Seq[RelKeyValue]
) {}
case class RelKeyValue(key: String, value: String) {}
object ScholexplorerUtils {
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
val mapper = new ObjectMapper()
case class RelationVocabulary(original: String, inverse: String) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def generateDatasourceOpenAIREURLS(id: String): String = {
if (id != null && id.length > 12)
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
else
null
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
def generateScholixResourceFromResult(result: Result): ScholixResource = {
if (result.getInstance() == null || result.getInstance().size() == 0)
return null
if (result.getPid == null || result.getPid.isEmpty)
return null
val r = new ScholixResource
r.setDnetIdentifier(result.getId)
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
if (persistentIdentifiers.isEmpty)
return null
r.setIdentifier(persistentIdentifiers.asJava)
r.setObjectType(result.getResulttype.getClassid)
r.setObjectSubType(
result
.getInstance()
.asScala
.filter(i => i != null && i.getInstancetype != null)
.map(i => i.getInstancetype.getClassname)
.distinct
.head
)
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
if (titles.nonEmpty)
r.setTitle(titles.head)
else
return null
}
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
val authors: List[ScholixEntityId] =
result.getAuthor.asScala
.map(a => {
val entity = new ScholixEntityId()
entity.setName(a.getFullname)
if (a.getPid != null && a.getPid.size() > 0)
entity.setIdentifiers(
a.getPid.asScala
.map(sp => {
val id = new ScholixIdentifier()
id.setIdentifier(sp.getValue)
id.setSchema(sp.getQualifier.getClassid)
id
})
.take(3)
.toList
.asJava
)
entity
})
.toList
if (authors.nonEmpty)
r.setCreator(authors.asJava)
}
val dt: List[String] = result
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
r.setPublicationDate(dt.distinct.head)
r.setPublisher(
result
.getInstance()
.asScala
.map(i => i.getHostedby)
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
.map(h => {
val eid = new ScholixEntityId()
eid.setName(h.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(h.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
eid.setIdentifiers(List(id).asJava)
eid
})
.distinct
.asJava
)
r.setCollectedFrom(
result.getCollectedfrom.asScala
.map(cf => {
val scf = new ScholixCollectedFrom()
scf.setProvisionMode("collected")
scf.setCompletionStatus("complete")
val eid = new ScholixEntityId()
eid.setName(cf.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(cf.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
eid.setIdentifiers(List(id).asJava)
scf.setProvider(eid)
scf
})
.asJava
)
r
}
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
val s: Scholix = new Scholix
s.setSource(source)
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
s.setLinkprovider(
relation.collectedfrom
.map(cf => {
val eid = new ScholixEntityId()
eid.setName(cf.value)
val id = new ScholixIdentifier()
id.setIdentifier(cf.key)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
eid.setIdentifiers(List(id).asJava)
eid
})
.toList
.asJava
)
else {
val eid = new ScholixEntityId()
eid.setName("OpenAIRE")
val id = new ScholixIdentifier()
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
eid.setIdentifiers(List(id).asJava)
s.setLinkprovider(List(eid).asJava)
}
s.setIdentifier(relation.id)
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setPublicationDate(source.getPublicationDate)
s.setPublisher(source.getPublisher)
val mockTarget = new ScholixResource
mockTarget.setDnetIdentifier(relation.target)
s.setTarget(mockTarget)
s
}
def updateTarget(s: Scholix, t: ScholixResource): String = {
s.setTarget(t)
val spublishers: Seq[ScholixEntityId] =
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
val tpublishers: Seq[ScholixEntityId] =
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
s.setPublisher(mergedPublishers.asJava)
mapper.writeValueAsString(s)
}
}
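Taken together, the helpers above let a caller build a Scholix link from a RelationInfo plus the source and target ScholixResource objects, with updateTarget completing the link and serialising it to JSON. A hedged, abbreviated sketch; the identifiers are invented and the resources would normally come from generateScholixResourceFromResult:

// Minimal stand-ins for resources normally produced from graph Results.
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.{RelKeyValue, RelationInfo, ScholexplorerUtils}

val source = new ScholixResource
source.setDnetIdentifier("50|fake_source_id")
val target = new ScholixResource
target.setDnetIdentifier("50|fake_target_id")

val rel = RelationInfo(
  source = "50|fake_source_id",
  target = "50|fake_target_id",
  relclass = "IsSupplementedBy",
  id = "fake_rel_id",
  collectedfrom = Seq(RelKeyValue("10|fake_datasource_id", "Fake Datasource"))
)

val oneVerse = ScholexplorerUtils.generateScholix(rel, source)   // target is only a stub here
val json = ScholexplorerUtils.updateTarget(oneVerse, target)     // serialised Scholix record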

View File

@ -0,0 +1,141 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{
KeyValue,
OtherResearchProduct,
Publication,
Relation,
Result,
Software,
Dataset => OafDataset
}
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
/** Here all the spark applications run this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
generateBidirectionalRelations(sourcePath, targetPath, spark)
generateScholixResource(sourcePath, targetPath, spark)
generateScholix(targetPath, spark)
}
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val entityMap: Map[String, StructType] = Map(
"publication" -> Encoders.bean(classOf[Publication]).schema,
"dataset" -> Encoders.bean(classOf[OafDataset]).schema,
"software" -> Encoders.bean(classOf[Software]).schema,
"otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
)
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
val resDs = spark.emptyDataset[ScholixResource]
val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
println(s"adding ${item._1}")
res.union(
spark.read
.schema(item._2)
.json(s"$inputPath/${item._1}")
.as[Result]
.map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
.filter(s => s != null)
)
})
scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
}
def generateBidirectionalRelations(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val relSchema = Encoders.bean(classOf[Relation]).schema
val relDF = spark.read
.schema(relSchema)
.json(s"$inputPath/relation")
.where(
"datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
"and relClass <> 'merges' and relClass <> 'isMergedIn'"
)
.select("source", "target", "collectedfrom", "relClass")
def invRel: String => String = { s =>
ScholexplorerUtils.invRel(s)
}
import org.apache.spark.sql.functions.udf
val inverseRelationUDF = udf(invRel)
val inverseRelation = relDF.select(
col("target").alias("source"),
col("source").alias("target"),
col("collectedfrom"),
inverseRelationUDF(col("relClass")).alias("relClass")
)
val bidRel = inverseRelation
.union(relDF)
.withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
.drop("collectedfrom")
.withColumnRenamed("cf", "collectedfrom")
.groupBy(col("id"))
.agg(
first("source").alias("source"),
first("target").alias("target"),
first("relClass").alias("relClass"),
first("collectedfrom").alias("collectedfrom")
)
bidRel.write.mode(SaveMode.Overwrite).save(s"$outputPath/relation")
}
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
import spark.implicits._
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
val scholix_one_verse = relations
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
val resourceTarget = relations
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
scholix_one_verse
.joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
.map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$outputPath/scholix")
}
}
object SparkCreateScholexplorerDump {
val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
def main(args: Array[String]): Unit = {
new SparkCreateScholexplorerDump(
log = logger,
args = args,
propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
).initialize().run()
}
}
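The companion object wires the job to the create_scholix_dump_params.json file added earlier in this commit, so the dump can also be launched programmatically. A hypothetical local invocation, assuming the single-dash long-name argument style used by the tests in this commit; the paths are placeholders:

// Hypothetical run: master is optional ("paramRequired": false), source and target are required.
SparkCreateScholexplorerDump.main(
  Array(
    "-master", "local[*]",
    "-sourcePath", "/tmp/openaire_graph",     // input graph dump (placeholder)
    "-targetPath", "/tmp/scholexplorer_dump"  // where resource/, relation/ and scholix/ are written
  )
)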

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test
import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest {
@Test
def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
val app = new SparkCreateScholexplorerDump(null, null, null)
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
// app.generateBidirectionalRelations(
// "/home/sandro/Downloads/scholix_sample/",
// "/home/sandro/Downloads/scholix/",
// spark
// )
app.generateScholix("/home/sandro/Downloads/scholix/", spark)
}
}

View File

@ -18,7 +18,7 @@
<executions> <executions>
<execution> <execution>
<id>scala-compile-first</id> <id>scala-compile-first</id>
<phase>initialize</phase> <phase>process-resources</phase>
<goals> <goals>
<goal>add-source</goal> <goal>add-source</goal>
<goal>compile</goal> <goal>compile</goal>
@ -59,12 +59,6 @@
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
@ -160,6 +154,26 @@
<groupId>org.apache.zookeeper</groupId> <groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId> <artifactId>zookeeper</artifactId>
</exclusion> </exclusion>
<exclusion>
<artifactId>ant</artifactId>
<groupId>org.apache.ant</groupId>
</exclusion>
<exclusion>
<artifactId>antlr4-runtime</artifactId>
<groupId>org.antlr</groupId>
</exclusion>
<exclusion>
<artifactId>woodstox-core</artifactId>
<groupId>com.fasterxml.woodstox</groupId>
</exclusion>
<exclusion>
<artifactId>log4j</artifactId>
<groupId>*</groupId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency> <dependency>
@ -206,5 +220,90 @@
</dependencies> </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-3</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-4</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-4</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project> </project>

View File

@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper; import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -129,7 +130,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
.javaRDD() .javaRDD()
.map( .map(
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson())); t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
} }
} }


@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javax.swing.text.html.Option;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.stringtemplate.v4.ST; import org.stringtemplate.v4.ST;


@ -0,0 +1,12 @@
package eu.dnetlib.dhp.sparksolr;
import com.lucidworks.spark.util.SolrSupport;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.rdd.RDD;
public class DHPSolrSupport {
static public void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
SolrSupport.indexDocs(zkhost, collection, batchSize, docs);
}
}


@ -0,0 +1,12 @@
package eu.dnetlib.dhp.sparksolr;
import com.lucidworks.spark.util.SolrSupport;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.rdd.RDD;
public class DHPSolrSupport {
static public void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs);
}
}
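
Note: the two DHPSolrSupport copies above differ only in the indexDocs call, where the sparksolr-4 version passes an extra com.lucidworks.spark.BatchSizeType.NUM_DOCS argument, presumably required by the newer spark-solr API. The spark-24/spark-34/spark-35 profiles introduced earlier use build-helper-maven-plugin to add either src/main/sparksolr-3 or src/main/sparksolr-4 to the compiled sources, so only one copy ends up in the module. A minimal sketch of selecting a non-default profile at build time, assuming the usual Maven activation flag (spark-24 is marked activeByDefault above):

    # compile the sparksolr-4 sources, i.e. the variant that passes BatchSizeType
    mvn clean package -Pspark-34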


@ -16,11 +16,11 @@
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>


@ -10,11 +10,11 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>


@ -8,6 +8,8 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
@ -30,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' function print_elapsed_time()
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' {
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' start_time=$1
end_time=$(date +%s)
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' elapsed_time=$(($end_time-$start_time))
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' hours=$((elapsed_time / 3600))
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -67,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -91,8 +96,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -109,17 +116,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -127,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -176,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -204,11 +214,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }


@ -10,11 +10,11 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>


@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' function print_elapsed_time()
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' {
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' start_time=$1
end_time=$(date +%s)
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' elapsed_time=$(($end_time-$start_time))
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' hours=$((elapsed_time / 3600))
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -66,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -90,8 +96,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -108,17 +116,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -126,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -175,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -203,11 +214,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }


@ -10,11 +10,11 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>


@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' function print_elapsed_time()
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' {
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' start_time=$1
end_time=$(date +%s)
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' elapsed_time=$(($end_time-$start_time))
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' hours=$((elapsed_time / 3600))
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -66,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -90,8 +96,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -108,17 +116,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -126,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -175,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -203,11 +214,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }


@ -6,6 +6,8 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi fi
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
export HADOOP_USER_NAME=$6 export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
function print_elapsed_time()
{
start_time=$1
end_time=$(date +%s)
elapsed_time=$(($end_time-$start_time))
hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -68,8 +72,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -92,8 +98,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -110,17 +118,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -128,12 +132,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -177,7 +186,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -205,11 +216,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }
STATS_DB=$1 STATS_DB=$1


@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
with with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
from lvl1 from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
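
Note on the result_fos change above: the LIKE patterns select topics whose codes are 2, 4, 6 and now 8 characters long, so the added lvl4 CTE and the extra substr join simply extend the Fields of Science hierarchy one level deeper (e.g. a hypothetical '01010101 ...' topic would join under '010101 ...', which joins under '0101 ...' and '01 ...'), assuming the topic labels keep that numeric-prefix format.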


@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
.map( .map(
(MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class), (MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class),
Encoders.bean(Software.class)) Encoders.bean(Software.class))
.filter(t -> t.getCodeRepositoryUrl() != null) .filter((FilterFunction<Software>) t -> t.getCodeRepositoryUrl() != null)
.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl")); .select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
} }


@ -39,8 +39,8 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version> <cdh.hive.version>1.1.0-cdh5.16.2</cdh.hive.version>
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version> <cdh.hadoop.version>2.6.0-cdh5.16.2</cdh.hadoop.version>
</properties> </properties>
<dependencies> <dependencies>
@ -72,6 +72,12 @@
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
<version>${cdh.hadoop.version}</version> <version>${cdh.hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>


@ -39,8 +39,8 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version> <cdh.hive.version>1.1.0-cdh5.16.2</cdh.hive.version>
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version> <cdh.hadoop.version>2.6.0-cdh5.16.2</cdh.hadoop.version>
</properties> </properties>
<dependencies> <dependencies>
@ -67,11 +67,23 @@
<groupId>org.apache.hive</groupId> <groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId> <artifactId>hive-jdbc</artifactId>
<version>${cdh.hive.version}</version> <version>${cdh.hive.version}</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
<version>${cdh.hadoop.version}</version> <version>${cdh.hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
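
The jdk.tools exclusions added in the two stats poms above are the usual workaround for the CDH hadoop/hive client artifacts, which pull in a system-scoped jdk.tools dependency pointing at the JDK's tools.jar; that jar was removed in Java 9+, so builds on newer JDKs cannot resolve it unless the dependency is excluded.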

pom.xml

@ -13,7 +13,8 @@
<distribution>repo</distribution> <distribution>repo</distribution>
<comments>This program is free software: you can redistribute it and/or modify it under the terms of the <comments>This program is free software: you can redistribute it and/or modify it under the terms of the
GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.</comments> License, or (at your option) any later version.
</comments>
</license> </license>
</licenses> </licenses>
@ -22,6 +23,7 @@
<module>dhp-pace-core</module> <module>dhp-pace-core</module>
<module>dhp-common</module> <module>dhp-common</module>
<module>dhp-workflows</module> <module>dhp-workflows</module>
<module>dhp-shade-package</module>
</modules> </modules>
<issueManagement> <issueManagement>
@ -47,6 +49,19 @@
</pluginRepositories> </pluginRepositories>
<repositories> <repositories>
<repository>
<id>Openaire-third-parties-snaphot</id>
<name>Openaire third parties Snapshot</name>
<url>https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository> <repository>
<id>dnet45-releases</id> <id>dnet45-releases</id>
<name>D-Net 45 releases</name> <name>D-Net 45 releases</name>
@ -125,6 +140,13 @@
</repositories> </repositories>
<dependencies> <dependencies>
<!-- Quick FIX not to remove lombok everywhere -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>
<dependency> <dependency>
<groupId>org.junit.jupiter</groupId> <groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId> <artifactId>junit-jupiter</artifactId>
@ -152,7 +174,7 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>${dhp-schemas.artifact}</artifactId> <artifactId>dhp-schemas</artifactId>
<version>${dhp-schemas.version}</version> <version>${dhp-schemas.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@ -206,33 +228,76 @@
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId> <artifactId>slf4j-api</artifactId>
<version>1.7.25</version> <version>${org.slf4j.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${org.slf4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>${org.slf4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<!-- API bridge between log4j 1 and 2 -->
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-beanutils</artifactId>
<version>${commons-beanutils.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-validator</groupId> <groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId> <artifactId>commons-validator</artifactId>
<version>1.7</version> <version>${commons-validator.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
<version>1.0.7</version> <version>${dateparser.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>me.xuender</groupId> <groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId> <artifactId>unidecode</artifactId>
<version>0.0.7</version> <version>${unidecode.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@ -245,13 +310,13 @@
<dependency> <dependency>
<groupId>commons-codec</groupId> <groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId> <artifactId>commons-codec</artifactId>
<version>1.9</version> <version>${commons-codec.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
<version>2.4</version> <version>${commons-io.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@@ -345,7 +410,7 @@
 			<dependency>
 				<groupId>org.apache.zookeeper</groupId>
 				<artifactId>zookeeper</artifactId>
-				<version>3.4.11</version>
+				<version>${zookeeper.version}</version>
 			</dependency>
 			<dependency>
@@ -415,6 +480,7 @@
 				<artifactId>cxf-rt-transports-http</artifactId>
 				<version>3.1.5</version>
 			</dependency>
 			<dependency>
 				<groupId>javax.persistence</groupId>
 				<artifactId>javax.persistence-api</artifactId>
@@ -504,16 +570,11 @@
 				<artifactId>commons-compress</artifactId>
 				<version>${common.compress.version}</version>
 			</dependency>
-			<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
 			<dependency>
 				<groupId>org.apache.commons</groupId>
 				<artifactId>commons-csv</artifactId>
 				<version>${common.csv.version}</version>
 			</dependency>
-			<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
 			<dependency>
 				<groupId>org.apache.poi</groupId>
 				<artifactId>poi-ooxml</artifactId>
@@ -568,14 +629,12 @@
 				<scope>provided</scope>
 			</dependency>
 			<dependency>
 				<groupId>org.apache.commons</groupId>
 				<artifactId>commons-math3</artifactId>
 				<version>3.6.1</version>
 			</dependency>
 			<dependency>
 				<groupId>com.google.code.gson</groupId>
 				<artifactId>gson</artifactId>
@@ -596,7 +655,7 @@
 			<dependency>
 				<groupId>org.reflections</groupId>
 				<artifactId>reflections</artifactId>
-				<version>0.9.10</version>
+				<version>${reflections.version}</version>
 			</dependency>
 			<dependency>
@@ -610,6 +669,12 @@
 				<artifactId>icu4j</artifactId>
 				<version>70.1</version>
 			</dependency>
+			<dependency>
+				<groupId>org.javassist</groupId>
+				<artifactId>javassist</artifactId>
+				<version>${javassist.version}</version>
+			</dependency>
 		</dependencies>
 	</dependencyManagement>
@@ -677,6 +742,7 @@
 				<version>3.0.0-M4</version>
 				<configuration>
 					<redirectTestOutputToFile>true</redirectTestOutputToFile>
+					<trimStackTrace>false</trimStackTrace>
 				</configuration>
 			</plugin>
 			<plugin>
@@ -746,7 +812,7 @@
 			<plugin>
 				<groupId>net.revelc.code</groupId>
 				<artifactId>impsort-maven-plugin</artifactId>
-				<version>1.4.1</version>
+				<version>1.6.2</version>
 				<configuration>
 					<groups>java.,javax.,org.,com.</groups>
 					<staticGroups>java,*</staticGroups>
@@ -767,7 +833,9 @@
 				<groupId>org.antipathy</groupId>
 				<artifactId>mvn-scalafmt_${scala.binary.version}</artifactId>
 				<configuration>
-					<configLocation>https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
+					<configLocation>
+						https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
+					</configLocation>
 					<skipTestSources>false</skipTestSources>
 					<skipSources>false</skipSources>
 					<sourceDirectories>
@@ -798,7 +866,7 @@
 			<plugin>
 				<groupId>org.jacoco</groupId>
 				<artifactId>jacoco-maven-plugin</artifactId>
-				<version>0.7.9</version>
+				<version>0.8.10</version>
 				<configuration>
 					<excludes>
 						<exclude>**/schemas/*</exclude>
@@ -866,90 +934,174 @@
 		<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-		<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
 		<maven.compiler.source>1.8</maven.compiler.source>
 		<maven.compiler.target>1.8</maven.compiler.target>
-		<maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
-		<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
-		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
-		<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
-		<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
-		<dhp-schemas.artifact>dhp-schemas</dhp-schemas.artifact>
-		<sparksolr.version>3.6.0</sparksolr.version>
-		<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
-		<dhp.jackson.version>2.9.6</dhp.jackson.version>
-		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
-		<dhp.site.skip>true</dhp.site.skip>
-		<dhp.guava.version>11.0.2</dhp.guava.version>
+		<!-- scala version -->
 		<scala.version>2.11.12</scala.version>
 		<scala.binary.version>2.11</scala.binary.version>
-		<scala-xml.version>1.3.0</scala-xml.version>
-		<junit-jupiter.version>5.6.1</junit-jupiter.version>
-		<mockito-core.version>3.3.3</mockito-core.version>
-		<mongodb.driver.version>3.4.2</mongodb.driver.version>
-		<vtd.version>[2.12,3.0)</vtd.version>
+		<!-- plugin versions -->
+		<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
+		<maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
+		<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
+		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
+		<!-- dependency versions -->
+		<apache.poi.version>4.1.2</apache.poi.version>
+		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
+		<common.compress.version>1.20</common.compress.version>
+		<common.csv.version>1.8</common.csv.version>
+		<common.text.version>1.8</common.text.version>
+		<commons-beanutils.version>1.9.4</commons-beanutils.version>
+		<commons-codec.version>1.9</commons-codec.version>
+		<commons.collections.version>3.2.1</commons.collections.version>
+		<commons-io.version>2.4</commons-io.version>
+		<commons.logging.version>1.1.3</commons.logging.version>
+		<commons-validator.version>1.7</commons-validator.version>
+		<dateparser.version>1.0.7</dateparser.version>
 		<dhp-schemas.version>[6.1.2]</dhp-schemas.version>
+		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
+		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
+		<dhp.guava.version>11.0.2</dhp.guava.version>
+		<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
+		<dhp.jackson.version>2.9.6</dhp.jackson.version>
+		<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
+		<dhp.site.skip>true</dhp.site.skip>
+		<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
 		<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
-		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
-		<solr.version>7.5.0</solr.version>
-		<okhttp.version>4.7.2</okhttp.version>
-		<common.compress.version>1.20</common.compress.version>
+		<google.gson.version>2.2.2</google.gson.version>
+		<log4j.version>1.2.17</log4j.version>
+		<javassist.version>3.19.0-GA</javassist.version>
 		<json4s.version>3.5.3</json4s.version>
 		<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
-		<common.csv.version>1.8</common.csv.version>
-		<apache.poi.version>4.1.2</apache.poi.version>
-		<common.text.version>1.8</common.text.version>
+		<junit-jupiter.version>5.6.1</junit-jupiter.version>
+		<mockito-core.version>3.3.3</mockito-core.version>
+		<mongodb.driver.version>3.4.2</mongodb.driver.version>
+		<okhttp.version>4.7.2</okhttp.version>
 		<org.apache.httpcomponents.version>4.5.3</org.apache.httpcomponents.version>
-		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
-		<google.gson.version>2.2.2</google.gson.version>
-		<commons.logging.version>1.1.3</commons.logging.version>
-		<commons.collections.version>3.2.1</commons.collections.version>
+		<org.slf4j.version>1.7.25</org.slf4j.version>
+		<reflections.version>0.9.10</reflections.version>
+		<scala-xml.version>1.3.0</scala-xml.version>
+		<solr.version>7.5.0</solr.version>
+		<sparksolr.version>3.6.0</sparksolr.version>
+		<unidecode.version>0.0.7</unidecode.version>
+		<vtd.version>[2.12,3.0)</vtd.version>
+		<zookeeper.version>3.4.6</zookeeper.version>
 	</properties>
 	<!-- Build with scala 12 and Spark 3.4 -->
 	<profiles>
 		<profile>
-			<id>scala-2.12</id>
+			<id>spark-34</id>
 			<properties>
 				<scala.binary.version>2.12</scala.binary.version>
 				<scala.version>2.12.18</scala.version>
-				<!-- scala-xml.version>2.1.0</scala-xml.version -->
-				<sparksolr.version>4.0.2</sparksolr.version>
-				<dhp.spark.version>3.4.1</dhp.spark.version>
-				<dhp.jackson.version>2.14.2</dhp.jackson.version>
-				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
-				<json4s.version>3.7.0-M11</json4s.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+				<!-- plugin versions -->
 				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
-				<!--
-				<dhp-schemas.artifact>dhp-schemas_${scala.binary.version}</dhp-schemas.artifact>
-				<dhp-schemas.version>3.17.2-SNAPSHOT</dhp-schemas.version>
-				-->
+				<!-- dependencies -->
+				<common.compress.version>1.22</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.15</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.11.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.4.2.openaire</dhp.spark.version>
+				<dhp.jackson.version>2.14.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.19.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.6</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
 			</properties>
 		</profile>
-		<!-- Activate ARM-compatible snappy dependency on new Silicon Macs -->
 		<profile>
-			<id>arm-silicon-mac</id>
+			<id>spark-35</id>
+			<properties>
+				<scala.binary.version>2.12</scala.binary.version>
+				<scala.version>2.12.18</scala.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+				<!-- plugin versions -->
+				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
+				<!-- dependencies -->
+				<common.compress.version>1.23.0</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.16.0</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.13.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.5.1.openaire-SNAPSHOT</dhp.spark.version>
+				<dhp.jackson.version>2.15.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.20.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.7</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
+			</properties>
+		</profile>
+		<profile>
+			<id>java11</id>
 			<activation>
-				<os>
-					<arch>aarch64</arch>
-					<family>mac</family>
-				</os>
+				<jdk>[11</jdk>
 			</activation>
-			<dependencyManagement>
-				<dependencies>
-					<dependency>
-						<groupId>org.xerial.snappy</groupId>
-						<artifactId>snappy-java</artifactId>
-						<version>1.1.8.4</version>
-					</dependency>
-				</dependencies>
-			</dependencyManagement>
+			<build>
+				<pluginManagement>
+					<plugins>
+						<plugin>
+							<groupId>org.apache.maven.plugins</groupId>
+							<artifactId>maven-surefire-plugin</artifactId>
+							<version>3.0.0-M4</version>
+							<configuration>
+								<!-- only for java 11+ to run spark in tests -->
+								<argLine>--add-opens=java.base/java.lang=ALL-UNNAMED
+									--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
+									--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
+									--add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED
+									--add-opens=java.base/java.nio=ALL-UNNAMED
+									--add-opens=java.base/java.util=ALL-UNNAMED
+									--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
+									--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
+									--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+									--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
+									--add-opens=java.base/sun.security.action=ALL-UNNAMED
+									--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
+								</argLine>
+								<redirectTestOutputToFile>true</redirectTestOutputToFile>
+								<trimStackTrace>false</trimStackTrace>
+							</configuration>
+						</plugin>
+					</plugins>
+				</pluginManagement>
+			</build>
 		</profile>
 	</profiles>
 </project>
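Editorial note on the java11 profile above: the long surefire argLine exists because Spark (and some test utilities) reach into JDK internals reflectively, and the Java module system blocks that access unless the relevant java.base packages are opened to the unnamed module. A rough, hypothetical illustration of the mechanism — not code from this repository; on JDK 16+ the call below fails without the matching flag, while JDK 9–15 only print an illegal-access warning by default:

import java.lang.reflect.Field;
import java.nio.ByteBuffer;

// Hypothetical demo: reflective access into java.nio internals of the kind Spark
// performs. On JDK 16+ setAccessible(true) throws
// java.lang.reflect.InaccessibleObjectException unless the JVM is started with
// --add-opens=java.base/java.nio=ALL-UNNAMED (one of the flags in the argLine above).
public class AddOpensDemo {

	public static void main(String[] args) throws Exception {
		ByteBuffer direct = ByteBuffer.allocateDirect(16);
		Field cleaner = direct.getClass().getDeclaredField("cleaner");
		cleaner.setAccessible(true); // blocked without the matching --add-opens
		System.out.println("cleaner = " + cleaner.get(direct));
	}
}

Passing the same --add-opens flags through the surefire argLine applies them to the forked test JVMs, which is what lets the Spark-based tests run on Java 11 and later.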