Merge pull request '[graph indexing] sets spark memoryOverhead in the join operations to the same value used for the memory executor' (#426) from provision_memoryOverhead into master

Reviewed-on: #426
[graph indexing] sets spark memoryOverhead in the join operations to the same value used for the memory executor
2024-04-19 16:59:45 +02:00 · 2024-04-19 16:57:55 +02:00 · 2024-04-18 11:25:24 +02:00 · 2024-04-18 11:23:43 +02:00 · 2024-04-17 16:40:29 +02:00 · 2024-04-17 15:13:28 +02:00
91 changed files with 5624 additions and 896 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@ -312,7 +312,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 		}

 		if (value instanceof Datasource) {
-			// nothing to evaluate here
+			final Datasource d = (Datasource) value;
+			return Objects.nonNull(d.getOfficialname()) && StringUtils.isNotBlank(d.getOfficialname().getValue());
 		} else if (value instanceof Project) {
 			final Project p = (Project) value;
 			return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue());
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteAction.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteAction.java
@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2024.
+ * SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package eu.dnetlib.dhp.actionmanager.promote;
+
+/** Encodes the Actionset promotion strategies. */
+public class PromoteAction {
+
+	/**
+	 * The supported actionset promotion strategies.
+	 *
+	 * ENRICH: promotes only records in the actionset matching another record in the
+	 *  graph and enriches them applying the given MergeAndGet strategy
+	 * UPSERT: promotes all the records in an actionset, matching records are updated
+	 *  using the given MergeAndGet strategy, the non-matching records are inserted as they are.
+	 */
+	public enum Strategy {
+		ENRICH, UPSERT
+	}
+
+	/**
+	 * Returns the string representation of the join type implementing the given PromoteAction.
+	 *
+	 * @param strategy the strategy to be used to promote the Actionset contents, must not be null
+	 * @return the join type used to implement the promotion strategy:
+	 *  "left_outer" for ENRICH, "full_outer" for UPSERT
+	 * @throws NullPointerException if the strategy is null
+	 * @throws IllegalStateException if the strategy is not one of the supported ones
+	 */
+	public static String joinTypeForStrategy(PromoteAction.Strategy strategy) {
+		// switching on null raises an NPE without any message: fail explicitly instead
+		if (strategy == null) {
+			throw new NullPointerException("the PromoteAction strategy must not be null");
+		}
+		switch (strategy) {
+			case ENRICH:
+				return "left_outer";
+			case UPSERT:
+				return "full_outer";
+			default:
+				throw new IllegalStateException("unsupported PromoteAction: " + strategy);
+		}
+	}
+}
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
@ -67,8 +67,9 @@ public class PromoteActionPayloadForGraphTableJob {
 		String outputGraphTablePath = parser.get("outputGraphTablePath");
 		logger.info("outputGraphTablePath: {}", outputGraphTablePath);

-		MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
-		logger.info("strategy: {}", strategy);
+		MergeAndGet.Strategy mergeAndGetStrategy = MergeAndGet.Strategy
+			.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
+		logger.info("mergeAndGetStrategy: {}", mergeAndGetStrategy);

 		Boolean shouldGroupById = Optional
 			.ofNullable(parser.get("shouldGroupById"))
@ -76,6 +77,12 @@ public class PromoteActionPayloadForGraphTableJob {
 			.orElse(true);
 		logger.info("shouldGroupById: {}", shouldGroupById);

+		PromoteAction.Strategy promoteActionStrategy = Optional
+			.ofNullable(parser.get("promoteActionStrategy"))
+			.map(PromoteAction.Strategy::valueOf)
+			.orElse(PromoteAction.Strategy.UPSERT);
+		logger.info("promoteActionStrategy: {}", promoteActionStrategy);
+
 		@SuppressWarnings("unchecked")
 		Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
 		@SuppressWarnings("unchecked")
@ -97,7 +104,8 @@ public class PromoteActionPayloadForGraphTableJob {
 					inputGraphTablePath,
 					inputActionPayloadPath,
 					outputGraphTablePath,
-					strategy,
+					mergeAndGetStrategy,
+					promoteActionStrategy,
 					rowClazz,
 					actionPayloadClazz,
 					shouldGroupById);
@ -124,14 +132,16 @@ public class PromoteActionPayloadForGraphTableJob {
 		String inputGraphTablePath,
 		String inputActionPayloadPath,
 		String outputGraphTablePath,
-		MergeAndGet.Strategy strategy,
+		MergeAndGet.Strategy mergeAndGetStrategy,
+		PromoteAction.Strategy promoteActionStrategy,
 		Class<G> rowClazz,
 		Class<A> actionPayloadClazz, Boolean shouldGroupById) {
 		Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
 		Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);

 		Dataset<G> result = promoteActionPayloadForGraphTable(
-			rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
+			rowDS, actionPayloadDS, mergeAndGetStrategy, promoteActionStrategy, rowClazz, actionPayloadClazz,
+			shouldGroupById)
 				.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));

 		saveGraphTable(result, outputGraphTablePath);
@ -183,7 +193,8 @@ public class PromoteActionPayloadForGraphTableJob {
 	private static <G extends Oaf, A extends Oaf> Dataset<G> promoteActionPayloadForGraphTable(
 		Dataset<G> rowDS,
 		Dataset<A> actionPayloadDS,
-		MergeAndGet.Strategy strategy,
+		MergeAndGet.Strategy mergeAndGetStrategy,
+		PromoteAction.Strategy promoteActionStrategy,
 		Class<G> rowClazz,
 		Class<A> actionPayloadClazz,
 		Boolean shouldGroupById) {
@ -195,8 +206,9 @@ public class PromoteActionPayloadForGraphTableJob {

 		SerializableSupplier<Function<G, String>> rowIdFn = ModelSupport::idFn;
 		SerializableSupplier<Function<A, String>> actionPayloadIdFn = ModelSupport::idFn;
-		SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
-		SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
+		SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet
+			.functionFor(mergeAndGetStrategy);
+		SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(mergeAndGetStrategy);
 		SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
 		SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;

@ -207,6 +219,7 @@ public class PromoteActionPayloadForGraphTableJob {
 				rowIdFn,
 				actionPayloadIdFn,
 				mergeRowWithActionPayloadAndGetFn,
+				promoteActionStrategy,
 				rowClazz,
 				actionPayloadClazz);

--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
@ -34,6 +34,7 @@ public class PromoteActionPayloadFunctions {
 	 * @param rowIdFn Function used to get the id of graph table row
 	 * @param actionPayloadIdFn Function used to get id of action payload instance
 	 * @param mergeAndGetFn Function used to merge graph table row and action payload instance
+	 * @param promoteActionStrategy the Actionset promotion strategy
 	 * @param rowClazz Class of graph table
 	 * @param actionPayloadClazz Class of action payload
 	 * @param <G> Type of graph table row
@ -46,6 +47,7 @@ public class PromoteActionPayloadFunctions {
 		SerializableSupplier<Function<G, String>> rowIdFn,
 		SerializableSupplier<Function<A, String>> actionPayloadIdFn,
 		SerializableSupplier<BiFunction<G, A, G>> mergeAndGetFn,
+		PromoteAction.Strategy promoteActionStrategy,
 		Class<G> rowClazz,
 		Class<A> actionPayloadClazz) {
 		if (!isSubClass(rowClazz, actionPayloadClazz)) {
@ -61,7 +63,7 @@ public class PromoteActionPayloadFunctions {
 			.joinWith(
 				actionPayloadWithIdDS,
 				rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")),
-				"full_outer")
+				PromoteAction.joinTypeForStrategy(promoteActionStrategy))
 			.map(
 				(MapFunction<Tuple2<Tuple2<String, G>, Tuple2<String, A>>, G>) value -> {
 					Optional<G> rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2);
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json
@ -41,6 +41,12 @@
    "paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
    "paramRequired": true
  },
+  {
+    "paramName": "pas",
+    "paramLongName": "promoteActionStrategy",
+    "paramDescription": "strategy for promoting the actionset contents into the graph tables, ENRICH or UPSERT (default)",
+    "paramRequired": false
+  },
  {
    "paramName": "sgid",
    "paramLongName": "shouldGroupById",
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml
@ -115,6 +115,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
@ -167,6 +168,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml
@ -106,6 +106,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/datasource</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml
@ -106,6 +106,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/organization</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml
@ -114,6 +114,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
@ -166,6 +167,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml
@ -106,6 +106,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/project</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml
@ -115,6 +115,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
@ -167,6 +168,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml
@ -107,6 +107,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/relation</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml
@ -114,6 +114,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
@ -166,6 +167,7 @@
            <arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
            <arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
            <arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
+            <arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
            <arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java
+++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java
@ -54,7 +54,7 @@ public class PromoteActionPayloadFunctionsTest {
 				RuntimeException.class,
 				() -> PromoteActionPayloadFunctions
 					.joinGraphTableWithActionPayloadAndMerge(
-						null, null, null, null, null, OafImplSubSub.class, OafImpl.class));
+						null, null, null, null, null, null, OafImplSubSub.class, OafImpl.class));
 		}

 		@Test
@ -104,6 +104,7 @@ public class PromoteActionPayloadFunctionsTest {
 					rowIdFn,
 					actionPayloadIdFn,
 					mergeAndGetFn,
+					PromoteAction.Strategy.UPSERT,
 					OafImplSubSub.class,
 					OafImplSubSub.class)
 				.collectAsList();
@ -183,6 +184,7 @@ public class PromoteActionPayloadFunctionsTest {
 					rowIdFn,
 					actionPayloadIdFn,
 					mergeAndGetFn,
+					PromoteAction.Strategy.UPSERT,
 					OafImplSubSub.class,
 					OafImplSub.class)
 				.collectAsList();
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java
@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
 import eu.dnetlib.dhp.aggregation.common.ReportingJob;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
 import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
 import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
 import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
@ -120,6 +121,8 @@ public class CollectorWorker extends ReportingJob {
 				return new FileCollectorPlugin(fileSystem);
 			case fileGzip:
 				return new FileGZipCollectorPlugin(fileSystem);
+			case baseDump:
+				return new BaseCollectorPlugin(this.fileSystem);
 			case other:
 				final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
 					.ofNullable(api.getParams().get("other_plugin_type"))
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java
@ -10,7 +10,8 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
 public interface CollectorPlugin {

 	enum NAME {
-		oai, other, rest_json2xml, file, fileGzip;
+
+		oai, other, rest_json2xml, file, fileGzip, baseDump;

 		public enum OTHER_NAME {
 			mdstore_mongodb_dump, mdstore_mongodb
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIterator.java
@ -0,0 +1,171 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Iterator;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLEventWriter;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLOutputFactory;
+import javax.xml.stream.events.EndElement;
+import javax.xml.stream.events.StartElement;
+import javax.xml.stream.events.XMLEvent;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+
+/**
+ * Iterator over the XML records found in a BASE dump (a TAR archive of compressed "ListRecords" files).
+ *
+ * A background thread reads the archive and pushes every serialized {@code record} element into a bounded
+ * queue; this iterator consumes the queue. The end of the stream is signalled by a marker element that is
+ * always enqueued when the producer terminates, also in case of failure (the failure is logged and stored
+ * in the AggregatorReport), so that the consumer never waits forever.
+ */
+public class BaseCollectorIterator implements Iterator<String> {
+
+	private String nextElement;
+
+	private final BlockingQueue<String> queue = new LinkedBlockingQueue<>(100);
+
+	private static final Logger log = LoggerFactory.getLogger(BaseCollectorIterator.class);
+
+	private static final String END_ELEM = "__END__";
+
+	/**
+	 * Starts reading the TAR archive stored at the given path of the given file system.
+	 *
+	 * @param fs the file system holding the dump
+	 * @param filePath the path of the TAR archive
+	 * @param report the report where processing errors are registered
+	 */
+	public BaseCollectorIterator(final FileSystem fs, final Path filePath, final AggregatorReport report) {
+		new Thread(() -> importHadoopFile(fs, filePath, report)).start();
+		this.nextElement = takeFromQueue();
+	}
+
+	/**
+	 * Starts reading the TAR archive available as a classpath resource.
+	 *
+	 * @param resourcePath the classpath location of the TAR archive
+	 * @param report the report where processing errors are registered
+	 */
+	protected BaseCollectorIterator(final String resourcePath, final AggregatorReport report) {
+		new Thread(() -> importTestFile(resourcePath, report)).start();
+		this.nextElement = takeFromQueue();
+	}
+
+	@Override
+	public synchronized boolean hasNext() {
+		return (this.nextElement != null) && !END_ELEM.equals(this.nextElement);
+	}
+
+	/**
+	 * Returns the current record and waits for the following one (or for the end marker).
+	 *
+	 * @throws java.util.NoSuchElementException if the iterator is exhausted
+	 */
+	@Override
+	public synchronized String next() {
+		// once the end marker has been read nothing else will ever be enqueued: waiting would hang forever
+		if (!hasNext()) {
+			throw new java.util.NoSuchElementException("no more BASE records");
+		}
+		final String current = this.nextElement;
+		this.nextElement = takeFromQueue();
+		return current;
+	}
+
+	/** Waits for the next element of the queue, preserving the interrupt status if the wait is interrupted. */
+	private String takeFromQueue() {
+		try {
+			return this.queue.take();
+		} catch (final InterruptedException e) {
+			Thread.currentThread().interrupt();
+			throw new RuntimeException(e);
+		}
+	}
+
+	/** Producer entry point: reads the TAR archive from the given file system. */
+	private void importHadoopFile(final FileSystem fs, final Path filePath, final AggregatorReport report) {
+		log.info("I start to read the TAR stream");
+
+		try (InputStream origInputStream = fs.open(filePath);
+			final TarArchiveInputStream tarInputStream = new TarArchiveInputStream(origInputStream)) {
+			importTarStream(tarInputStream);
+		} catch (final Throwable e) {
+			throw reportFailure(e, report);
+		} finally {
+			signalEnd();
+		}
+	}
+
+	/** Producer entry point: reads the TAR archive from a classpath resource. */
+	private void importTestFile(final String resourcePath, final AggregatorReport report) {
+		try (final InputStream origInputStream = BaseCollectorIterator.class.getResourceAsStream(resourcePath);
+			final TarArchiveInputStream tarInputStream = new TarArchiveInputStream(origInputStream)) {
+			importTarStream(tarInputStream);
+		} catch (final Throwable e) {
+			throw reportFailure(e, report);
+		} finally {
+			signalEnd();
+		}
+	}
+
+	/** Logs the failure, registers it in the report and returns the exception to be thrown by the producer. */
+	private RuntimeException reportFailure(final Throwable e, final AggregatorReport report) {
+		if (e instanceof InterruptedException) {
+			Thread.currentThread().interrupt();
+		}
+		log.error("Error processing BASE records", e);
+		report.put(e.getClass().getName(), e.getMessage());
+		return new RuntimeException("Error processing BASE records", e);
+	}
+
+	/** Enqueues the end marker; invoked when the producer terminates, successfully or not. */
+	private void signalEnd() {
+		// a pending interrupt would make put() fail immediately: clear it, deliver the marker, restore it
+		final boolean wasInterrupted = Thread.interrupted();
+		try {
+			this.queue.put(END_ELEM); // TO INDICATE THE END OF THE QUEUE
+		} catch (final InterruptedException e) {
+			Thread.currentThread().interrupt();
+			log.error("Unable to signal the end of the BASE records", e);
+		}
+		if (wasInterrupted) {
+			Thread.currentThread().interrupt();
+		}
+	}
+
+	/**
+	 * Scans the TAR entries and, for each file whose name contains "ListRecords" and ends with ".bz2",
+	 * decompresses it and enqueues every {@code record} element serialized as a string.
+	 */
+	private void importTarStream(final TarArchiveInputStream tarInputStream) throws Exception {
+		long count = 0;
+
+		final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
+		// the dump is produced by an external provider: external entities must never be resolved (XXE)
+		xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
+		final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
+
+		try {
+			TarArchiveEntry entry;
+			while ((entry = (TarArchiveEntry) tarInputStream.getNextEntry()) != null) {
+				final String name = entry.getName();
+
+				if (entry.isDirectory() || !name.contains("ListRecords") || !name.endsWith(".bz2")) {
+					continue;
+				}
+
+				log.info("Processing file (BZIP): {}", name);
+
+				// the entry is buffered in memory: refuse sizes that do not fit in a byte array
+				final long size = entry.getSize();
+				if (size > Integer.MAX_VALUE) {
+					throw new IllegalStateException(
+						"TAR entry too large to be buffered: " + name + " (" + size + " bytes)");
+				}
+				final byte[] bzipData = new byte[(int) size];
+				IOUtils.readFully(tarInputStream, bzipData);
+
+				try (InputStream bzipIs = new ByteArrayInputStream(bzipData);
+					final BufferedInputStream bzipBis = new BufferedInputStream(bzipIs);
+					final CompressorInputStream bzipInput = new CompressorStreamFactory()
+						.createCompressorInputStream(bzipBis)) {
+
+					final XMLEventReader reader = xmlInputFactory.createXMLEventReader(bzipInput);
+
+					XMLEventWriter eventWriter = null;
+					StringWriter xmlWriter = null;
+
+					while (reader.hasNext()) {
+						final XMLEvent nextEvent = reader.nextEvent();
+
+						if (nextEvent.isStartElement()) {
+							final StartElement startElement = nextEvent.asStartElement();
+							if ("record".equals(startElement.getName().getLocalPart())) {
+								xmlWriter = new StringWriter();
+								eventWriter = xmlOutputFactory.createXMLEventWriter(xmlWriter);
+							}
+						}
+
+						if (eventWriter != null) {
+							eventWriter.add(nextEvent);
+						}
+
+						// the writer is null outside a record: a stray closing tag is simply ignored
+						if ((eventWriter != null) && nextEvent.isEndElement()) {
+							final EndElement endElement = nextEvent.asEndElement();
+							if ("record".equals(endElement.getName().getLocalPart())) {
+								eventWriter.flush();
+								eventWriter.close();
+
+								this.queue.put(xmlWriter.toString());
+
+								eventWriter = null;
+								xmlWriter = null;
+								count++;
+							}
+						}
+
+					}
+				}
+			}
+		} finally {
+			log.info("Total records (written in queue): " + count);
+		}
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPlugin.java
@ -0,0 +1,159 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Optional;
+import java.util.Set;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.DocumentHelper;
+import org.dom4j.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.file.AbstractSplittedRecordPlugin;
+import eu.dnetlib.dhp.common.DbClient;
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+
+/**
+ * Collector plugin reading the records from a dump of BASE (a TAR archive stored on the given file system),
+ * keeping only the records of the accepted OpenDOAR datasources and, optionally, of the accepted normalized types.
+ */
+public class BaseCollectorPlugin implements CollectorPlugin {
+
+	private final FileSystem fs;
+
+	private static final Logger log = LoggerFactory.getLogger(BaseCollectorPlugin.class);
+
+	// MAPPING AND FILTERING ARE DEFINED HERE:
+	// https://docs.google.com/document/d/1Aj-ZAV11b44MCrAAUCPiS2TUlXb6PnJEu1utCMAcCOU/edit
+
+	public BaseCollectorPlugin(final FileSystem fs) {
+		this.fs = fs;
+	}
+
+	/**
+	 * Collects the records of the dump.
+	 *
+	 * @param api the api descriptor: the baseUrl is the path of the dump file, the params dbUrl, dbUser and
+	 *  dbPassword describe the database connection, acceptedNormTypes is the optional list of accepted types
+	 * @param report the report where processing errors are registered
+	 * @return the stream of the accepted XML records
+	 * @throws CollectorException if the baseUrl is missing, the path does not exist or cannot be verified,
+	 *  or the accepted datasources cannot be loaded
+	 */
+	@Override
+	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
+		// the path of the dump file on HDFS
+		// http://oai.base-search.net/initial_load/base_oaipmh_dump-current.tar
+		// it could be downloaded from iis-cdh5-test-gw.ocean.icm.edu.pl and then copied on HDFS
+		final Path filePath = Optional
+			.ofNullable(api.getBaseUrl())
+			.map(Path::new)
+			.orElseThrow(() -> new CollectorException("missing baseUrl"));
+
+		// get the parameters for the connection to the OpenAIRE database.
+		// the database is used to obtain the list of the datasources that the plugin will collect
+		final String dbUrl = api.getParams().get("dbUrl");
+		final String dbUser = api.getParams().get("dbUser");
+		final String dbPassword = api.getParams().get("dbPassword");
+
+		// the types(comma separated, empty value for all) that the plugin will collect,
+		// the types should be expressed in the format of the normalized types of BASE (for example 1,121,...)
+		final String acceptedNormTypesString = api.getParams().get("acceptedNormTypes");
+
+		log.info("baseUrl: {}", filePath);
+		log.info("dbUrl: {}", dbUrl);
+		log.info("dbUser: {}", dbUser);
+		log.info("dbPassword: {}", "***");
+		log.info("acceptedNormTypes: {}", acceptedNormTypesString);
+
+		// the existence check is kept outside of the try block, otherwise the "path does not exist" exception
+		// would be caught and wrapped again, hiding its message
+		final boolean pathExists;
+		try {
+			pathExists = this.fs.exists(filePath);
+		} catch (final Throwable e) {
+			throw new CollectorException("Error accessing path: " + filePath, e);
+		}
+		if (!pathExists) {
+			throw new CollectorException("path does not exist: " + filePath);
+		}
+
+		final Set<String> acceptedOpendoarIds = findAcceptedOpendoarIds(dbUrl, dbUser, dbPassword);
+
+		final Set<String> acceptedNormTypes = new HashSet<>();
+		if (StringUtils.isNotBlank(acceptedNormTypesString)) {
+			for (final String s : StringUtils.split(acceptedNormTypesString, ",")) {
+				if (StringUtils.isNotBlank(s)) {
+					acceptedNormTypes.add(s.trim());
+				}
+			}
+		}
+
+		final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
+		final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
+		return StreamSupport
+			.stream(spliterator, false)
+			.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
+	}
+
+	/**
+	 * Loads the identifiers of the accepted datasources, running the query stored in the classpath resource
+	 * opendoar-accepted.sql on the given database.
+	 *
+	 * @return the values of the "id" column returned by the query
+	 * @throws CollectorException if the SQL resource is missing or cannot be read
+	 */
+	private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
+		throws CollectorException {
+		final Set<String> accepted = new HashSet<>();
+
+		final String sqlResource = "/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql";
+
+		// the resource stream is declared in the try-with-resources so that it is always closed
+		try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword);
+			final java.io.InputStream sqlStream = getClass().getResourceAsStream(sqlResource)) {
+
+			if (sqlStream == null) {
+				throw new CollectorException("missing SQL resource: " + sqlResource);
+			}
+
+			// explicit charset: the result must not depend on the platform default
+			final String sql = IOUtils.toString(sqlStream, "UTF-8");
+
+			dbClient.processResults(sql, row -> {
+				try {
+					final String dsId = row.getString("id");
+					log.info("Accepted Datasource: {}", dsId);
+					accepted.add(dsId);
+				} catch (final SQLException e) {
+					log.error("Error in SQL", e);
+					throw new RuntimeException("Error in SQL", e);
+				}
+			});
+
+		} catch (final IOException e) {
+			log.error("Error accessing SQL", e);
+			throw new CollectorException("Error accessing SQL", e);
+		}
+
+		log.info("Accepted Datasources (TOTAL): {}", accepted.size());
+
+		return accepted;
+	}
+
+	/**
+	 * Verifies if a record must be collected.
+	 *
+	 * @param xml the XML record
+	 * @param acceptedOpendoarIds the accepted datasource identifiers (with the "opendoar____::" prefix)
+	 * @param acceptedNormTypes the accepted normalized types, an empty set accepts all the types
+	 * @return true if the opendoar_id of the collection element is accepted and, when the set of types is not
+	 *  empty, at least one typenorm element of the record is accepted
+	 * @throws RuntimeException if the record is not a well-formed XML document
+	 */
+	protected static boolean filterXml(final String xml,
+		final Set<String> acceptedOpendoarIds,
+		final Set<String> acceptedNormTypes) {
+		try {
+
+			final Document doc = DocumentHelper.parseText(xml);
+
+			final String id = doc.valueOf("//*[local-name()='collection']/@opendoar_id").trim();
+
+			if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) {
+				return false;
+			}
+
+			if (acceptedNormTypes.isEmpty()) {
+				return true;
+			}
+
+			for (final Object s : doc.selectNodes("//*[local-name()='typenorm']")) {
+				if (acceptedNormTypes.contains(((Node) s).getText().trim())) {
+					return true;
+				}
+			}
+
+			return false;
+		} catch (final DocumentException e) {
+			log.error("Error parsing document", e);
+			throw new RuntimeException("Error parsing document", e);
+		}
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/sql/base.sql
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/sql/base.sql
@ -0,0 +1,114 @@
+-- Registers the BASE datasource, its organization, its 'baseDump' API and the API parameters.
+-- Everything runs in a single transaction.
+BEGIN;
+
+-- the datasource
+INSERT INTO dsm_services (
+	_dnet_resource_identifier_, id, officialname, englishname,
+	namespaceprefix, websiteurl, logourl, platform,
+	contactemail, collectedfrom, provenanceaction, _typology_to_remove_,
+	eosc_type, eosc_datasource_type, research_entity_types, thematic
+) VALUES (
+	'openaire____::base_search',                                         -- _dnet_resource_identifier_
+	'openaire____::base_search',                                         -- id
+	'Bielefeld Academic Search Engine (BASE)',                           -- officialname
+	'Bielefeld Academic Search Engine (BASE)',                           -- englishname
+	'base_search_',                                                      -- namespaceprefix
+	'https://www.base-search.net',                                       -- websiteurl
+	'https://www.base-search.net/about/download/logo_224x57_white.gif',  -- logourl
+	'BASE',                                                              -- platform
+	'openaire-helpdesk@uni-bielefeld.de',                                -- contactemail
+	'infrastruct_::openaire',                                            -- collectedfrom
+	'user:insert',                                                       -- provenanceaction
+	'aggregator::pubsrepository::unknown',                               -- _typology_to_remove_
+	'Data Source',                                                       -- eosc_type
+	'Aggregator',                                                        -- eosc_datasource_type
+	ARRAY['Research Products'],                                          -- research_entity_types
+	false                                                                -- thematic
+);
+
+-- the link between the datasource and its organization
+INSERT INTO dsm_service_organization (_dnet_resource_identifier_, organization, service)
+VALUES (
+	'fairsharing_::org::214@@openaire____::base_search',  -- _dnet_resource_identifier_
+	'fairsharing_::org::214',                             -- organization
+	'openaire____::base_search'                           -- service
+);
+
+-- the API: protocol 'baseDump', the baseurl is the path of the tar dump
+INSERT INTO dsm_api (_dnet_resource_identifier_, id, service, protocol, baseurl, metadata_identifier_path)
+VALUES (
+	'api_________::openaire____::base_search::dump',                 -- _dnet_resource_identifier_
+	'api_________::openaire____::base_search::dump',                 -- id
+	'openaire____::base_search',                                     -- service
+	'baseDump',                                                      -- protocol
+	'/user/michele.artini/base-import/base_oaipmh_dump-current.tar', -- baseurl
+	'//*[local-name()=''header'']/*[local-name()=''identifier'']'    -- metadata_identifier_path
+);
+
+-- the API parameters, one row per parameter
+-- NOTE(review): the dbPassword value is the placeholder '***', presumably to be replaced at deploy time
+INSERT INTO dsm_apiparams (_dnet_resource_identifier_, api, param, value)
+VALUES
+	(
+		'api_________::openaire____::base_search::dump@@dbUrl',
+		'api_________::openaire____::base_search::dump',
+		'dbUrl',
+		'jdbc:postgresql://postgresql.services.openaire.eu:5432/dnet_openaireplus'
+	),
+	(
+		'api_________::openaire____::base_search::dump@@dbUser',
+		'api_________::openaire____::base_search::dump',
+		'dbUser',
+		'dnet'
+	),
+	(
+		'api_________::openaire____::base_search::dump@@dbPassword',
+		'api_________::openaire____::base_search::dump',
+		'dbPassword',
+		'***'
+	),
+	(
+		'api_________::openaire____::base_search::dump@@acceptedNormTypes',
+		'api_________::openaire____::base_search::dump',
+		'acceptedNormTypes',
+		'1,11,111,121,13,14,15,18,181,182,183,1A,6,7'
+	);
+
+COMMIT;
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql
@ -0,0 +1,9 @@
+-- Institutional datasources collected from OpenDOAR having at least one API whose compatibility
+-- is 'driver' or 'UNKNOWN' and no API whose compatibility contains 'openaire'.
+-- The override value, when present, takes precedence over the declared compatibility.
+SELECT s.id AS id
+FROM dsm_services s
+WHERE s.collectedfrom = 'openaire____::opendoar'
+	AND s.jurisdiction = 'Institutional'
+	AND s.id IN (
+		SELECT a.service
+		FROM dsm_api a
+		WHERE coalesce(a.compatibility_override, a.compatibility) IN ('driver', 'UNKNOWN')
+	)
+	AND s.id NOT IN (
+		SELECT a.service
+		FROM dsm_api a
+		WHERE coalesce(a.compatibility_override, a.compatibility) LIKE '%openaire%'
+	);
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-aggregation-status.sql
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-aggregation-status.sql
@ -0,0 +1,11 @@
+-- For each datasource collected from OpenDOAR: its jurisdiction and one entry per API, formatted as
+-- '<api id> (compliance: <compatibility>)@@@<last collection total>'.
+-- A missing compatibility is reported as 'UNKNOWN', a missing total as 0.
+SELECT
+	s.id AS id,
+	s.jurisdiction AS jurisdiction,
+	array_remove(
+		array_agg(
+			a.id
+				|| ' (compliance: ' || coalesce(a.compatibility_override, a.compatibility, 'UNKNOWN') || ')@@@'
+				|| coalesce(a.last_collection_total, 0)
+		),
+		NULL
+	) AS aggregations
+FROM dsm_services s
+	JOIN dsm_api a ON s.id = a.service
+WHERE s.collectedfrom = 'openaire____::opendoar'
+GROUP BY s.id;
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/xml/base-types.vocabulary.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/xml/base-types.vocabulary.xml
@ -0,0 +1,180 @@
+<RESOURCE_PROFILE>
+	<HEADER>
+		<RESOURCE_IDENTIFIER value="c67911d6-9988-4a3b-b965-7d39bdd4a31d_Vm9jYWJ1bGFyeURTUmVzb3VyY2VzL1ZvY2FidWxhcnlEU1Jlc291cmNlVHlwZQ==" />
+		<RESOURCE_TYPE value="VocabularyDSResourceType" />
+		<RESOURCE_KIND value="VocabularyDSResources" />
+		<RESOURCE_URI value="" />
+		<DATE_OF_CREATION value="2024-02-13T11:15:48+00:00" />
+	</HEADER>
+	<BODY>
+		<CONFIGURATION>
+			<VOCABULARY_NAME code="base:normalized_types">base:normalized_types</VOCABULARY_NAME>
+			<VOCABULARY_DESCRIPTION>base:normalized_types</VOCABULARY_DESCRIPTION>
+			<TERMS>
+				<TERM native_name="Text" code="Text" english_name="Text" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="1" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Book" code="Book" english_name="Book" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="11" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Book part" code="Book part" english_name="Book part" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="111" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Journal/Newspaper" code="Journal/Newspaper" english_name="Journal/Newspaper" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="12" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Article contribution" code="Article contribution" english_name="Article contribution" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="121" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Other non-article" code="Other non-article" english_name="Other non-article" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="122" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Conference object" code="Conference object" english_name="Conference object" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="13" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Report" code="Report" english_name="Report" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="14" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Review" code="Review" english_name="Review" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="15" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Course material" code="Course material" english_name="Course material" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="16" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Lecture" code="Lecture" english_name="Lecture" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="17" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Thesis" code="Thesis" english_name="Thesis" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="18" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Bachelor's thesis" code="Bachelor's thesis" english_name="Bachelor's thesis" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="181" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Master's thesis" code="Master's thesis" english_name="Master's thesis" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="182" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Doctoral and postdoctoral thesis" code="Doctoral and postdoctoral thesis" english_name="Doctoral and postdoctoral thesis" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="183" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Manuscript" code="Manuscript" english_name="Manuscript" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="19" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Patent" code="Patent" english_name="Patent" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="1A" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Musical notation" code="Musical notation" english_name="Musical notation" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="2" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Map" code="Map" english_name="Map" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="3" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Audio" code="Audio" english_name="Audio" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="4" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Image/Video" code="Image/Video" english_name="Image/Video" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="5" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Still image" code="Still image" english_name="Still image" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="51" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Moving image/Video" code="Moving image/Video" english_name="Moving image/Video" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="52" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Software" code="Software" english_name="Software" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="6" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Dataset" code="Dataset" english_name="Dataset" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="7" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+				<TERM native_name="Unknown" code="Unknown" english_name="Unknown" encoding="BASE">
+					<SYNONYMS>
+						<SYNONYM term="F" encoding="BASE" />
+					</SYNONYMS>
+					<RELATIONS />
+				</TERM>
+
+			</TERMS>
+		</CONFIGURATION>
+		<STATUS>
+			<LAST_UPDATE value="2013-11-18T10:46:36Z" />
+		</STATUS>
+		<SECURITY_PARAMETERS>String</SECURITY_PARAMETERS>
+	</BODY>
+</RESOURCE_PROFILE>
+
+                
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/xml/base2oaf.transformationRule.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/xml/base2oaf.transformationRule.xml
@ -0,0 +1,298 @@
+<RESOURCE_PROFILE>
+	<HEADER>
+		<RESOURCE_IDENTIFIER value="" />
+		<RESOURCE_TYPE value="TransformationRuleDSResourceType" />
+		<RESOURCE_KIND value="TransformationRuleDSResources" />
+		<RESOURCE_URI value="" />
+		<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00" />
+	</HEADER>
+	<BODY>
+		<CONFIGURATION>
+			<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc" />
+			<SINK_METADATA_FORMAT name="oaf_hbase" />
+			<IMPORTED />
+			<SCRIPT>
+				<TITLE>xslt_base2oaf_hadoop</TITLE>
+				<CODE>
+					<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
+						xmlns:base_dc="http://oai.base-search.net/base_dc/"
+						xmlns:datacite="http://datacite.org/schema/kernel-4" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+						xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:vocabulary="http://eu/dnetlib/transform/clean" xmlns:oaf="http://namespace.openaire.eu/oaf"
+						xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dc="http://purl.org/dc/elements/1.1/"
+						exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
+						<!-- Stylesheet parameters. -->
+						<!-- varOfficialName / varDataSourceId: written as name and id of oaf:collectedFrom; no default value. -->
+						<xsl:param name="varOfficialName" />
+						<xsl:param name="varDataSourceId" />
+						<!-- varFP7 / varH2020: prefixes prepended to the grant number in oaf:projectid. -->
+						<xsl:param name="varFP7" select="'corda_______::'" />
+						<xsl:param name="varH2020" select="'corda__h2020::'" />
+						<!-- repoCode: the part of header/recordIdentifier preceding the first ':'. -->
+						<!-- NOTE(review): repoCode and index are not referenced by the templates of this stylesheet. -->
+						<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
+						<xsl:param name="index" select="0" />
+						<!-- transDate: written in dr:dateOfTransformation; defaults to the current date and time. -->
+						<xsl:param name="transDate" select="current-dateTime()" />
+
+						<!-- Aborts the transformation of the current record: xsl:message with terminate="yes" stops the processor. -->
+						<!-- Called for deleted records and for records without any http identifier. -->
+						<xsl:template name="terminate">
+							<xsl:message terminate="yes">
+								record is not compliant, transformation is interrupted.
+							</xsl:message>
+						</xsl:template>
+
+						<!-- Main template: builds the output <record> made of the processed header, a <metadata> section -->
+						<!-- with the dc:*, dr:* and oaf:* elements derived from the BASE record, and a copy of the 'about' elements. -->
+						<xsl:template match="/">
+							<record>
+								<xsl:apply-templates select="//*[local-name() = 'header']" />
+
+
+<!-- TO EVALUATE
+base_dc:authod_id
+base_dc:authod_id/base_dc:creator_id
+base_dc:authod_id/base_dc:creator_name
+
+example:
+
+<dc:creator>ALBU, Svetlana</dc:creator>
+
+<base_dc:authod_id>
+	<base_dc:creator_name>ALBU, Svetlana</base_dc:creator_name>
+    <base_dc:creator_id>https://orcid.org/0000-0002-8648-950X</base_dc:creator_id>
+</base_dc:authod_id>
+-->
+
+<!-- NOT USED 
+base_dc:global_id (I used oai:identifier)
+base_dc:collection/text()
+
+base_dc:continent
+base_dc:country
+base_dc:year (I used dc:date)
+dc:coverage
+dc:language (I used base_dc:lang)
+base_dc:link (I used dc:identifier)
+-->
+
+
+								<!-- base_dc:typenorm passed through vocabulary:clean with the 'base:normalized_types' vocabulary. -->
+								<!-- NOTE(review): vocabulary:clean is an extension function; presumably it maps a code to the vocabulary term (confirm). -->
+								<xsl:variable name="varBaseNormType" select="vocabulary:clean(//base_dc:typenorm, 'base:normalized_types')" />
+								
+
+								<metadata>
+									<!-- Dublin Core fields: one output element per source element, with normalized whitespace. -->
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:title" />
+										<xsl:with-param name="targetElement" select="'dc:title'" />
+									</xsl:call-template>
+
+									<!-- Creators: when the value contains '|', only the part preceding the last '|' is kept. -->
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:creator/replace(., '^(.*)\|.*$', '$1')" />
+										<xsl:with-param name="targetElement" select="'dc:creator'" />
+									</xsl:call-template>
+
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:contributor" />
+										<xsl:with-param name="targetElement" select="'dc:contributor'" />
+									</xsl:call-template>
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:description" />
+										<xsl:with-param name="targetElement" select="'dc:description'" />
+									</xsl:call-template>
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:subject" />
+										<xsl:with-param name="targetElement" select="'dc:subject'" />
+									</xsl:call-template>
+									
+									<!-- Class codes are emitted as additional subjects, formatted as '<type attribute>:<code>'. -->
+									<!-- TODO: I'm not sure if this is the correct encoding -->
+									<xsl:for-each select="//base_dc:classcode|//base_dc:autoclasscode">
+										<dc:subject><xsl:value-of select="concat(@type, ':', .)" /></dc:subject>
+									</xsl:for-each>
+									<!-- END TODO -->
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:publisher" />
+										<xsl:with-param name="targetElement" select="'dc:publisher'" />
+									</xsl:call-template>
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:format" />
+										<xsl:with-param name="targetElement" select="'dc:format'" />
+									</xsl:call-template>
+									
+									<!-- Types: the cleaned BASE type first, then every original dc:type value. -->
+									<dc:type>
+										<xsl:value-of select="$varBaseNormType" />
+									</dc:type>
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:type" />
+										<xsl:with-param name="targetElement" select="'dc:type'" />
+									</xsl:call-template>
+									
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:source" />
+										<xsl:with-param name="targetElement" select="'dc:source'" />
+									</xsl:call-template>
+									
+									<!-- The language is taken from base_dc:lang (not from dc:language) and cleaned with 'dnet:languages'. -->
+									<dc:language>
+										<xsl:value-of select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" />
+									</dc:language>
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:rights" />
+										<xsl:with-param name="targetElement" select="'dc:rights'" />
+									</xsl:call-template>
+									
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:relation" />
+										<xsl:with-param name="targetElement" select="'dc:relation'" />
+									</xsl:call-template>
+									
+									<!-- A record without any dc:identifier starting with 'http' is rejected. -->
+									<xsl:if test="not(//dc:identifier[starts-with(., 'http')])">
+										<xsl:call-template name="terminate" />
+									</xsl:if>
+									
+									<!-- Only the identifiers starting with 'http' are emitted as dc:identifier. -->
+									<xsl:call-template name="allElements">
+										<xsl:with-param name="sourceElement" select="//dc:identifier[starts-with(., 'http')]" />
+										<xsl:with-param name="targetElement" select="'dc:identifier'" />
+									</xsl:call-template>
+									
+									<!-- Projects: a dc:relation matching info:eu-repo/grantagreement/ec/fp7/ or .../h2020/ followed by six digits -->
+									<!-- (case-insensitive) produces an oaf:projectid made of the funding prefix and the six digits. -->
+									<xsl:for-each select="//dc:relation">
+										<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
+											<oaf:projectid>
+												<xsl:value-of select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
+											</oaf:projectid>
+										</xsl:if>
+										<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
+											<oaf:projectid>
+												<xsl:value-of select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
+											</oaf:projectid>
+										</xsl:if>
+									</xsl:for-each>
+
+									<!-- Typology: the cleaned BASE type is cleaned again with 'dnet:publication_resource' (element value) -->
+									<!-- and that result with 'dnet:result_typologies' (type attribute). -->
+									<dr:CobjCategory>
+										<xsl:variable name="varCobjCategory" select="vocabulary:clean($varBaseNormType, 'dnet:publication_resource')" />
+										<xsl:variable name="varSuperType" select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')" />
+										<xsl:attribute name="type" select="$varSuperType" />
+										<xsl:value-of select="$varCobjCategory" />
+									</dr:CobjCategory>
+									
+									<!-- Access rights, first matching rule wins: base_dc:oa equal to '1' gives OPEN, then base_dc:rightsnorm, -->
+									<!-- then dc:rights (both cleaned with 'dnet:access_modes'), otherwise UNKNOWN. -->
+									<oaf:accessrights>
+										<xsl:choose>
+											<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
+											<xsl:when test="//base_dc:rightsnorm">
+												<xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')" />
+											</xsl:when>
+											<xsl:when test="//dc:rights">
+												<xsl:value-of select="vocabulary:clean( //dc:rights, 'dnet:access_modes')" />
+											</xsl:when>
+											<xsl:otherwise>UNKNOWN</xsl:otherwise>
+										</xsl:choose>
+									</oaf:accessrights>
+									
+									<!-- Identifiers: DOIs come from base_dc:doi. -->
+									<xsl:for-each select="//base_dc:doi">
+										<oaf:identifier identifierType="doi">
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+
+									<!-- Distinct http identifiers, excluding those pointing to dx.doi.org, doi.org or hdl.handle.net. -->
+									<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
+										<oaf:identifier identifierType="url">
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+
+									<!-- Handles: the part of the hdl.handle.net urls following 'hdl.handle.net/'. -->
+									<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
+										<oaf:identifier identifierType="handle">
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>									
+
+									<!-- URNs: only the identifiers starting with 'urn:nbn:nl:' (lower or upper case) are considered. -->
+									<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
+										<oaf:identifier identifierType='urn'>
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+									
+									<!-- Original OAI identifier: read from the provenance originDescription that has no nested originDescription. -->
+									<oaf:identifier identifierType="oai-original">
+										<xsl:value-of
+											select="//*[local-name() = 'about']/*[local-name() = 'provenance']//*[local-name() = 'originDescription' and not(./*[local-name() = 'originDescription'])]/*[local-name() = 'identifier']" />
+									</oaf:identifier>
+									
+									<!-- hostedBy: name from base_dc:collname, id built as 'opendoar____::' plus the opendoar_id of the collection. -->
+									<oaf:hostedBy>
+										<xsl:attribute name="name">
+											<xsl:value-of select="//base_dc:collname" />
+										</xsl:attribute>
+										<xsl:attribute name="id">
+											<xsl:value-of select="concat('opendoar____::', //base_dc:collection/@opendoar_id)" />
+										</xsl:attribute>
+									</oaf:hostedBy>
+									
+									<!-- collectedFrom: filled with the stylesheet parameters. -->
+									<oaf:collectedFrom>
+										<xsl:attribute name="name">
+											<xsl:value-of select="$varOfficialName" />
+										</xsl:attribute>
+										<xsl:attribute name="id">
+											<xsl:value-of select="$varDataSourceId" />
+										</xsl:attribute>
+									</oaf:collectedFrom>
+									
+									<!-- dateAccepted: dc:date passed through the dateCleaner:dateISO extension function. -->
+									<!-- NOTE(review): //dc:date[1] selects the first dc:date of each parent, not the first in the document (confirm a single value is always passed). -->
+									<oaf:dateAccepted>
+										<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )" />
+									</oaf:dateAccepted>
+									
+									<!-- Fulltext: only when base_dc:oa is '1', every dc:relation starting with 'http' is emitted. -->
+									<xsl:if test="//base_dc:oa[.='1']">
+										<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
+											<oaf:fulltext>
+												<xsl:value-of select="normalize-space(.)" />
+											</oaf:fulltext>
+										</xsl:for-each>
+									</xsl:if>
+									
+									<!-- Affiliation relations: one per ror_id attribute of the collection; the value is prefixed with -->
+									<!-- 'ror_________::' and, when it does not already contain it, with 'https://ror.org/'. -->
+									<xsl:for-each select="//base_dc:collection/@ror_id">
+										<oaf:relation relType="resultOrganization"
+										 	subRelType="affiliation"
+										 	relClass="hasAuthorInstitution"
+											targetType="organization">
+											<xsl:choose>
+												<xsl:when test="contains(.,'https://ror.org/')">
+													<xsl:value-of select="concat('ror_________::', normalize-space(.))" />
+												</xsl:when>
+												<xsl:otherwise>
+													<xsl:value-of select="concat('ror_________::https://ror.org/', normalize-space(.))" />
+												</xsl:otherwise>
+											</xsl:choose>
+										</oaf:relation>
+									</xsl:for-each>
+								</metadata>
+								<!-- The 'about' elements of the source record are copied unchanged. -->
+								<xsl:copy-of select="//*[local-name() = 'about']" />
+							</record>
+						</xsl:template>
+
+						<!-- Emits one element per item of a sequence. -->
+						<!-- sourceElement: the items (nodes or strings) to copy. -->
+						<!-- targetElement: the qualified name of the elements to create. -->
+						<!-- The content of each created element is the whitespace-normalized string value of the item. -->
+						<xsl:template name="allElements">
+							<xsl:param name="sourceElement" />
+							<xsl:param name="targetElement" />
+							<xsl:for-each select="$sourceElement">
+								<xsl:element name="{$targetElement}">
+									<xsl:value-of select="normalize-space(.)" />
+								</xsl:element>
+							</xsl:for-each>
+						</xsl:template>
+
+						<!-- Header: rejects the records flagged as deleted, otherwise copies the header with its content -->
+						<!-- and appends dr:dateOfTransformation, filled with the transDate parameter. -->
+						<!-- NOTE(review): the match uses local-name() while the deleted check requires the oai namespace; -->
+						<!-- a header in a different namespace would not be detected as deleted (confirm this is intended). -->
+						<xsl:template match="//*[local-name() = 'header']">
+							<xsl:if test="//oai:header/@status='deleted'">
+								<xsl:call-template name="terminate" />
+							</xsl:if>
+							<xsl:copy>
+								<xsl:apply-templates select="node()|@*" />
+								<xsl:element name="dr:dateOfTransformation">
+									<xsl:value-of select="$transDate" />
+								</xsl:element>
+							</xsl:copy>
+						</xsl:template>
+
+						<!-- Identity rule: copies nodes and attributes recursively; here it is reached through the -->
+						<!-- apply-templates of the header template, so the header content is copied unchanged. -->
+						<xsl:template match="node()|@*">
+							<xsl:copy>
+								<xsl:apply-templates select="node()|@*" />
+							</xsl:copy>
+						</xsl:template>
+					</xsl:stylesheet>
+				</CODE>
+			</SCRIPT>
+		</CONFIGURATION>
+		<STATUS />
+		<SECURITY_PARAMETERS />
+	</BODY>
+</RESOURCE_PROFILE>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/xml/base2odf.transformationRule.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/xml/base2odf.transformationRule.xml
@ -0,0 +1,322 @@
+<RESOURCE_PROFILE>
+	<HEADER>
+		<RESOURCE_IDENTIFIER value="2ad0cdd9-c96c-484c-8b0e-ed56d86891fe_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU=" />
+		<RESOURCE_TYPE value="TransformationRuleDSResourceType" />
+		<RESOURCE_KIND value="TransformationRuleDSResources" />
+		<RESOURCE_URI value="" />
+		<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00" />
+	</HEADER>
+	<BODY>
+		<CONFIGURATION>
+			<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc" />
+			<SINK_METADATA_FORMAT name="odf_hbase" />
+			<IMPORTED />
+			<SCRIPT>
+				<TITLE>xslt_base2odf_hadoop</TITLE>
+				<CODE>
+					<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO" xmlns:base_dc="http://oai.base-search.net/base_dc/"
+						xmlns:datacite="http://datacite.org/schema/kernel-4" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+						xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:vocabulary="http://eu/dnetlib/transform/clean" xmlns:oaf="http://namespace.openaire.eu/oaf"
+						xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dc="http://purl.org/dc/elements/1.1/"
+						exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
+						<!-- Stylesheet parameters. -->
+						<!-- varOfficialName / varDataSourceId: no default value. -->
+						<!-- NOTE(review): presumably name and id of the collecting datasource, as in the oaf rule (confirm). -->
+						<xsl:param name="varOfficialName" />
+						<xsl:param name="varDataSourceId" />
+						<!-- varFP7 / varH2020: prefixes prepended to the grant number in oaf:projectid. -->
+						<xsl:param name="varFP7" select="'corda_______::'" />
+						<xsl:param name="varH2020" select="'corda__h2020::'" />
+						<!-- repoCode: the part of header/recordIdentifier preceding the first ':'. -->
+						<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
+						<xsl:param name="index" select="0" />
+						<!-- transDate: defaults to the current date and time. -->
+						<xsl:param name="transDate" select="current-dateTime()" />
+
+						<!-- Aborts the transformation of the current record: xsl:message with terminate="yes" stops the processor. -->
+						<xsl:template name="terminate">
+							<xsl:message terminate="yes">
+								record is not compliant, transformation is interrupted.
+							</xsl:message>
+						</xsl:template>
+
+						<xsl:template match="/">
+							<record>
+								<xsl:apply-templates select="//*[local-name() = 'header']" />
+
+
+								<!-- NOT USED 
+									base_dc:global_id (I used oai:identifier) 
+									base_dc:collection/text() 
+									base_dc:continent 
+									base_dc:country  
+									dc:coverage
+									dc:source
+									dc:relation
+									dc:type (I used //base_dc:typenorm)
+									dc:language (I used base_dc:lang) 
+									base_dc:link (I used dc:identifier)
+								 -->
+
+
+								<xsl:variable name="varBaseNormType" select="vocabulary:clean(//base_dc:typenorm, 'base:normalized_types')" />
+
+
+								<metadata>
+									<datacite:resource>
+
+										<xsl:for-each select="//base_dc:doi">
+											<datacite:identifier identifierType="DOI">
+												<xsl:value-of select="." />
+											</datacite:identifier>
+										</xsl:for-each>										
+
+										<datacite:alternateIdentifiers>
+											<xsl:for-each
+												select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
+												<datacite:identifier alternateIdentifierType="url">
+													<xsl:value-of select="." />
+												</datacite:identifier>
+											</xsl:for-each>
+
+											<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
+												<datacite:identifier alternateIdentifierType="handle">
+													<xsl:value-of select="." />
+												</datacite:identifier>
+											</xsl:for-each>
+
+											<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
+												<datacite:identifier alternateIdentifierType='urn'>
+													<xsl:value-of select="." />
+												</datacite:identifier>
+											</xsl:for-each>
+
+											<datacite:identifier alternateIdentifierType="oai-original">
+												<xsl:value-of
+													select="//*[local-name() = 'about']/*[local-name() = 'provenance']//*[local-name() = 'originDescription' and not(./*[local-name() = 'originDescription'])]/*[local-name() = 'identifier']" />
+											</datacite:identifier>
+										</datacite:alternateIdentifiers>
+
+										<datacite:relatedIdentifiers />
+
+
+										<datacite:resourceType><xsl:value-of select="$varBaseNormType" /></datacite:resourceType>
+
+										<datacite:titles>
+											<xsl:for-each select="//dc:title">
+												<datacite:title>
+													<xsl:value-of select="normalize-space(.)" />
+												</datacite:title>
+											</xsl:for-each>
+										</datacite:titles>
+
+										<datacite:creators>
+											<xsl:for-each select="//dc:creator">
+												<xsl:variable name="author" select="normalize-space(.)" />
+												<datacite:creator>
+													<datacite:creatorName>
+														<xsl:value-of select="$author" />
+													</datacite:creatorName>
+													<xsl:for-each select="//base_dc:authod_id[normalize-space(./base_dc:creator_name) = $author]/base_dc:creator_id ">
+														<xsl:if test="contains(.,'https://orcid.org/')">
+															<nameIdentifier schemeURI="https://orcid.org/" nameIdentifierScheme="ORCID">
+																<xsl:value-of select="substring-after(., 'https://orcid.org/')" />
+															</nameIdentifier>
+														</xsl:if>
+													</xsl:for-each>
+												</datacite:creator>
+											</xsl:for-each>
+										</datacite:creators>
+
+										<datacite:contributors>
+											<xsl:for-each select="//dc:contributor">
+												<datacite:contributor>
+													<datacite:contributorName>
+														<xsl:value-of select="normalize-space(.)" />
+													</datacite:contributorName>
+												</datacite:contributor>
+											</xsl:for-each>
+										</datacite:contributors>
+
+										<datacite:descriptions>
+											<xsl:for-each select="//dc:description">
+												<datacite:description descriptionType="Abstract">
+													<xsl:value-of select="normalize-space(.)" />
+												</datacite:description>
+											</xsl:for-each>
+										</datacite:descriptions>
+
+										<datacite:subjects>
+											<xsl:for-each select="//dc:subject">
+												<datacite:subject>
+													<xsl:value-of select="normalize-space(.)" />
+												</datacite:subject>
+											</xsl:for-each>
+											
+											<xsl:for-each select="//base_dc:classcode|//base_dc:autoclasscode">
+												<datacite:subject subjectScheme="{@type}" classificationCode="{normalize-space(.)}">
+													<!-- TODO the value should be obtained by the Code -->
+													<xsl:value-of select="normalize-space(.)" />
+												</datacite:subject>
+											</xsl:for-each>
+										</datacite:subjects>
+										
+										<datacite:publisher>
+											<xsl:value-of select="normalize-space(//dc:publisher)" />
+										</datacite:publisher>
+										
+										<datacite:publicationYear>
+											<xsl:value-of select="normalize-space(//base_dc:year)" />
+										</datacite:publicationYear>
+										
+										<datacite:formats>
+											<xsl:for-each select="//dc:format">
+												<datacite:format>
+													<xsl:value-of select="normalize-space(.)" />
+												</datacite:format>
+											</xsl:for-each>
+										</datacite:formats>
+										
+										<datacite:language>
+											<xsl:value-of select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" />
+										</datacite:language>
+
+										<oaf:accessrights>
+											<xsl:if test="//base_dc:oa[.='1']">
+												<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
+											</xsl:if>
+											<xsl:for-each select="//dc:rights|//base_dc:rightsnorm">
+												<datacite:rights><xsl:value-of select="vocabulary:clean(., 'dnet:access_modes')" /></datacite:rights>	
+											</xsl:for-each>
+										</oaf:accessrights>
+
+									</datacite:resource>
+
+										<xsl:for-each select="//dc:relation">
+											<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
+												<oaf:projectid>
+													<xsl:value-of select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
+												</oaf:projectid>
+											</xsl:if>
+											<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
+												<oaf:projectid>
+													<xsl:value-of select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
+												</oaf:projectid>
+											</xsl:if>
+										</xsl:for-each>
+
+									<dr:CobjCategory>
+										<xsl:variable name="varCobjCategory" select="vocabulary:clean($varBaseNormType, 'dnet:publication_resource')" />
+										<xsl:variable name="varSuperType" select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')" />
+										<xsl:attribute name="type" select="$varSuperType" />
+										<xsl:value-of select="$varCobjCategory" />
+									</dr:CobjCategory>
+
+									<oaf:accessrights>
+										<xsl:choose>
+											<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
+											<xsl:when test="//base_dc:rightsnorm">
+												<xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')" />
+											</xsl:when>
+											<xsl:when test="//dc:rights">
+												<xsl:value-of select="vocabulary:clean( //dc:rights, 'dnet:access_modes')" />
+											</xsl:when>
+											<xsl:otherwise>UNKNOWN</xsl:otherwise>
+										</xsl:choose>
+									</oaf:accessrights>
+
+									<xsl:for-each select="//base_dc:doi">
+										<oaf:identifier identifierType="doi">
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+
+									<xsl:for-each
+										select="distinct-values(//dc:identifier[starts-with(., 'http') and ( not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
+										<oaf:identifier identifierType="url">
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+
+									<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
+										<oaf:identifier identifierType="handle">
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+
+									<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
+										<oaf:identifier identifierType='urn'>
+											<xsl:value-of select="." />
+										</oaf:identifier>
+									</xsl:for-each>
+
+									<oaf:identifier identifierType="oai-original">
+										<xsl:value-of
+											select="//*[local-name() = 'about']/*[local-name() = 'provenance']//*[local-name() = 'originDescription' and not(./*[local-name() = 'originDescription'])]/*[local-name() = 'identifier']" />
+									</oaf:identifier>
+
+									<oaf:hostedBy>
+										<xsl:attribute name="name">
+											<xsl:value-of select="//base_dc:collname" />
+										</xsl:attribute>
+										<xsl:attribute name="id">
+											<xsl:value-of select="concat('opendoar____::', //base_dc:collection/@opendoar_id)" />
+										</xsl:attribute>
+									</oaf:hostedBy>
+
+									<oaf:collectedFrom>
+										<xsl:attribute name="name">
+											<xsl:value-of select="$varOfficialName" />
+										</xsl:attribute>
+										<xsl:attribute name="id">
+											<xsl:value-of select="$varDataSourceId" />
+										</xsl:attribute>
+									</oaf:collectedFrom>
+
+									<oaf:dateAccepted>
+										<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )" />
+									</oaf:dateAccepted>
+
+									<xsl:if test="//base_dc:oa[.='1']">
+										<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
+											<oaf:fulltext>
+												<xsl:value-of select="normalize-space(.)" />
+											</oaf:fulltext>
+										</xsl:for-each>
+									</xsl:if>
+
+									<xsl:for-each select="//base_dc:collection/@ror_id">
+										<oaf:relation relType="resultOrganization" subRelType="affiliation" relClass="hasAuthorInstitution" targetType="organization">
+											<xsl:choose>
+												<xsl:when test="contains(.,'https://ror.org/')">
+													<xsl:value-of select="concat('ror_________::', normalize-space(.))" />
+												</xsl:when>
+												<xsl:otherwise>
+													<xsl:value-of select="concat('ror_________::https://ror.org/', normalize-space(.))" />
+												</xsl:otherwise>
+											</xsl:choose>
+										</oaf:relation>
+									</xsl:for-each>
+								</metadata>
+								<xsl:copy-of select="//*[local-name() = 'about']" />
+							</record>
+						</xsl:template>
+
+						<!--
+							Header template: matches any element whose local name is 'header'.
+							When any oai:header in the document carries status='deleted', the named
+							template 'terminate' is called (it is defined outside this excerpt;
+							presumably it aborts the transformation - to be confirmed).
+							The header is then copied, its attributes and children are processed by
+							the other templates, and a dr:dateOfTransformation element holding
+							$transDate is appended as last child.
+						-->
+						<xsl:template match="//*[local-name() = 'header']">
+							<xsl:if test="//oai:header/@status='deleted'">
+								<xsl:call-template name="terminate" />
+							</xsl:if>
+							<xsl:copy>
+								<xsl:apply-templates select="node()|@*" />
+								<xsl:element name="dr:dateOfTransformation">
+									<xsl:value-of select="$transDate" />
+								</xsl:element>
+							</xsl:copy>
+						</xsl:template>
+
+						<!--
+							Identity template: copies the current node or attribute and recursively
+							applies templates to its attributes and children, so content reached via
+							apply-templates is reproduced as-is unless another template matches it.
+						-->
+						<xsl:template match="node()|@*">
+							<xsl:copy>
+								<xsl:apply-templates select="node()|@*" />
+							</xsl:copy>
+						</xsl:template>
+					</xsl:stylesheet>
+				</CODE>
+			</SCRIPT>
+		</CONFIGURATION>
+		<STATUS />
+		<SECURITY_PARAMETERS />
+	</BODY>
+</RESOURCE_PROFILE>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json
@ -1048,5 +1048,10 @@
  "openaire_id": "re3data_____::r3d100010399",
  "datacite_name": "ZEW Forschungsdatenzentrum",
  "official_name": "ZEW Forschungsdatenzentrum"
+ },
+ "HBP.NEUROINF": {
+  "openaire_id": "fairsharing_::2975",
+  "datacite_name": "EBRAINS",
+  "official_name": "EBRAINS"
 }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.bio

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH,MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
@ -11,6 +11,7 @@ import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
+
 object SparkTransformBioDatabaseToOAF {

  def main(args: Array[String]): Unit = {
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectionInfo.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectionInfo.java
@ -0,0 +1,38 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import java.io.Serializable;
+
+/**
+ * Serializable bean holding the identifiers of a single collection: its id and
+ * the associated OpenDOAR and ROR identifiers.
+ */
+public class BaseCollectionInfo implements Serializable {
+
+	private static final long serialVersionUID = 5766333937429419647L;
+
+	/** Collection identifier. */
+	private String id;
+
+	/** OpenDOAR identifier of the collection. */
+	private String opendoarId;
+
+	/** ROR identifier of the collection. */
+	private String rorId;
+
+	/** @return the collection identifier, or {@code null} if never set */
+	public String getId() {
+		return id;
+	}
+
+	/** @param id the collection identifier */
+	public void setId(final String id) {
+		this.id = id;
+	}
+
+	/** @return the OpenDOAR identifier, or {@code null} if never set */
+	public String getOpendoarId() {
+		return opendoarId;
+	}
+
+	/** @param opendoarId the OpenDOAR identifier */
+	public void setOpendoarId(final String opendoarId) {
+		this.opendoarId = opendoarId;
+	}
+
+	/** @return the ROR identifier, or {@code null} if never set */
+	public String getRorId() {
+		return rorId;
+	}
+
+	/** @param rorId the ROR identifier */
+	public void setRorId(final String rorId) {
+		this.rorId = rorId;
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java
@ -0,0 +1,184 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.Attribute;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.DocumentHelper;
+import org.dom4j.Element;
+import org.dom4j.Node;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+
+/**
+ * Disabled, manually-run exploration tests for the BASE collector: they walk sample
+ * records, gather statistics on XML paths, collections and types, and check that the
+ * extracted information can be turned into a Spark dataset.
+ */
+@Disabled
+public class BaseCollectorIteratorTest {
+
+	/**
+	 * Iterates over the records of {@code base-sample.tar}, printing per-path occurrence
+	 * counts, the attributes of each distinct collection and the distinct types found,
+	 * and finally asserts the number of records read.
+	 */
+	@Test
+	void testImportFile() throws Exception {
+
+		long count = 0;
+
+		final BaseCollectorIterator iterator = new BaseCollectorIterator("base-sample.tar", new AggregatorReport());
+
+		final Map<String, Map<String, String>> collections = new HashMap<>();
+		final Map<String, AtomicInteger> fields = new HashMap<>();
+		final Set<String> types = new HashSet<>();
+
+		while (iterator.hasNext()) {
+
+			final Document record = DocumentHelper.parseText(iterator.next());
+
+			count++;
+
+			if ((count % 1000) == 0) {
+				System.out.println("# Read records: " + count);
+			}
+
+			// System.out.println(record.asXML());
+
+			for (final Object o : record.selectNodes("//*|//@*")) {
+				final String path = ((Node) o).getPath();
+
+				// one counter per distinct path, created on first occurrence
+				fields.computeIfAbsent(path, k -> new AtomicInteger()).incrementAndGet();
+
+				if (o instanceof Element) {
+					final Element n = (Element) o;
+
+					if ("collection".equals(n.getName())) {
+						final String collName = n.getText().trim();
+						// only the attributes of the first occurrence of a collection are kept
+						if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) {
+							final Map<String, String> collAttrs = new HashMap<>();
+							for (final Object ao : n.attributes()) {
+								collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
+							}
+							collections.put(collName, collAttrs);
+						}
+					} else if ("type".equals(n.getName())) {
+						types.add(n.getText().trim());
+					}
+
+				}
+			}
+
+		}
+
+		final ObjectMapper mapper = new ObjectMapper();
+		for (final Entry<String, Map<String, String>> e : collections.entrySet()) {
+			System.out.println(e.getKey() + ": " + mapper.writeValueAsString(e.getValue()));
+
+		}
+
+		for (final Entry<String, AtomicInteger> e : fields.entrySet()) {
+			System.out.println(e.getKey() + ": " + e.getValue().get());
+
+		}
+
+		System.out.println("TYPES: ");
+		for (final String s : types) {
+			System.out.println(s);
+
+		}
+
+		assertEquals(30000, count);
+	}
+
+	/**
+	 * Builds ten {@link BaseRecordInfo} beans from {@code record.xml}, wraps them in a
+	 * Spark dataset using a bean encoder and prints its schema and content.
+	 */
+	@Test
+	public void testParquet() throws Exception {
+
+		// The fixture contains non-ASCII characters: decode it explicitly as UTF-8
+		// instead of relying on the platform default charset.
+		final String xml = IOUtils.toString(getClass().getResourceAsStream("record.xml"), "UTF-8");
+
+		final SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
+
+		try {
+			final List<BaseRecordInfo> ls = new ArrayList<>();
+
+			for (int i = 0; i < 10; i++) {
+				ls.add(extractInfo(xml));
+			}
+
+			final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
+				.fromSparkContext(spark.sparkContext())
+				.parallelize(ls);
+
+			final Dataset<BaseRecordInfo> df = spark
+				.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class));
+
+			df.printSchema();
+
+			df.show(false);
+		} finally {
+			// release the local Spark session even when the test body fails
+			spark.stop();
+		}
+	}
+
+	/**
+	 * Parses an XML record and collects its header identifier, the distinct paths of all
+	 * its elements and attributes, its {@code type}/{@code typenorm} values and its
+	 * non-blank {@code collection} elements.
+	 *
+	 * @param s the XML record
+	 * @return the extracted information
+	 * @throws RuntimeException wrapping the {@link DocumentException} raised when the XML cannot be parsed
+	 */
+	private BaseRecordInfo extractInfo(final String s) {
+		try {
+			final Document record = DocumentHelper.parseText(s);
+
+			final BaseRecordInfo info = new BaseRecordInfo();
+
+			// linked sets: drop duplicates while keeping document order
+			final Set<String> paths = new LinkedHashSet<>();
+			final Set<String> types = new LinkedHashSet<>();
+			final List<BaseCollectionInfo> colls = new ArrayList<>();
+
+			for (final Object o : record.selectNodes("//*|//@*")) {
+				paths.add(((Node) o).getPath());
+
+				if (o instanceof Element) {
+					final Element n = (Element) o;
+
+					final String nodeName = n.getName();
+
+					if ("collection".equals(nodeName)) {
+						final String collName = n.getText().trim();
+
+						if (StringUtils.isNotBlank(collName)) {
+							final BaseCollectionInfo coll = new BaseCollectionInfo();
+							coll.setId(collName);
+							coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
+							coll.setRorId(n.valueOf("@ror_id").trim());
+							colls.add(coll);
+						}
+					} else if ("type".equals(nodeName)) {
+						types.add("TYPE: " + n.getText().trim());
+					} else if ("typenorm".equals(nodeName)) {
+						types.add("TYPE_NORM: " + n.getText().trim());
+					}
+				}
+			}
+
+			info.setId(record.valueOf("//*[local-name() = 'header']/*[local-name() = 'identifier']").trim());
+			info.getTypes().addAll(types);
+			info.getPaths().addAll(paths);
+			info.setCollections(colls);
+
+			return info;
+		} catch (final DocumentException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPluginTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPluginTest.java
@ -0,0 +1,32 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+
+/** Unit test for {@code BaseCollectorPlugin.filterXml}, run against the {@code record.xml} fixture. */
+class BaseCollectorPluginTest {
+
+	/**
+	 * The fixture record is expected to be accepted when the set of valid ids contains
+	 * {@code opendoar____::1234} and the set of valid types either contains {@code 121}
+	 * or is empty, and to be rejected when the set of valid ids is empty or the set of
+	 * valid types does not contain {@code 121}.
+	 */
+	@Test
+	void testFilterXml() throws Exception {
+		// The fixture contains non-ASCII characters: decode it explicitly as UTF-8
+		// instead of relying on the platform default charset.
+		final String xml = IOUtils.toString(getClass().getResourceAsStream("record.xml"), "UTF-8");
+
+		final Set<String> validIds = new HashSet<>(Arrays.asList("opendoar____::1234", "opendoar____::4567"));
+		final Set<String> validTypes = new HashSet<>(Arrays.asList("1", "121"));
+		final Set<String> validTypes2 = new HashSet<>(Arrays.asList("1", "11"));
+
+		assertTrue(BaseCollectorPlugin.filterXml(xml, validIds, validTypes));
+		assertTrue(BaseCollectorPlugin.filterXml(xml, validIds, new HashSet<>()));
+
+		assertFalse(BaseCollectorPlugin.filterXml(xml, new HashSet<>(), validTypes));
+		assertFalse(BaseCollectorPlugin.filterXml(xml, validIds, validTypes2));
+
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseRecordInfo.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseRecordInfo.java
@ -0,0 +1,49 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Serializable bean summarising a single record: its identifier, the collections it
+ * belongs to, the XML paths found in it and its types. The list properties start as
+ * empty, mutable lists.
+ */
+public class BaseRecordInfo implements Serializable {
+
+	private static final long serialVersionUID = -8848232018350074593L;
+
+	/** Record identifier. */
+	private String id;
+
+	/** Collections referenced by the record. */
+	private List<BaseCollectionInfo> collections = new ArrayList<>();
+
+	/** XML paths found in the record. */
+	private List<String> paths = new ArrayList<>();
+
+	/** Types declared by the record. */
+	private List<String> types = new ArrayList<>();
+
+	/** @return the record identifier, or {@code null} if never set */
+	public String getId() {
+		return id;
+	}
+
+	/** @param id the record identifier */
+	public void setId(final String id) {
+		this.id = id;
+	}
+
+	/** @return the list of paths (the internal list, not a copy) */
+	public List<String> getPaths() {
+		return paths;
+	}
+
+	/** @param paths the list of paths, stored as-is */
+	public void setPaths(final List<String> paths) {
+		this.paths = paths;
+	}
+
+	/** @return the list of types (the internal list, not a copy) */
+	public List<String> getTypes() {
+		return types;
+	}
+
+	/** @param types the list of types, stored as-is */
+	public void setTypes(final List<String> types) {
+		this.types = types;
+	}
+
+	/** @return the list of collections (the internal list, not a copy) */
+	public List<BaseCollectionInfo> getCollections() {
+		return collections;
+	}
+
+	/** @param collections the list of collections, stored as-is */
+	public void setCollections(final List<BaseCollectionInfo> collections) {
+		this.collections = collections;
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseTransfomationTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseTransfomationTest.java
@ -0,0 +1,78 @@
+
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.util.LongAccumulator;
+import org.dom4j.io.SAXReader;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.schema.mdstore.Provenance;
+import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+
+/**
+ * Disabled, manually-run tests that apply the BASE transformation rules to the
+ * {@code record.xml} fixture and print the transformed record.
+ */
+@Disabled
+@ExtendWith(MockitoExtension.class)
+public class BaseTransfomationTest extends AbstractVocabularyTest {
+
+	private SparkConf sparkConf;
+
+	/** Initialises the vocabularies and a local Spark configuration before each test. */
+	@BeforeEach
+	public void setUp() throws IOException, ISLookUpException {
+		setUpVocabulary();
+
+		this.sparkConf = new SparkConf();
+		this.sparkConf.setMaster("local[*]");
+		this.sparkConf.set("spark.driver.host", "localhost");
+		this.sparkConf.set("spark.ui.enabled", "false");
+	}
+
+	/** Applies the {@code base2odf} rule to the sample record and prints the result. */
+	@Test
+	void testBase2ODF() throws Exception {
+
+		final MetadataRecord mr = newSampleRecord();
+
+		final XSLTTransformationFunction tr = loadTransformationRule("xml/base2odf.transformationRule.xml");
+
+		final MetadataRecord result = tr.call(mr);
+
+		System.out.println(result.getBody());
+	}
+
+	/** Applies the {@code base2oaf} rule to the sample record and prints the result. */
+	@Test
+	void testBase2OAF() throws Exception {
+
+		final MetadataRecord mr = newSampleRecord();
+
+		final XSLTTransformationFunction tr = loadTransformationRule("xml/base2oaf.transformationRule.xml");
+
+		final MetadataRecord result = tr.call(mr);
+
+		System.out.println(result.getBody());
+	}
+
+	/**
+	 * Builds the metadata record shared by the tests: a fixed provenance and the content
+	 * of {@code record.xml} as body.
+	 *
+	 * @return a new record wrapping the fixture
+	 * @throws IOException if the fixture cannot be read
+	 */
+	private MetadataRecord newSampleRecord() throws IOException {
+		final MetadataRecord mr = new MetadataRecord();
+		mr.setProvenance(new Provenance("DSID", "DSNAME", "PREFIX"));
+		// The fixture contains non-ASCII characters: decode it explicitly as UTF-8
+		// instead of relying on the platform default charset.
+		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("record.xml"), "UTF-8"));
+		return mr;
+	}
+
+	/**
+	 * Reads a transformation rule profile from the classpath and builds a transformation
+	 * function from the stylesheet found as first child element of its {@code CODE} element.
+	 *
+	 * @param path classpath location of the profile, resolved relative to this class
+	 * @return the transformation function wrapping the extracted XSLT
+	 */
+	private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception {
+		final String xslt = new SAXReader()
+			.read(this.getClass().getResourceAsStream(path))
+			.selectSingleNode("//CODE/*")
+			.asXML();
+
+		// the same accumulator instance backs all three counters of the AggregationCounter
+		final LongAccumulator la = new LongAccumulator();
+
+		return new XSLTTransformationFunction(new AggregationCounter(la, la, la), xslt, 0, this.vocabularies);
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/base/base-sample.tar
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/base/base-sample.tar
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/base/record.xml
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/base/record.xml
@ -0,0 +1,58 @@
+<record>
+  <header xmlns="http://www.openarchives.org/OAI/2.0/">
+    <identifier>ftdoajarticles:oai:doaj.org/article:e2d5b5126b2d4e479933cc7f9a9ae0c1</identifier>
+    <datestamp>2022-12-31T11:48:55Z</datestamp>
+  </header>
+  <metadata xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <base_dc:dc xsi:schemaLocation="http://oai.base-search.net/base_dc/ http://oai.base-search.net/base_dc/base_dc.xsd">
+      <base_dc:global_id>ftdoajarticles:oai:doaj.org/article:e2d5b5126b2d4e479933cc7f9a9ae0c1</base_dc:global_id>
+      <base_dc:continent>cww</base_dc:continent>
+      <base_dc:country>org</base_dc:country>
+      <base_dc:collection opendoar_id="1234" ror_id="ror1234">ftdoajarticles</base_dc:collection>
+      <base_dc:collname>TEST REPO</base_dc:collname>
+      <dc:title>Assessment of cultural heritage: the legislative and methodological framework of Russian Federation</dc:title>
+      <dc:creator>ALBU, Svetlana</dc:creator>
+      <dc:creator>LEȘAN, Anna</dc:creator>
+      <dc:subject>architectural heritage</dc:subject>
+      <dc:subject>evaluation of architectural heritage</dc:subject>
+      <dc:subject>types of values</dc:subject>
+      <dc:subject>experience of russian federation</dc:subject>
+      <dc:subject>Social Sciences</dc:subject>
+      <dc:subject>H</dc:subject>
+      <dc:description>Architectural heritage is the real estate inheritance by population of a country becoming an extremely valuable and specific category, preserving and capitalizing on those assets requires considerable effort. The state does not have sufficient means to maintain and preserve cultural heritage, as a result it is included in the civil circuit. The transfer of property right or of some partial rights over the architectural patrimony is accompanied by the necessity to estimate the value of goods. In this article, the authors examine the experience of Russian Federation (one of the largest countries with a huge architectural heritage) on the legislative framework of architectural and methodological heritage of architectural heritage assessment. The particularities of cultural assets valuation compared to other categories of real estate are examined, as well as the methodological aspects (types of values, methods applied in valuation, approaches according to the purpose of valuation) regarding the valuation of real estate with architectural value in Russian Federation.</dc:description>
+      <dc:publisher>Technical University of Moldova</dc:publisher>
+      <dc:date>2020-09-01T00:00:00Z</dc:date>
+      <base_dc:year>2020</base_dc:year>
+      <dc:type>article</dc:type>
+      <base_dc:typenorm>121</base_dc:typenorm>
+      <dc:identifier>https://doi.org/10.5281/zenodo.3971988</dc:identifier>
+      <dc:identifier>https://doaj.org/article/e2d5b5126b2d4e479933cc7f9a9ae0c1</dc:identifier>
+      <base_dc:link>https://doi.org/10.5281/zenodo.3971988</base_dc:link>
+      <dc:source>Journal of Social Sciences, Vol 3, Iss 3, Pp 134-143 (2020)</dc:source>
+      <dc:language>EN</dc:language>
+      <dc:language>FR</dc:language>
+      <dc:language>RO</dc:language>
+      <dc:relation>http://ibn.idsi.md/sites/default/files/imag_file/JSS-3-2020_134-143.pdf</dc:relation>
+      <dc:relation>https://doaj.org/toc/2587-3490</dc:relation>
+      <dc:relation>https://doaj.org/toc/2587-3504</dc:relation>
+      <dc:relation>doi:10.5281/zenodo.3971988</dc:relation>
+      <dc:relation>2587-3490</dc:relation>
+      <dc:relation>2587-3504</dc:relation>
+      <dc:relation>https://doaj.org/article/e2d5b5126b2d4e479933cc7f9a9ae0c1</dc:relation>
+      <base_dc:autoclasscode type="ddc">720</base_dc:autoclasscode>
+      <base_dc:authod_id>
+        <base_dc:creator_name>ALBU, Svetlana</base_dc:creator_name>
+        <base_dc:creator_id>https://orcid.org/0000-0002-8648-950X</base_dc:creator_id>
+      </base_dc:authod_id>
+      <base_dc:authod_id>
+        <base_dc:creator_name>LEȘAN, Anna</base_dc:creator_name>
+        <base_dc:creator_id>https://orcid.org/0000-0003-3284-0525</base_dc:creator_id>
+      </base_dc:authod_id>
+      <base_dc:doi>https://doi.org/10.5281/zenodo.3971988</base_dc:doi>
+      <base_dc:oa>1</base_dc:oa>
+      <base_dc:lang>eng</base_dc:lang>
+      <base_dc:lang>fre</base_dc:lang>
+      <base_dc:lang>rum</base_dc:lang>
+    </base_dc:dc>
+  </metadata>
+</record>
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt
@ -1496,4 +1496,30 @@ cnr:institutes @=@ __CDS131__ @=@ IBE - Istituto per la BioEconomia
 cnr:institutes @=@ https://ror.org/0263zy895 @=@ CDS132
 cnr:institutes @=@ https://ror.org/0263zy895 @=@ SCITEC - Istituto di Scienze e Tecnologie Chimiche \"Giulio Natta\"
 cnr:institutes @=@ __CDS133__ @=@ CDS133
-cnr:institutes @=@ __CDS133__ @=@ STEMS - Istituto di Scienze e Tecnologie per l'Energia e la Mobilità Sostenibili
+cnr:institutes @=@ __CDS133__ @=@ STEMS - Istituto di Scienze e Tecnologie per l'Energia e la Mobilità Sostenibili
+base:normalized_types @=@ Text @=@ 1
+base:normalized_types @=@ Book @=@ 11
+base:normalized_types @=@ Book part @=@ 111
+base:normalized_types @=@ Journal/Newspaper @=@ 12
+base:normalized_types @=@ Article contribution @=@ 121
+base:normalized_types @=@ Other non-article @=@ 122
+base:normalized_types @=@ Conference object @=@ 13
+base:normalized_types @=@ Report @=@ 14
+base:normalized_types @=@ Review @=@ 15
+base:normalized_types @=@ Course material @=@ 16
+base:normalized_types @=@ Lecture @=@ 17
+base:normalized_types @=@ Thesis @=@ 18
+base:normalized_types @=@ Bachelor's thesis @=@ 181
+base:normalized_types @=@ Master's thesis @=@ 182
+base:normalized_types @=@ Doctoral and postdoctoral thesis @=@ 183
+base:normalized_types @=@ Manuscript @=@ 19
+base:normalized_types @=@ Patent @=@ 1A
+base:normalized_types @=@ Musical notation @=@ 2
+base:normalized_types @=@ Map @=@ 3
+base:normalized_types @=@ Audio @=@ 4
+base:normalized_types @=@ Image/Video @=@ 5
+base:normalized_types @=@ Still image @=@ 51
+base:normalized_types @=@ Moving image/Video @=@ 52
+base:normalized_types @=@ Software @=@ 6
+base:normalized_types @=@ Dataset @=@ 7
+base:normalized_types @=@ Unknown @=@ F
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt
@ -1210,4 +1210,29 @@ cnr:institutes @=@ cnr:institutes @=@ __CDS130__ @=@ __CDS130__
 cnr:institutes @=@ cnr:institutes @=@ __CDS131__ @=@ __CDS131__
 cnr:institutes @=@ cnr:institutes @=@ https://ror.org/0263zy895 @=@ https://ror.org/0263zy895
 cnr:institutes @=@ cnr:institutes @=@ __CDS133__ @=@ __CDS133__
-
+base:normalized_types @=@ base:normalized_types @=@ Text @=@ Text
+base:normalized_types @=@ base:normalized_types @=@ Book @=@ Book
+base:normalized_types @=@ base:normalized_types @=@ Book part @=@ Book part
+base:normalized_types @=@ base:normalized_types @=@ Journal/Newspaper @=@ Journal/Newspaper
+base:normalized_types @=@ base:normalized_types @=@ Article contribution @=@ Article contribution
+base:normalized_types @=@ base:normalized_types @=@ Other non-article @=@ Other non-article
+base:normalized_types @=@ base:normalized_types @=@ Conference object @=@ Conference object
+base:normalized_types @=@ base:normalized_types @=@ Report @=@ Report
+base:normalized_types @=@ base:normalized_types @=@ Review @=@ Review
+base:normalized_types @=@ base:normalized_types @=@ Course material @=@ Course material
+base:normalized_types @=@ base:normalized_types @=@ Lecture @=@ Lecture
+base:normalized_types @=@ base:normalized_types @=@ Thesis @=@ Thesis
+base:normalized_types @=@ base:normalized_types @=@ Bachelor's thesis @=@ Bachelor's thesis
+base:normalized_types @=@ base:normalized_types @=@ Master's thesis @=@ Master's thesis
+base:normalized_types @=@ base:normalized_types @=@ Doctoral and postdoctoral thesis @=@ Doctoral and postdoctoral thesis
+base:normalized_types @=@ base:normalized_types @=@ Manuscript @=@ Manuscript
+base:normalized_types @=@ base:normalized_types @=@ Patent @=@ Patent
+base:normalized_types @=@ base:normalized_types @=@ Musical notation @=@ Musical notation
+base:normalized_types @=@ base:normalized_types @=@ Map @=@ Map
+base:normalized_types @=@ base:normalized_types @=@ Audio @=@ Audio
+base:normalized_types @=@ base:normalized_types @=@ Image/Video @=@ Image/Video
+base:normalized_types @=@ base:normalized_types @=@ Still image @=@ Still image
+base:normalized_types @=@ base:normalized_types @=@ Moving image/Video @=@ Moving image/Video
+base:normalized_types @=@ base:normalized_types @=@ Software @=@ Software
+base:normalized_types @=@ base:normalized_types @=@ Dataset @=@ Dataset
+base:normalized_types @=@ base:normalized_types @=@ Unknown @=@ Unknown
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -122,22 +122,41 @@ public class DedupRecordFactory {
 				}

 				return Stream
-					.concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
-					.map(id -> {
-						try {
-							OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity);
-							res.setId(id);
-							res.setDataInfo(dataInfo);
-							res.setLastupdatetimestamp(ts);
-							return res;
-						} catch (Exception e) {
-							throw new RuntimeException(e);
-						}
-					})
+					.concat(
+						Stream
+							.of(agg.getDedupId())
+							.map(id -> createDedupOafEntity(id, agg.entity, dataInfo, ts)),
+						agg.aliases
+							.stream()
+							.map(id -> createMergedDedupAliasOafEntity(id, agg.entity, dataInfo, ts)))
 					.iterator();
 			}, beanEncoder);
 	}

+	private static OafEntity createDedupOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
+		try {
+			OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
+			res.setId(id);
+			res.setDataInfo(dataInfo);
+			res.setLastupdatetimestamp(ts);
+			return res;
+		} catch (Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Creates the entity for a dedup alias: a copy of {@code base} built like
+	 * {@code createDedupOafEntity}, but whose data info is a separate copy of
+	 * {@code dataInfo} flagged as deleted by inference, so the {@code dataInfo}
+	 * instance passed in is left untouched.
+	 *
+	 * @param id the alias identifier assigned to the copy
+	 * @param base the entity to copy
+	 * @param dataInfo the data info to copy and flag
+	 * @param ts the last-update timestamp assigned to the copy
+	 * @return the new entity, with {@code deletedbyinference} set to {@code true}
+	 * @throws RuntimeException if the entity or the data info cannot be cloned
+	 */
+	private static OafEntity createMergedDedupAliasOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
+		// Kept outside the try block: createDedupOafEntity already wraps its failures in a
+		// RuntimeException, which must not be wrapped a second time.
+		OafEntity res = createDedupOafEntity(id, base, dataInfo, ts);
+		try {
+			DataInfo ds = (DataInfo) BeanUtils.cloneBean(dataInfo);
+			ds.setDeletedbyinference(true);
+			res.setDataInfo(ds);
+			return res;
+		} catch (Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
 	private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) {

 		if (duplicate == null) {
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml
@ -15,4 +15,12 @@
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
+    <property>
+        <name>hiveMetastoreUris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>pivotHistoryDatabase</name>
+        <value>&#x200B;</value>
+    </property>
 </configuration>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
@ -198,6 +198,8 @@
            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
            <arg>--actionSetId</arg><arg>${actionSetId}</arg>
            <arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
+            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
        </spark>
        <ok to="PrepareOrgRels"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/irish_funder.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/irish_funder.json
@ -73,12 +73,6 @@
    "name": "Irish Nephrology Society",
    "synonym": []
  },
-  {
-    "id": "100011062",
-    "uri": "http://dx.doi.org/10.13039/100011062",
-    "name": "Asian Spinal Cord Network",
-    "synonym": []
-  },
  {
    "id": "100011096",
    "uri": "http://dx.doi.org/10.13039/100011096",
@ -223,12 +217,6 @@
    "name": "Global Brain Health Institute",
    "synonym": []
  },
-  {
-    "id": "100015776",
-    "uri": "http://dx.doi.org/10.13039/100015776",
-    "name": "Health and Social Care Board",
-    "synonym": []
-  },
  {
    "id": "100015992",
    "uri": "http://dx.doi.org/10.13039/100015992",
@ -403,18 +391,6 @@
    "name": "Irish Hospice Foundation",
    "synonym": []
  },
-  {
-    "id": "501100001596",
-    "uri": "http://dx.doi.org/10.13039/501100001596",
-    "name": "Irish Research Council for Science, Engineering and Technology",
-    "synonym": []
-  },
-  {
-    "id": "501100001597",
-    "uri": "http://dx.doi.org/10.13039/501100001597",
-    "name": "Irish Research Council for the Humanities and Social Sciences",
-    "synonym": []
-  },
  {
    "id": "501100001598",
    "uri": "http://dx.doi.org/10.13039/501100001598",
@ -515,7 +491,7 @@
    "id": "501100002081",
    "uri": "http://dx.doi.org/10.13039/501100002081",
    "name": "Irish Research Council",
-    "synonym": []
+    "synonym": ["501100001596", "501100001597"]
  },
  {
    "id": "501100002736",
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -587,7 +587,15 @@ case object Crossref2Oaf {
                "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
                "10.13039/501100013589" | "10.13039/501100000271" =>
              generateSimpleRelationFromAward(funder, "ukri________", a => a)
-
+            //HFRI
+            case "10.13039/501100013209" =>
+              generateSimpleRelationFromAward(funder, "hfri________", a => a)
+              val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
+              queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+              queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+            //ERASMUS+
+            case "10.13039/501100010790" =>
+              generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
            case _ => logger.debug("no match for " + funder.DOI.get)

          }
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java
@ -53,6 +53,8 @@ public class Constraints implements Serializable {

 		for (Constraint sc : constraint) {
 			boolean verified = false;
+			if(!param.containsKey(sc.getField()))
+				return false;
 			for (String value : param.get(sc.getField())) {
 				if (sc.verifyCriteria(value.trim())) {
 					verified = true;
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -317,7 +317,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 					listKeyValues(
 						createOpenaireId(10, rs.getString("collectedfromid"), true),
 						rs.getString("collectedfromname")));
-			p.setPid(new ArrayList<>());
+			p.setPid(prepareListOfStructProps(rs.getArray("pid"), info));
 			p.setDateofcollection(asString(rs.getDate("dateofcollection")));
 			p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
 			p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@ -238,11 +238,23 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 				(Element) doc
 					.selectSingleNode(
 						"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
-			.map(element -> {
-				final String resourceTypeURI = element.attributeValue("uri");
-				final String resourceTypeAnyURI = element.attributeValue("anyURI");
-				final String resourceTypeTxt = element.getText();
-				final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
+			.map(e -> {
+				final String resourceTypeURI = Optional
+					.ofNullable(e.attributeValue("uri"))
+					.filter(StringUtils::isNotBlank)
+					.orElse(null);
+				final String resourceTypeAnyURI = Optional
+					.ofNullable(e.attributeValue("anyURI"))
+					.filter(StringUtils::isNotBlank)
+					.orElse(null);
+				final String resourceTypeTxt = Optional
+					.ofNullable(e.getText())
+					.filter(StringUtils::isNotBlank)
+					.orElse(null);
+				final String resourceTypeGeneral = Optional
+					.ofNullable(e.attributeValue("resourceTypeGeneral"))
+					.filter(StringUtils::isNotBlank)
+					.orElse(null);

 				return ObjectUtils
 					.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjects.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjects.sql
@ -33,7 +33,7 @@ SELECT
                dc.officialname                                                                                            AS collectedfromname,
                p.contracttype || '@@@' || p.contracttypescheme                                                            AS contracttype,
                p.provenanceactionclass || '@@@' || p.provenanceactionscheme                                             AS provenanceaction,
-                array_agg(DISTINCT i.pid || '###' || i.issuertype)                                                                  AS pid,
+                array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL)            AS pid,
                array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme)          AS subjects,
                array_agg(DISTINCT fp.path)                                                                                         AS fundingtree

--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjects_production.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjects_production.sql
@ -33,7 +33,7 @@ SELECT
                dc.officialname                                                                                            AS collectedfromname,
                p.contracttypeclass || '@@@' || p.contracttypescheme                                                       AS contracttype,
                p.provenanceactionclass || '@@@' || p.provenanceactionscheme                                               AS provenanceaction,
-                array_agg(DISTINCT i.pid || '###' || i.issuertype)                                                         AS pid,
+                array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL)            AS pid,
                array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
                array_agg(DISTINCT fp.path)                                                                                AS fundingtree
        FROM projects p
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala
@ -93,8 +93,8 @@ object CopyHdfsOafSparkApplication {
      hasSource != null && hasTarget != null
    } else {
      val hasId = (json \ "id").extractOrElse[String](null)
-      val resultType = (json \ "resulttype" \ "classid").extractOrElse[String](null)
-      hasId != null && oafType.equalsIgnoreCase(resultType)
+      val resultType = (json \ "resulttype" \ "classid").extractOrElse[String]("")
+      hasId != null && oafType.startsWith(resultType)
    }

  }
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplicationTest.java
@ -59,7 +59,19 @@ public class CopyHdfsOafSparkApplicationTest {
 								.getResourceAsStream(
 									"/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json")),
 					"publication"));
+	}

+	@Test
+	void isOafType_Datacite_ORP() throws IOException {
+		assertTrue(
+				CopyHdfsOafSparkApplication
+						.isOafType(
+								IOUtils
+										.toString(
+												getClass()
+														.getResourceAsStream(
+																"/eu/dnetlib/dhp/oa/graph/raw/datacite_orp.json")),
+								"otherresearchproduct"));
 	}

 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@ -1171,6 +1171,34 @@ class MappersTest {

 	}

+	@Test
+	void test_Zenodo2() throws IOException {
+		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo2.xml")));
+		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
+
+		assertEquals(3, list.size());
+		Publication p = cleanup((Publication) list.get(0), vocs);
+
+		assertNotNull(p.getInstance());
+		assertEquals(1, p.getInstance().size());
+
+		final Instance instance = p.getInstance().get(0);
+
+		assertNotNull(instance.getInstanceTypeMapping());
+		assertEquals(1, instance.getInstanceTypeMapping().size());
+
+		Optional<InstanceTypeMapping> coarType = instance
+			.getInstanceTypeMapping()
+			.stream()
+			.filter(itm -> ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
+			.findFirst();
+
+		assertTrue(coarType.isPresent());
+		assertNotNull(coarType.get().getOriginalType());
+		assertNull(coarType.get().getTypeCode());
+		assertNull(coarType.get().getTypeLabel());
+	}
+
 	@Test
 	void testROHub2() throws IOException {
 		final String xml = IOUtils
@ -1229,7 +1257,7 @@ class MappersTest {
 	}

 	@Test
-	public void testD4ScienceTraining() throws IOException {
+	void testD4ScienceTraining() throws IOException {
 		final String xml = IOUtils
 			.toString(Objects.requireNonNull(getClass().getResourceAsStream("d4science-1-training.xml")));
 		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1240,7 +1268,7 @@ class MappersTest {
 	}

 	@Test
-	public void testD4ScienceDataset() throws IOException {
+	void testD4ScienceDataset() throws IOException {
 		final String xml = IOUtils
 			.toString(Objects.requireNonNull(getClass().getResourceAsStream("d4science-2-dataset.xml")));
 		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datacite_orp.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datacite_orp.json
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo2.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo2.xml
@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<record xmlns:oaf="http://namespace.openaire.eu/oaf"
+        xmlns:oai="http://www.openarchives.org/OAI/2.0/"
+        xmlns:datacite="http://datacite.org/schema/kernel-3"
+        xmlns:dr="http://www.driver-repository.eu/namespace/dr"
+        xmlns:dri="http://www.driver-repository.eu/namespace/dri">
+    <header xmlns="http://www.openarchives.org/OAI/2.0/">
+        <identifier>oai:zenodo.org:1596086</identifier>
+        <datestamp>2020-01-20T13:50:28Z</datestamp>
+        <setSpec>openaire</setSpec>
+        <dr:dateOfTransformation>2024-02-08T11:03:10.994Z</dr:dateOfTransformation>
+        <dri:objIdentifier>od______2659::036d5555a6688ed00c8d0da97bdece3b</dri:objIdentifier>
+        <dri:dateOfCollection>2024-02-08T11:03:10.994Z</dri:dateOfCollection>
+        <dri:dateOfTransformation>2024-02-08T11:03:10.994Z</dri:dateOfTransformation>
+    </header>
+    <metadata>
+        <resource xmlns="http://datacite.org/schema/kernel-4"
+                  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                  xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
+            <identifier identifierType="URL">https://zenodo.org/record/1596086</identifier>
+            <alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3"/>
+            <creators>
+                <creator>
+                    <creatorName>Bonney, T. G.</creatorName>
+                    <givenName>T. G.</givenName>
+                    <familyName>Bonney</familyName>
+                </creator>
+            </creators>
+            <titles>
+                <title>Ice Blocks on a Moraine</title>
+            </titles>
+            <publisher>Zenodo</publisher>
+            <publicationYear>1889</publicationYear>
+            <dates>
+                <date dateType="Issued">1889-08-22</date>
+            </dates>
+            <resourceType resourceTypeGeneral="JournalArticle"/>
+            <relatedIdentifiers>
+                <relatedIdentifier relatedIdentifierType="DOI" relationType="IsIdenticalTo"
+                >10.1038/040391a0</relatedIdentifier>
+            </relatedIdentifiers>
+            <rightsList>
+                <rights rightsURI="https://creativecommons.org/publicdomain/zero/1.0/legalcode"
+                >Creative Commons Zero v1.0 Universal</rights>
+                <rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
+            </rightsList>
+            <descriptions>
+                <description descriptionType="Abstract">n/a</description>
+            </descriptions>
+        </resource>
+        <dr:CobjCategory type="publication">0001</dr:CobjCategory>
+        <oaf:dateAccepted>1889-08-22</oaf:dateAccepted>
+        <oaf:accessrights>OPEN</oaf:accessrights>
+        <oaf:license>http://creativecommons.org/publicdomain/zero/1.0/legalcode</oaf:license>
+        <oaf:language/>
+        <oaf:hostedBy name="ZENODO" id="opendoar____::2659"/>
+        <oaf:collectedFrom name="ZENODO" id="opendoar____::2659"/>
+    </metadata>
+</record>
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -185,6 +185,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -212,6 +213,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -239,6 +241,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -266,6 +269,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -293,6 +297,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -320,6 +325,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -347,6 +353,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -386,6 +393,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -414,6 +422,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -442,6 +451,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -470,6 +480,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -498,6 +509,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -526,6 +538,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -554,6 +567,7 @@
                --executor-cores=${sparkExecutorCoresForJoining}
                --executor-memory=${sparkExecutorMemoryForJoining}
                --driver-memory=${sparkDriverMemoryForJoining}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--- a/dhp-workflows/dhp-stats-hist-snaps/pom.xml
+++ b/dhp-workflows/dhp-stats-hist-snaps/pom.xml
@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.2.5-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>dhp-stats-hist-snaps</artifactId>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+    </dependencies>
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>pl.project13.maven</groupId>
+				<artifactId>git-commit-id-plugin</artifactId>
+                <version>2.1.11</version>
+				<configuration>
+					<failOnNoGitDirectory>false</failOnNoGitDirectory>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
+</project>
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/config-default.xml
@ -0,0 +1,30 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>${jobTracker}</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>${nameNode}</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>hive_jdbc_url</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
+    </property>
+	<property>
+		<name>oozie.wf.workflow.notification.url</name>
+		<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
+	</property>
+</configuration>
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
@ -0,0 +1,223 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+export HADOOP_USER_NAME=$2
+
+
+# Set the active HDFS node of OCEAN and IMPALA cluster.
+OCEAN_HDFS_NODE='hdfs://nameservice1'
+echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
+
+# Probe the two candidate Impala HDFS endpoints (mn1 first, then mn2) by checking that "/tmp"
+# exists on each of them; the first endpoint that answers is used.
+# Up to 3 rounds are attempted, sleeping 1 second after a round in which neither answered.
+IMPALA_HDFS_NODE=''
+COUNTER=0
+while [ $COUNTER -lt 3 ]; do
+  if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
+      break
+  elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
+      break
+  else
+      IMPALA_HDFS_NODE=''
+      sleep 1
+  fi
+  # Only reached after a failed round: the successful branches leave the loop via "break".
+  ((COUNTER++))
+done
+# Abort the whole script when no endpoint could be selected.
+if [ -z "$IMPALA_HDFS_NODE" ]; then
+    echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
+    exit 1
+fi
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
+
+IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
+IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
+
+IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
+
+
+# Set sed arguments.
+LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
+
+# Set the SED command arguments for column-names with reserved words:
+DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
+DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'  # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
+DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
+
+HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
+HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
+HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
+
+LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
+LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
+LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+
+
+function copydb() {
+  db=$1
+  echo -e "\nStart processing db: '${db}'..\n"
+
+  # Delete the old DB from Impala cluster (if exists).
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
+  log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+  if [ -n "$log_errors" ]; then
+    echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
+    rm -f error.log
+    return 1
+  fi
+
+  # Make Impala aware of the deletion of the old DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+
+  echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
+  # Using max-bandwidth of: 70 maps * 150 MB/s per map = 10.5 GB/s
+  # Using max memory of: 70 maps * 6144 MB = 430080 MB (about 420 GiB)
+  # Using 1MB as a buffer-size.
+  # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+  # The "ug" args cannot be used as we get a "User does not belong to hive" error.
+  # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
+  hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
+                -numListstatusThreads 40 \
+                -copybuffersize 1048576 \
+                -strategy dynamic \
+                -pb \
+                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
+
+  # Check the exit status of the "hadoop distcp" command.
+  if [ $? -eq 0 ]; then
+    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  else
+    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
+    rm -f error.log
+    return 2
+  fi
+
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
+  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+
+  echo -e "\nCreating schema for db: '${db}'\n"
+
+  # create the new database (with the same name)
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
+
+  # Make Impala aware of the creation of the new DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+  # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
+  # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
+
+  all_create_view_statements=()
+
+  entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
+  for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
+    create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
+
+    create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+    if [ -n "$create_view_statement_test" ]; then
+      echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
+      create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
+        | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
+        | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
+        | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+      all_create_view_statements+=("$create_view_statement")
+    else
+      echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+      CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' |  head -1`
+      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
+          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+      else
+        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
+        log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+        if [ -n "$log_errors" ]; then
+          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+        fi
+      fi
+    fi
+  done
+
+  echo -e "\nAll tables have been created, going to create the views..\n"
+
+  # Time to loop through the views and create them.
+  # At this point all table-schemas should have been created.
+
+  previous_num_of_views_to_retry=${#all_create_view_statements}
+  if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
+    echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n"  # DEBUG
+    # Make Impala aware of the new tables, so it knows them when creating the views.
+    sleep 1
+    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+    sleep 1
+  else
+    echo -e "\nDB '${db}' does not contain any views.\n"
+  fi
+
+  level_counter=0
+  while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    ((level_counter++))
+    # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
+    # In this case, we should retry creating this particular view again.
+    should_retry_create_view_statements=()
+
+    for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
+      specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
+      if [ -n "$specific_errors" ]; then
+        echo -e "\nspecific_errors: ${specific_errors}\n"
+        echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
+        should_retry_create_view_statements+=("$create_view_statement")
+      else
+          sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
+      fi
+    done
+
+    new_num_of_views_to_retry=${#should_retry_create_view_statements}
+    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
+      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
+      return 3
+    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
+      echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
+      previous_num_of_views_to_retry=$new_num_of_views_to_retry
+    else
+      echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
+    fi
+    all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+  done
+
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+
+  echo -e "\nComputing stats for tables..\n"
+  entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
+  for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    # Take the "create table" statement from the Ocean cluster, just to check whether it's a view, as its output is easier to parse than the one of impala-shell on the Impala cluster.
+    create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"`  # This grep works here, as we do not want to match multiple-lines.
+    if [ -z "$create_view_statement" ]; then  # If it's a table, then go load the data to it.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
+    fi
+  done
+
+  if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
+    echo -e "\nAll entities have been copied to Impala cluster.\n"
+  else
+    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
+    rm -f error.log
+    return 4
+  fi
+
+  rm -f error.log
+  echo -e "\n\nFinished processing db: ${db}\n\n"
+}
+
+
+MONITOR_DB=$1
+#HADOOP_USER_NAME=$2
+copydb $MONITOR_DB
+
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/finalizeImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/finalizeImpalaCluster.sh
@ -0,0 +1,41 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+# Positional arguments.
+SOURCE="$1"               # DB whose tables are exposed through the views created below
+PRODUCTION="$2"           # DB that receives one view per table of SOURCE
+SHADOW="$3"               # not read anywhere in this script
+MONITOR_PROD="$4"         # monitor DB that gets the historical_snapshots* views
+MONITOR_IRISH_PROD="$5"   # irish monitor DB that gets the historical_snapshots_irish* views
+
+
+echo ${SOURCE}
+echo ${PRODUCTION}
+
+#echo "Updating ${PRODUCTION} monitor database old cluster"
+#impala-shell -q "create database if not exists ${PRODUCTION}"
+#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
+#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+
+echo "Updating ${PRODUCTION} historical snapshots database"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
+# Turn the list of entities of the production DB into "drop view if exists" statements and execute them.
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited \
+  | sed -e "s/^/drop view if exists ${PRODUCTION}./" -e "s/$/;/" \
+  | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+# Turn the list of entities of the source DB into "create view" statements, one production view per source entity.
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited \
+  | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" \
+  | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+echo "Production monitor db ready!"
+
+impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots_fos"
+
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots as select * from ${SOURCE}.historical_snapshots"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots_fos as select * from ${SOURCE}.historical_snapshots_fos"
+
+impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos"
+
+
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish as select * from ${SOURCE}.historical_snapshots_irish"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos as select * from ${SOURCE}.historical_snapshots_irish"
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/hist_snaps.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/hist_snaps.sh
@ -0,0 +1,27 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+# Positional arguments, exported to the environment of the child processes.
+export SOURCE="$1"        # value that replaces the SOURCE placeholder of the SQL script
+export TARGET="$2"        # value that replaces the TARGET placeholder of the SQL script
+export SHADOW="$3"        # exported, but not read anywhere in this script
+export SCRIPT_PATH="$4"   # HDFS path of the SQL script to download
+
+
+# Options for the hive client: job queue, Spark client timeouts and Spark executor/driver memory.
+export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
+export HADOOP_USER_NAME="oozie"
+
+echo "Getting file from " $4
+hdfs dfs -copyToLocal $4
+
+#update Monitor DB IRISH
+#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
+cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
+hive $HIVE_OPTS -f foo
+
+echo "Hive shell finished"
+
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsAll.sql
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsAll.sql
@ -0,0 +1,82 @@
+-- Step 1: carry over all the FOS snapshots of the previous hist database
+INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
+SELECT *
+FROM ${hist_db_name_prev}.historical_snapshots_fos;
+
+-- Step 2: append the current snapshot, i.e. the number of distinct results per combination of the attributes below
+INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
+SELECT
+    CAST(${hist_date} AS STRING),
+    COUNT(DISTINCT r.id),
+    r.type,
+    rf.lvl1,
+    rf.lvl2,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    COALESCE(gl.green_with_license, 0),
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed
+FROM ${stats_db_name}.result r
+    LEFT OUTER JOIN ${stats_db_name}.result_fos rf                  ON rf.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.indi_pub_publicly_funded pf    ON pf.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.indi_pub_green_with_license gl ON gl.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.indi_pub_bronze_oa b           ON b.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.indi_pub_diamond d             ON d.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.indi_pub_in_transformative t   ON t.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.indi_pub_hybrid h              ON h.id = r.id
+    LEFT OUTER JOIN ${stats_db_name}.result_refereed pr             ON pr.id = r.id
+GROUP BY
+    r.type,
+    rf.lvl1,
+    rf.lvl2,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    gl.green_with_license,
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed;
+
+-- Step 3: materialise the accumulated snapshots as a Parquet table in the hist database
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_fos PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_fos STORED AS PARQUET AS
+SELECT *
+FROM ${hist_db_name}.historical_snapshots_fos_tmp;
+
+-- Step 4: publish a copy of the table in the monitor database
+DROP TABLE IF EXISTS ${monitor_db_name}.historical_snapshots_fos PURGE;
+
+CREATE TABLE ${monitor_db_name}.historical_snapshots_fos STORED AS PARQUET AS
+SELECT *
+FROM ${hist_db_name}.historical_snapshots_fos;
+
+-- Step 5: the staging table is not needed anymore
+DROP TABLE ${hist_db_name}.historical_snapshots_fos_tmp PURGE;
+
+-- Step 1: carry over all the snapshots of the previous hist database
+-- (an INSERT INTO ... SELECT takes no AS keyword between the target table and the query)
+INSERT INTO ${hist_db_name}.historical_snapshots_tmp
+SELECT * FROM ${hist_db_name_prev}.historical_snapshots;
+
+-- Step 2: append the current snapshot, i.e. the number of distinct results per combination of the attributes below
+INSERT INTO ${hist_db_name}.historical_snapshots_tmp
+select
+    cast(${hist_date} as STRING),
+    count(distinct r.id),
+    r.type,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    coalesce(gl.green_with_license,0),
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed
+from ${stats_db_name}.result r
+         left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
+         left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id
+         left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id
+         left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id
+         left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id
+         left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id
+         left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id
+group by r.type, pf.publicly_funded, r.access_mode, r.gold, r.green, gl.green_with_license, h.is_hybrid, b.is_bronze_oa, d.in_diamond_journal, t.is_transformative, pr.refereed;
+
+-- Step 3: materialise the accumulated snapshots as a Parquet table in the hist database
+drop table if exists ${hist_db_name}.historical_snapshots purge;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots STORED AS PARQUET AS
+SELECT * FROM ${hist_db_name}.historical_snapshots_tmp;
+
+-- Step 4: publish a copy of the table in the monitor database
+drop table if exists ${monitor_db_name}.historical_snapshots purge;
+
+create table ${monitor_db_name}.historical_snapshots stored as parquet
+as select * from ${hist_db_name}.historical_snapshots;
+
+-- Step 5: the staging table is not needed anymore
+drop table ${hist_db_name}.historical_snapshots_tmp purge;
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsIrish.sql
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/BuildHistSnapsIrish.sql
@ -0,0 +1,91 @@
+-- Step 1: carry over all the irish FOS snapshots of the previous hist database
+INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
+SELECT *
+FROM ${hist_db_name_prev}.historical_snapshots_irish_fos;
+
+-- Step 2: append the current snapshot, i.e. the number of distinct results per combination of the attributes below
+INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
+SELECT
+    CAST(${hist_date} AS STRING),
+    COUNT(DISTINCT r.id),
+    r.type,
+    rf.lvl1,
+    rf.lvl2,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    COALESCE(gl.green_with_license, 0),
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed
+FROM ${stats_irish_db_name}.result r
+    LEFT OUTER JOIN ${stats_irish_db_name}.result_fos rf                  ON rf.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_publicly_funded pf    ON pf.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_green_with_license gl ON gl.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_bronze_oa b           ON b.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_diamond d             ON d.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_in_transformative t   ON t.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_hybrid h              ON h.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.result_refereed pr             ON pr.id = r.id
+GROUP BY
+    r.type,
+    rf.lvl1,
+    rf.lvl2,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    gl.green_with_license,
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed;
+
+-- Step 3: materialise the accumulated snapshots as a Parquet table in the hist database
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_irish_fos PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_irish_fos STORED AS PARQUET AS
+SELECT *
+FROM ${hist_db_name}.historical_snapshots_fos_irish_tmp;
+
+-- Step 4: publish a copy of the table in the irish monitor database
+DROP TABLE IF EXISTS ${monitor_irish_db_name}.historical_snapshots_irish_fos PURGE;
+
+CREATE TABLE ${monitor_irish_db_name}.historical_snapshots_irish_fos STORED AS PARQUET AS
+SELECT *
+FROM ${hist_db_name}.historical_snapshots_irish_fos;
+
+-- Step 5: the staging table is not needed anymore
+DROP TABLE ${hist_db_name}.historical_snapshots_fos_irish_tmp PURGE;
+
+-- Step 1: carry over all the irish snapshots of the previous hist database
+INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
+SELECT *
+FROM ${hist_db_name_prev}.historical_snapshots_irish;
+
+-- Step 2: append the current snapshot, i.e. the number of distinct results per combination of the attributes below
+INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
+SELECT
+    CAST(${hist_date} AS STRING),
+    COUNT(DISTINCT r.id),
+    r.type,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    COALESCE(gl.green_with_license, 0),
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed
+FROM ${stats_irish_db_name}.result r
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_publicly_funded pf    ON pf.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_green_with_license gl ON gl.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_bronze_oa b           ON b.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_diamond d             ON d.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_in_transformative t   ON t.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.indi_pub_hybrid h              ON h.id = r.id
+    LEFT OUTER JOIN ${stats_irish_db_name}.result_refereed pr             ON pr.id = r.id
+GROUP BY
+    r.type,
+    pf.publicly_funded,
+    r.access_mode,
+    r.gold,
+    r.green,
+    gl.green_with_license,
+    h.is_hybrid,
+    b.is_bronze_oa,
+    d.in_diamond_journal,
+    t.is_transformative,
+    pr.refereed;
+
+
+-- Step 3: materialise the accumulated snapshots as a Parquet table in the hist database
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_irish PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_irish STORED AS PARQUET AS
+SELECT *
+FROM ${hist_db_name}.historical_snapshots_irish_tmp;
+
+-- Step 4: publish a copy of the table in the irish monitor database
+DROP TABLE IF EXISTS ${monitor_irish_db_name}.historical_snapshots_irish PURGE;
+
+CREATE TABLE ${monitor_irish_db_name}.historical_snapshots_irish STORED AS PARQUET AS
+SELECT *
+FROM ${hist_db_name}.historical_snapshots_irish;
+
+-- Step 5: the staging table is not needed anymore
+DROP TABLE ${hist_db_name}.historical_snapshots_irish_tmp PURGE;
+
+
+-- NOTE(review): these statements repeat the publication of the irish FOS table already performed
+-- earlier in this script, they look like a copy-paste leftover and are a candidate for removal
+drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
+
+create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
+as select * from ${hist_db_name}.historical_snapshots_irish_fos;
+
+-- The FOS staging table has already been dropped earlier in this script, hence the IF EXISTS guard:
+-- the statement no longer relies on the server silently ignoring the drop of a missing table
+drop table if exists ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/CreateDB.sql
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/scripts/CreateDB.sql
@ -0,0 +1,92 @@
+--------------------------------------------------------------
+--------------------------------------------------------------
+-- Historical Snapshots database creation
+-- The database is dropped together with all its content and recreated empty
+--------------------------------------------------------------
+--------------------------------------------------------------
+
+DROP DATABASE IF EXISTS ${hist_db_name} CASCADE;
+CREATE DATABASE ${hist_db_name};
+
+-- Staging table for the FOS snapshots: bucketed, transactional ORC.
+-- It is filled and finally dropped by BuildHistSnapsAll.sql
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_fos_tmp PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_fos_tmp
+(
+    hist_date          STRING,
+    total              INT,
+    type               STRING,
+    lvl1               STRING,
+    lvl2               STRING,
+    publicly_funded    INT,
+    accessrights       STRING,
+    gold               INT,
+    green              INT,
+    green_with_license INT,
+    hybrid             INT,
+    bronze             INT,
+    diamond            INT,
+    transformative     INT,
+    peer_reviewed      STRING
+)
+CLUSTERED BY (hist_date) INTO 100 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ('transactional' = 'true');
+
+-- Staging table for the irish FOS snapshots: bucketed, transactional ORC.
+-- It is filled and finally dropped by BuildHistSnapsIrish.sql
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_fos_irish_tmp PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_fos_irish_tmp
+(
+    hist_date          STRING,
+    total              INT,
+    type               STRING,
+    lvl1               STRING,
+    lvl2               STRING,
+    publicly_funded    INT,
+    accessrights       STRING,
+    gold               INT,
+    green              INT,
+    green_with_license INT,
+    hybrid             INT,
+    bronze             INT,
+    diamond            INT,
+    transformative     INT,
+    peer_reviewed      STRING
+)
+CLUSTERED BY (hist_date) INTO 100 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ('transactional' = 'true');
+
+-- Staging table for the snapshots without the FOS levels: bucketed, transactional ORC.
+-- It is filled and finally dropped by BuildHistSnapsAll.sql
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_tmp PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_tmp
+(
+    hist_date          STRING,
+    total              INT,
+    type               STRING,
+    publicly_funded    INT,
+    accessrights       STRING,
+    gold               INT,
+    green              INT,
+    green_with_license INT,
+    hybrid             INT,
+    bronze             INT,
+    diamond            INT,
+    transformative     INT,
+    peer_reviewed      STRING
+)
+CLUSTERED BY (hist_date) INTO 100 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ('transactional' = 'true');
+
+-- Staging table for the irish snapshots without the FOS levels: bucketed, transactional ORC.
+-- It is filled and finally dropped by BuildHistSnapsIrish.sql
+DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_irish_tmp PURGE;
+
+CREATE TABLE ${hist_db_name}.historical_snapshots_irish_tmp
+(
+    hist_date          STRING,
+    total              INT,
+    type               STRING,
+    publicly_funded    INT,
+    accessrights       STRING,
+    gold               INT,
+    green              INT,
+    green_with_license INT,
+    hybrid             INT,
+    bronze             INT,
+    diamond            INT,
+    transformative     INT,
+    peer_reviewed      STRING
+)
+CLUSTERED BY (hist_date) INTO 100 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ('transactional' = 'true');
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/workflow.xml
@ -0,0 +1,159 @@
+<workflow-app name="Stats Hist Snapshots" xmlns="uri:oozie:workflow:0.5">
+    <!-- Parameters the workflow must be submitted with (none of them has a default value). -->
+    <parameters>
+        <property>
+            <name>hist_db_name</name>
+            <description>the target hist database name</description>
+        </property>
+        <property>
+            <name>hist_db_name_prev</name>
+            <description>the hist database name of previous_month</description>
+        </property>
+
+        <property>
+            <name>stats_db_name</name>
+            <description>the stats db name</description>
+        </property>
+        <property>
+            <name>stats_irish_db_name</name>
+            <description>the stats irish db name</description>
+        </property>
+        <property>
+            <name>monitor_db_name</name>
+            <description>the monitor db name</description>
+        </property>
+        <property>
+            <name>monitor_irish_db_name</name>
+            <description>the irish monitor db name</description>
+        </property>
+        <property>
+            <name>hist_db_prod_name</name>
+            <description>the production db</description>
+        </property>
+        <property>
+            <name>hist_db_shadow_name</name>
+            <description>the production shadow db</description>
+        </property>
+        <!-- The next two are referenced by the Step3-finalizeImpalaCluster action: declaring them here
+             makes a missing value fail at submission time instead of when that action starts. -->
+        <property>
+            <name>monitor_db_prod_name</name>
+            <description>the production monitor db, passed to finalizeImpalaCluster.sh</description>
+        </property>
+        <property>
+            <name>monitor_irish_db_prod_name</name>
+            <description>the production irish monitor db, passed to finalizeImpalaCluster.sh</description>
+        </property>
+        <property>
+            <name>hist_date</name>
+            <description>the snaps date</description>
+        </property>
+        <property>
+            <name>hive_metastore_uris</name>
+            <description>hive server metastore URIs</description>
+        </property>
+        <property>
+            <name>hive_jdbc_url</name>
+            <description>hive server jdbc url</description>
+        </property>
+        <property>
+            <name>hive_timeout</name>
+            <description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
+        </property>
+        <property>
+            <name>hadoop_user_name</name>
+            <description>user name of the wf owner</description>
+        </property>
+    </parameters>
+
+    <!-- Settings shared by the actions of this workflow: cluster endpoints, Hive metastore,
+         Hive transaction timeout and the job queue. -->
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>hive.metastore.uris</name>
+                <value>${hive_metastore_uris}</value>
+            </property>
+            <property>
+                <name>hive.txn.timeout</name>
+                <value>${hive_timeout}</value>
+            </property>
+            <property>
+                <name>mapred.job.queue.name</name>
+                <value>analytics</value>
+            </property>
+        </configuration>
+    </global>
+
+    <!-- Entry point: go straight to the decision node that selects the first action to run. -->
+    <start to="resume_from"/>
+    <!-- Jumps to the action named by the 'resumeFrom' configuration value. When the value matches none
+         of the cases the run starts from BuildHistSnaps, i.e. CreateDB runs only when explicitly requested. -->
+    <decision name="resume_from">
+        <switch>
+            <case to="CreateDB">${wf:conf('resumeFrom') eq 'CreateDB'}</case>
+            <case to="BuildHistSnaps">${wf:conf('resumeFrom') eq 'BuildHistSnaps'}</case>
+            <case to="BuildHistSnapsIrish">${wf:conf('resumeFrom') eq 'BuildHistSnapsIrish'}</case>
+            <case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
+            <case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
+            <default to="BuildHistSnaps"/>
+        </switch>
+    </decision>
+
+    <!-- Failure end state: the error transition of every action of this workflow leads here. -->
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <!-- Runs scripts/CreateDB.sql, which drops and recreates the hist database and its staging tables. -->
+    <action name="CreateDB">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
+            <jdbc-url>${hive_jdbc_url}</jdbc-url>
+            <script>scripts/CreateDB.sql</script>
+            <param>hist_db_name=${hist_db_name}</param>
+        </hive2>
+        <ok to="BuildHistSnaps"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Runs scripts/BuildHistSnapsAll.sql: builds the historical_snapshots and historical_snapshots_fos
+         tables in the hist database and copies them to the monitor database. -->
+    <action name="BuildHistSnaps">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
+            <jdbc-url>${hive_jdbc_url}</jdbc-url>
+            <script>scripts/BuildHistSnapsAll.sql</script>
+            <param>hist_db_name=${hist_db_name}</param>
+            <param>hist_db_name_prev=${hist_db_name_prev}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>monitor_db_name=${monitor_db_name}</param>
+            <param>hist_date=${hist_date}</param>
+        </hive2>
+        <ok to="BuildHistSnapsIrish"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Runs scripts/BuildHistSnapsIrish.sql: same as BuildHistSnaps, but reading the irish stats database
+         and publishing to the irish monitor database. -->
+    <action name="BuildHistSnapsIrish">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
+            <jdbc-url>${hive_jdbc_url}</jdbc-url>
+            <script>scripts/BuildHistSnapsIrish.sql</script>
+            <param>hist_db_name=${hist_db_name}</param>
+            <param>hist_db_name_prev=${hist_db_name_prev}</param>
+            <param>stats_irish_db_name=${stats_irish_db_name}</param>
+            <param>monitor_irish_db_name=${monitor_irish_db_name}</param>
+            <param>hist_date=${hist_date}</param>
+        </hive2>
+        <ok to="Step2-copyDataToImpalaCluster"/>
+        <error to="Kill"/>
+    </action>
+    <!-- Runs copyDataToImpalaCluster.sh with two arguments: the hist database name and the hadoop user name. -->
+    <action name="Step2-copyDataToImpalaCluster">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>copyDataToImpalaCluster.sh</exec>
+            <argument>${hist_db_name}</argument>
+            <argument>${hadoop_user_name}</argument>
+            <file>copyDataToImpalaCluster.sh</file>
+        </shell>
+        <ok to="Step3-finalizeImpalaCluster"/>
+        <error to="Kill"/>
+    </action>
+    <!-- Runs finalizeImpalaCluster.sh. Argument order (it must match the positional parameters read by the script):
+         source hist db, production db, shadow db, production monitor db, production irish monitor db. -->
+    <action name="Step3-finalizeImpalaCluster">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>finalizeImpalaCluster.sh</exec>
+            <argument>${hist_db_name}</argument>
+            <argument>${hist_db_prod_name}</argument>
+            <argument>${hist_db_shadow_name}</argument>
+            <argument>${monitor_db_prod_name}</argument>
+            <argument>${monitor_irish_db_prod_name}</argument>
+            <file>finalizeImpalaCluster.sh</file>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-stats-monitor-irish/pom.xml
+++ b/dhp-workflows/dhp-stats-monitor-irish/pom.xml
@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <artifactId>dhp-workflows</artifactId>
+        <version>1.2.5-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>dhp-stats-monitor-irish</artifactId>
+
+    <!-- No versions here: they are expected to come from the dependency management of the parent. -->
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>pl.project13.maven</groupId>
+                <artifactId>git-commit-id-plugin</artifactId>
+                <version>2.1.11</version>
+                <configuration>
+                    <!-- Do not break the build when the sources are not inside a git checkout. -->
+                    <failOnNoGitDirectory>false</failOnNoGitDirectory>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/config-default.xml
@ -0,0 +1,30 @@
+<!-- Default values for the properties of the stats-monitor-irish workflow. -->
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>${jobTracker}</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>${nameNode}</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <!-- The JDBC url also carries the Spark executor/driver memory settings of the Hive session. -->
+    <property>
+        <name>hive_jdbc_url</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
+    </property>
+    <!-- NOTE(review): {serviceUrl} has no leading dollar sign, presumably it is a placeholder filled in
+         before the submission - left untouched, to be confirmed. -->
+    <property>
+        <name>oozie.wf.workflow.notification.url</name>
+        <value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
@ -0,0 +1,222 @@
+# Per-user Python egg cache (presumably used by impala-shell, judging by the link name below).
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+# If the per-user path under /tmp is not already a symbolic link, remove whatever is there and create the link.
+# The link target is the egg-cache path with the link path appended to it.
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+# The second positional argument is exported as HADOOP_USER_NAME; it is also passed to impala-shell via "--user" further below.
+export HADOOP_USER_NAME=$2
+
+# Set the active HDFS node of OCEAN and IMPALA cluster.
+OCEAN_HDFS_NODE='hdfs://nameservice1'
+echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
+
+# Probe the two Impala-cluster name-nodes (mn1 first, then mn2) by testing for "/tmp" on each of them.
+# The first node that answers is used; when neither answers, wait one second and try again, up to 3 attempts.
+IMPALA_HDFS_NODE=''
+COUNTER=0
+while [ $COUNTER -lt 3 ]; do
+  if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
+      break
+  elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
+      break
+  else
+      IMPALA_HDFS_NODE=''
+      sleep 1
+  fi
+  ((COUNTER++))  # Only failed attempts are counted, as a successful probe leaves the loop through "break".
+done
+# Without a reachable name-node nothing can be copied, so stop here.
+if [ -z "$IMPALA_HDFS_NODE" ]; then
+    echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
+    exit 1
+fi
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
+
+IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
+IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
+
+IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
+
+
+# Set sed arguments.
+LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
+
+# Set the SED command arguments for column-names with reserved words:
+DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
+DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'  # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
+DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
+
+HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
+HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
+HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
+
+LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
+LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
+LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+
+
+# Copies the Hive database given as $1 from the Ocean cluster to the Impala cluster:
+# drops the old copy on Impala, transfers the files with "hadoop distcp", re-creates the tables (from their parquet files)
+# and the views (from their Hive "create view" statements), and finally computes the table stats.
+# Returns: 1 if dropping the old DB failed, 2 if distcp failed, 3 if the view creation stopped making progress,
+#          4 if the entity lists of the two clusters differ, 0 otherwise.
+function copydb() {
+  db=$1
+  echo -e "\nStart processing db: '${db}'..\n"
+
+  # Delete the old DB from Impala cluster (if exists).
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
+  log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+  if [ -n "$log_errors" ]; then
+    echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
+    rm -f error.log
+    return 1
+  fi
+
+  # Make Impala aware of the deletion of the old DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+
+  echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
+  # Using up to 70 map tasks ("-m 70"), each one limited to 150 MB/s ("-bandwidth 150") and 6144 MB of memory.
+  # Using 1MB as a buffer-size.
+  # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+  # The "ug" args cannot be used as we get a "User does not belong to hive" error.
+  # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
+  hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
+                -numListstatusThreads 40 \
+                -copybuffersize 1048576 \
+                -strategy dynamic \
+                -pb \
+                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
+  distcp_exit_status=$?  # Save it right away: every later command (even the "[ ... ]" test below) overwrites "$?".
+
+  # Check the exit status of the "hadoop distcp" command.
+  if [ $distcp_exit_status -eq 0 ]; then
+    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  else
+    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
+    rm -f error.log
+    return 2
+  fi
+
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
+  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+
+  echo -e "\nCreating schema for db: '${db}'\n"
+
+  # create the new database (with the same name)
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
+
+  # Make Impala aware of the creation of the new DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+  # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
+  # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
+
+  all_create_view_statements=()
+
+  entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential "WARN" logs.
+  for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
+    # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
+    create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
+
+    create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+    if [ -n "$create_view_statement_test" ]; then
+      echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
+      # Clean the Hive statement (WARN logs, back-ticks, wrapping quotes), point its locations to the Impala HDFS node and back-tick the reserved column names.
+      create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
+        | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
+        | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
+        | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+      all_create_view_statements+=("$create_view_statement")
+    else
+      echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+      CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' |  head -1`
+      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is no parquet-file inside.
+          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+      else
+        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
+        log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+        if [ -n "$log_errors" ]; then
+          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+        fi
+      fi
+    fi
+  done
+
+  echo -e "\nAll tables have been created, going to create the views..\n"
+
+  # Time to loop through the views and create them.
+  # At this point all table-schemas should have been created.
+
+  # "${#array[@]}" is the number of elements; a plain "${#array}" would be the string-length of the first element.
+  previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+  if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
+    echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n"  # DEBUG
+    # Make Impala aware of the new tables, so it knows them when creating the views.
+    sleep 1
+    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+    sleep 1
+  else
+    echo -e "\nDB '${db}' does not contain any views.\n"
+  fi
+
+  level_counter=0
+  while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    ((level_counter++))
+    # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
+    # In this case, we should retry creating this particular view again.
+    should_retry_create_view_statements=()
+
+    for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
+      specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
+      if [ -n "$specific_errors" ]; then
+        echo -e "\nspecific_errors: ${specific_errors}\n"
+        echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
+        should_retry_create_view_statements+=("$create_view_statement")
+      else
+          sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
+      fi
+    done
+
+    new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
+    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
+      # Not a single view was created in this level, so retrying the same statements again cannot succeed.
+      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
+      rm -f error.log
+      return 3
+    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
+      echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
+      previous_num_of_views_to_retry=$new_num_of_views_to_retry
+    else
+      echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
+    fi
+    all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+  done
+
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+
+  echo -e "\nComputing stats for tables..\n"
+  entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
+  for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single-words.
+    # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
+    create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"`  # This grep works here, as we do not want to match multiple-lines.
+    if [ -z "$create_view_statement" ]; then  # If it's a table, then go compute its stats.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
+    fi
+  done
+
+  # Both listings are plain strings, so this compares the complete output of the two "show tables" commands.
+  if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
+    echo -e "\nAll entities have been copied to Impala cluster.\n"
+  else
+    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
+    rm -f error.log
+    return 4
+  fi
+
+  rm -f error.log
+  echo -e "\n\nFinished processing db: ${db}\n\n"
+}
+
+
+MONITOR_DB=$1
+#HADOOP_USER_NAME=$2
+copydb $MONITOR_DB
+
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/finalizeImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/finalizeImpalaCluster.sh
@ -0,0 +1,23 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+SOURCE=$1
+PRODUCTION=$2
+echo ${SOURCE}
+echo ${PRODUCTION}
+
+#echo "Updating ${PRODUCTION} monitor database old cluster"
+#impala-shell -q "create database if not exists ${PRODUCTION}"
+#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
+#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+
+echo "Updating ${PRODUCTION} monitor database"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+echo "Production monitor db ready!"
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/monitor_irish.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/monitor_irish.sh
@ -0,0 +1,28 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+export SOURCE=$1
+export TARGET=$2
+export SHADOW=$3
+export SCRIPT_PATH=$4
+export GRAPHDB=$5
+
+
+export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
+export HADOOP_USER_NAME="oozie"
+
+echo "Getting file from " $4
+hdfs dfs -copyToLocal $4
+
+#update Monitor DB IRISH
+#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
+cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$5/g1" > foo
+hive $HIVE_OPTS -f foo
+
+echo "Hive shell finished"
+
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/scripts/buildIrishMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/scripts/buildIrishMonitorDB.sql
@ -0,0 +1,241 @@
+drop database if exists TARGET cascade;
+create database if not exists TARGET;
+
+create view if not exists TARGET.category as select * from SOURCE.category;
+create view if not exists TARGET.concept as select * from SOURCE.concept;
+create view if not exists TARGET.context as select * from SOURCE.context;
+create view if not exists TARGET.country as select * from SOURCE.country;
+create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
+create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
+--create view if not exists TARGET.funder as select * from SOURCE.funder;
+create view if not exists TARGET.fundref as select * from SOURCE.fundref;
+create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
+create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
+create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
+create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
+create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
+create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
+
+drop table if exists TARGET.irish_funders;
+
+-- Names of the funders whose jurisdiction is IE, extracted from the first funding tree of each project of the graph db
+create TEMPORARY table TARGET.irish_funders as
+select distinct xpath_string(fundingtree[0].value, '//funder/name') as funder from GRAPHDB.project
+                     where xpath_string(fundingtree[0].value, '//funder/jurisdiction')='IE';
+--create TEMPORARY table TARGET.irish_funders as
+--select distinct name as funder from SOURCE.fundref where country='IE';
+
+drop table if exists TARGET.result;
+
+-- The monitored results are the distinct union of
+--   1) the results of projects funded by one of the funders collected in irish_funders
+--   2) the results linked to an organization with country IE
+--   3) the results having a pid that matches a doi of stats_ext.transformative_facts
+-- The irish_funders join uses the table created above, it must never point to a fixed database name
+create table TARGET.result stored as parquet as
+select distinct * from (
+       select r.*
+       from SOURCE.result r
+                join SOURCE.result_projects rp on rp.id=r.id
+                join SOURCE.project p on p.id=rp.project
+                join TARGET.irish_funders irf on irf.funder=p.funder
+       union all
+       select r.*
+       from SOURCE.result r
+                join SOURCE.result_organization ro on ro.id=r.id
+                join SOURCE.organization o on o.id=ro.organization and o.country='IE'
+       union all
+       select r.*
+       from SOURCE.result r
+                join SOURCE.result_pids pid on pid.id=r.id
+                join stats_ext.transformative_facts tf on tf.doi=pid.pid
+   ) foo;
+
+create view if not exists TARGET.category as select * from SOURCE.category;
+create view if not exists TARGET.concept as select * from SOURCE.concept;
+create view if not exists TARGET.context as select * from SOURCE.context;
+create view if not exists TARGET.country as select * from SOURCE.country;
+create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
+create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
+
+create table TARGET.funder stored as parquet as select * from SOURCE.funder where country='IE';
+
+create view if not exists TARGET.fundref as select * from SOURCE.fundref;
+create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
+create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
+create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
+create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
+create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
+create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
+--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
+
+create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
+
+create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
+create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
+create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
+drop view TARGET.foo1;
+drop view TARGET.foo2;
+
+-- datasources
+create view if not exists TARGET.datasource as select * from SOURCE.datasource;
+create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
+create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
+create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
+
+create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
+
+-- organizations
+create view if not exists TARGET.organization as select * from SOURCE.organization;
+create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
+create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
+create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
+create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
+
+-- projects
+create view if not exists TARGET.project as select * from SOURCE.project;
+create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
+create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
+create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
+create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
+create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
+
+create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
+
+
+-- indicators
+-- Sprint 1 ----
+create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+-- Sprint 2 ----
+create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+---- Sprint 3 ----
+create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
+create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
+create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
+create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
+create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
+create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
+---- Sprint 4 ----
+create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+---- Sprint 5 ----
+create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+---- Sprint 6 ----
+create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+
+create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+
+create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+
+create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+
+---- Sprint 7 ----
+create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
+create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
+create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
+create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
+create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
+create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
+create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
+create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
+create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
+create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
+create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s;
+create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
+create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
+create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
+create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
+create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
+create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
+
+create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/workflow.xml
@ -0,0 +1,118 @@
+<workflow-app name="Irish Monitor Update" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>stats_db_name</name>
+            <description>the target stats database name</description>
+        </property>
+        <property>
+            <name>graph_db_name</name>
+            <description>the graph database name</description>
+        </property>
+        <property>
+            <name>monitor_irish_db_name</name>
+            <description>the target monitor db name</description>
+        </property>
+        <property>
+            <name>monitor_irish_db_prod_name</name>
+            <description>the name of the production monitor db</description>
+        </property>
+        <property>
+            <name>monitor_irish_db_shadow_name</name>
+            <description>the name of the shadow monitor db</description>
+        </property>
+        <property>
+            <name>hive_metastore_uris</name>
+            <description>hive server metastore URIs</description>
+        </property>
+        <property>
+            <name>hive_jdbc_url</name>
+            <description>hive server jdbc url</description>
+        </property>
+        <property>
+            <name>hive_timeout</name>
+            <description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
+        </property>
+        <property>
+            <name>hadoop_user_name</name>
+            <description>user name of the wf owner</description>
+        </property>
+    </parameters>
+
+    <!-- Settings shared by every action of this workflow. -->
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <!-- Hive metastore location, taken from the workflow parameters. -->
+            <property>
+                <name>hive.metastore.uris</name>
+                <value>${hive_metastore_uris}</value>
+            </property>
+            <!-- Hive transaction timeout, taken from the workflow parameters. -->
+            <property>
+                <name>hive.txn.timeout</name>
+                <value>${hive_timeout}</value>
+            </property>
+            <!-- Jobs are submitted to the fixed "analytics" queue. -->
+            <property>
+                <name>mapred.job.queue.name</name>
+                <value>analytics</value>
+            </property>
+        </configuration>
+    </global>
+
+    <!-- Entry point: always goes through the "resume_from" decision node. -->
+    <start to="resume_from"/>
+    <!-- Selects the first step to run from the 'resumeFrom' configuration value.
+         When it matches none of the step names below, the workflow starts from Step1. -->
+    <decision name="resume_from">
+        <switch>
+            <case to="Step1-buildIrishMonitorDB">${wf:conf('resumeFrom') eq 'Step1-buildIrishMonitorDB'}</case>
+            <case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
+            <case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
+            <default to="Step1-buildIrishMonitorDB"/>
+        </switch>
+    </decision>
+
+    <!-- Terminal failure node: every action routes here on error, reporting the message of the last failed node. -->
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <!-- Step 1: runs monitor_irish.sh with, in order: stats db, monitor db, shadow db,
+         the path of scripts/buildIrishMonitorDB.sql under the workflow application path, and the graph db. -->
+    <action name="Step1-buildIrishMonitorDB">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>monitor_irish.sh</exec>
+            <argument>${stats_db_name}</argument>
+            <argument>${monitor_irish_db_name}</argument>
+            <argument>${monitor_irish_db_shadow_name}</argument>
+            <argument>${wf:appPath()}/scripts/buildIrishMonitorDB.sql</argument>
+            <argument>${graph_db_name}</argument>
+            <file>monitor_irish.sh</file>
+        </shell>
+        <ok to="Step2-copyDataToImpalaCluster"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Step 2: runs copyDataToImpalaCluster.sh with the monitor db name and the hadoop user name. -->
+    <action name="Step2-copyDataToImpalaCluster">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>copyDataToImpalaCluster.sh</exec>
+            <argument>${monitor_irish_db_name}</argument>
+            <argument>${hadoop_user_name}</argument>
+            <file>copyDataToImpalaCluster.sh</file>
+        </shell>
+        <ok to="Step3-finalizeImpalaCluster"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Step 3: runs finalizeImpalaCluster.sh with the monitor db, the production db and the shadow db names. -->
+    <action name="Step3-finalizeImpalaCluster">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>finalizeImpalaCluster.sh</exec>
+            <argument>${monitor_irish_db_name}</argument>
+            <argument>${monitor_irish_db_prod_name}</argument>
+            <argument>${monitor_irish_db_shadow_name}</argument>
+            <file>finalizeImpalaCluster.sh</file>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-stats-monitor-update/pom.xml
+++ b/dhp-workflows/dhp-stats-monitor-update/pom.xml
@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <!-- Module holding the monitor update workflow; it inherits from dhp-workflows. -->
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.2.5-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>dhp-stats-monitor-update</artifactId>
+    <!-- No versions are declared on the dependencies here; presumably they are managed by a parent POM. -->
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+    </dependencies>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>pl.project13.maven</groupId>
+                <artifactId>git-commit-id-plugin</artifactId>
+                <version>2.1.11</version>
+                <configuration>
+                    <failOnNoGitDirectory>false</failOnNoGitDirectory>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/config-default.xml
@ -0,0 +1,30 @@
+<!-- Default property values for the monitor update workflow. -->
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>${jobTracker}</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>${nameNode}</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <!-- NOTE(review): the two Hive endpoints below are hard-coded to a host named "iis-cdh5-test-m3";
+         presumably they are overridden at submission time - confirm. -->
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>hive_jdbc_url</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
+    </property>
+    <property>
+        <name>oozie.wf.workflow.notification.url</name>
+        <value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
@ -0,0 +1,223 @@
+# Python egg cache used by impala-shell: unless ${link_folder} is already a symlink,
+# whatever is at that path is removed and replaced by a symlink pointing under ${PYTHON_EGG_CACHE}.
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+# The second script argument is the user name used for the Hadoop and impala-shell commands.
+export HADOOP_USER_NAME=$2
+
+# Set the active HDFS node of OCEAN and IMPALA cluster.
+OCEAN_HDFS_NODE='hdfs://nameservice1'
+echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
+
+# Probe the two Impala name-nodes (mn1 first, then mn2) and keep the first one on which "/tmp" can be tested.
+# Up to 3 rounds are attempted, sleeping 1 second after each failed round.
+IMPALA_HDFS_NODE=''
+COUNTER=0
+while [ $COUNTER -lt 3 ]; do
+  if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
+      break
+  elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
+      break
+  else
+      IMPALA_HDFS_NODE=''
+      sleep 1
+  fi
+  ((COUNTER++))
+done
+# Abort the whole script when neither name-node answered.
+if [ -z "$IMPALA_HDFS_NODE" ]; then
+    echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
+    exit 1
+fi
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
+
+IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
+IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
+
+IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
+
+
+# Set sed arguments.
+LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
+
+# Set the SED command arguments for column-names with reserved words:
+# for each of "date", "hash" and "location" there are three expressions, matching the word
+# (1) between whitespace, (2) after a dot and before a comma, (3) after a dot and before whitespace.
+DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
+DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'  # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
+DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
+
+HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
+HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
+HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
+
+LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
+LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
+LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+
+
+# Copies one database from the Ocean cluster to the Impala cluster and recreates its schema there.
+#
+# Arguments:
+#   $1 - name of the database to copy
+# Steps: drop the old DB on Impala, distcp the "<db>.db" directory, recreate the tables from their
+# parquet files, recreate the views (retrying the ones depending on other views), compute stats.
+# Returns:
+#   0 on success
+#   1 when the old database could not be dropped
+#   2 when "hadoop distcp" failed
+#   3 when a retry round did not reduce the number of views still to be created
+#   4 when the entities found on Impala differ from the ones found on Ocean
+function copydb() {
+  db=$1
+  echo -e "\nStart processing db: '${db}'..\n"
+
+  # Delete the old DB from Impala cluster (if exists).
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
+  log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+  if [ -n "$log_errors" ]; then
+    echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
+    rm -f error.log
+    return 1
+  fi
+
+  # Make Impala aware of the deletion of the old DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+
+  echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
+  # Using up to 70 maps (-m 70), each capped at 150 MB/s (-bandwidth 150) and given 6144 MB of map memory.
+  # Using 1MB as a buffer-size.
+  # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+  # The "ug" args cannot be used as we get a "User does not belong to hive" error.
+  # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
+  hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
+                -numListstatusThreads 40 \
+                -copybuffersize 1048576 \
+                -strategy dynamic \
+                -pb \
+                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
+  # Save the exit status right away: inside the "else" branch below, "$?" would hold the status of the "[ ... ]" test instead.
+  distcp_exit_status=$?
+
+  # Check the exit status of the "hadoop distcp" command.
+  if [ $distcp_exit_status -eq 0 ]; then
+    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  else
+    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
+    rm -f error.log
+    return 2
+  fi
+
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
+  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+
+  echo -e "\nCreating schema for db: '${db}'\n"
+
+  # create the new database (with the same name)
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
+
+  # Make Impala aware of the creation of the new DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+  # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
+  # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
+
+  all_create_view_statements=()
+
+  entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
+  for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
+    # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
+    create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
+
+    create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+    if [ -n "$create_view_statement_test" ]; then
+      echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
+      create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
+        | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
+        | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
+        | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+      all_create_view_statements+=("$create_view_statement")
+    else
+      echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+      CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' |  head -1`
+      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
+          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+      else
+        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
+        log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+        if [ -n "$log_errors" ]; then
+          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+        fi
+      fi
+    fi
+  done
+
+  echo -e "\nAll tables have been created, going to create the views..\n"
+
+  # Time to loop through the views and create them.
+  # At this point all table-schemas should have been created.
+
+  # "${#array[@]}" is the number of elements; "${#array}" would be the string-length of the first element only.
+  previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+  if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
+    echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n"  # DEBUG
+    # Make Impala aware of the new tables, so it knows them when creating the views.
+    sleep 1
+    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+    sleep 1
+  else
+    echo -e "\nDB '${db}' does not contain any views.\n"
+  fi
+
+  level_counter=0
+  while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    ((level_counter++))
+    # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
+    # In this case, we should retry creating this particular view again.
+    should_retry_create_view_statements=()
+
+    for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
+      specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
+      if [ -n "$specific_errors" ]; then
+        echo -e "\nspecific_errors: ${specific_errors}\n"
+        echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
+        should_retry_create_view_statements+=("$create_view_statement")
+      else
+          sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
+      fi
+    done
+
+    # Number of views which still have to be created after this round.
+    new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
+    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
+      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
+      rm -f error.log
+      return 3
+    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
+      echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
+      previous_num_of_views_to_retry=$new_num_of_views_to_retry
+    else
+      echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
+    fi
+    all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+  done
+
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+
+  echo -e "\nComputing stats for tables..\n"
+  entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
+  for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single-words.
+    # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
+    create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"`  # This grep works here, as we do not want to match multiple-lines.
+    if [ -z "$create_view_statement" ]; then  # If it's a table, then go load the data to it.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
+    fi
+  done
+
+  # NOTE(review): this is a plain string comparison of the two listings, so it presumably also requires
+  # both tools to return the names in the same order and with the same separators - confirm.
+  if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
+    echo -e "\nAll entities have been copied to Impala cluster.\n"
+  else
+    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
+    rm -f error.log
+    return 4
+  fi
+
+  rm -f error.log
+  echo -e "\n\nFinished processing db: ${db}\n\n"
+}
+
+
+MONITOR_DB=$1
+
+copydb $MONITOR_DB'_institutions'
+copydb $MONITOR_DB
+
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/finalizeImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/finalizeImpalaCluster.sh
@ -0,0 +1,57 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+#
+#function createShadowDB() {
+#  SOURCE=$1
+#  SHADOW=$2
+#
+#  # drop views from db
+#  for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} --delimited  -q "show tables"`;
+#    do
+#        `impala-shell  -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "drop view $i;"`;
+#    done
+#
+#  impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
+#  impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
+##  impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
+#  impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
+#}
+#
+#MONITOR_DB=$1
+#MONITOR_DB_SHADOW=$2
+#
+#createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow'
+#createShadowDB $MONITOR_DB $MONITOR_DB'_shadow'
+
+SOURCE=$1
+PRODUCTION=$2
+echo ${SOURCE}
+echo ${PRODUCTION}
+
+#echo "Updating ${PRODUCTION} monitor database old cluster"
+#impala-shell -q "create database if not exists ${PRODUCTION}"
+#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
+#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+#
+#echo "Updating ${PRODUCTION}_institutions database old cluster"
+#impala-shell -q "create database if not exists ${PRODUCTION}_institutions"
+#impala-shell -d ${PRODUCTION}_institutions -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}_institutions./" | sed "s/$/;/" | impala-shell -c -f -
+#impala-shell -d ${SOURCE}_institutions -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}_institutions.\1 as select * from ${SOURCE}_institutions.\1;/" | impala-shell -c -f -
+#echo "Production insitutions db ready!"
+
+echo "Updating ${PRODUCTION} monitor database"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+echo "Production monitor db ready!"
+
+echo "Updating ${PRODUCTION}_institutions database"
+impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}_institutions"
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}_institutions -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}_institutions./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}_institutions -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}_institutions.\1 as select * from ${SOURCE}_institutions.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
+echo "Production insitutions db ready!"
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/monitor.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/monitor.sh
@ -0,0 +1,60 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+export SOURCE=$1
+export TARGET=$2
+export SHADOW=$3
+export SCRIPT_PATH=$4
+export SCRIPT_PATH2=$5
+export SCRIPT_PATH2=$6
+
+export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
+export HADOOP_USER_NAME="oozie"
+
+echo "Getting file from " $4
+hdfs dfs -copyToLocal $4
+
+echo "Getting file from " $5
+hdfs dfs -copyToLocal $5
+
+echo "Getting file from " $6
+hdfs dfs -copyToLocal $6
+
+#update Monitor DB
+cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
+hive $HIVE_OPTS -f foo
+
+#update Institutions DB
+cat updateMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
+hive $HIVE_OPTS -f foo
+cat updateMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
+hive $HIVE_OPTS -f foo
+
+
+
+echo "Hive shell finished"
+
+#echo "Updating shadow monitor insitutions database"
+#hive -e "drop database if exists ${SHADOW}_institutions cascade"
+#hive -e "create database if not exists ${SHADOW}_institutions"
+#hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
+#hive -f foo
+#echo "Shadow db monitor insitutions ready!"
+#
+##update Monitor DB
+#cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
+#hive $HIVE_OPTS -f foo
+#
+#echo "Hive shell finished"
+#
+#echo "Updating shadow monitor database"
+#hive -e "drop database if exists ${SHADOW} cascade"
+#hive -e "create database if not exists ${SHADOW}"
+#hive $HIVE_OPTS --database ${2} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${2}.\1;/" > foo
+#hive -f foo
+#echo "Shadow db monitor insitutions ready!"
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql
@ -0,0 +1,278 @@
+--drop database if exists TARGET cascade;
+--create database if not exists TARGET;
+--
+--create view if not exists TARGET.category as select * from SOURCE.category;
+--create view if not exists TARGET.concept as select * from SOURCE.concept;
+--create view if not exists TARGET.context as select * from SOURCE.context;
+--create view if not exists TARGET.country as select * from SOURCE.country;
+--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
+--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
+--create view if not exists TARGET.funder as select * from SOURCE.funder;
+--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
+--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
+--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
+--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
+--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
+--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
+--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
+--
+--create table TARGET.result stored as parquet as
+--    select distinct * from (
+--        select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
+--        union all
+--        select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
+--        union all
+--        select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
+--             'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC)
+--             'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
+--             'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
+--             'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
+--             'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
+--             'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
+--             'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
+--             'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
+--             'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
+--             'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
+--             -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
+--             'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
+--             'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
+--             'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
+--             'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
+--             'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
+--             'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
+--             'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
+--             'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
+--             'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
+--             'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
+--             'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
+--             'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
+--             'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
+--             'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
+--             'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
+--             'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
+--             'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
+--             'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
+--             'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
+--             'openorgs____::e15adb13c4dadd49de4d35c39b5da93a',  -- Nanyang Technological University
+--             'openorgs____::4b34103bde246228fcd837f5f1bf4212',  -- Autonomous University of Barcelona
+--             'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb',	-- McMaster University
+--             'openorgs____::51c7fc556e46381734a25a6fbc3fd398',	-- University of Modena and Reggio Emilia
+--             'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db',	-- Bilkent University
+--             'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06',	-- Saints Cyril and Methodius University of Skopje
+--             'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
+--             'openorgs____::b8b8ca674452579f3f593d9f5e557483',   -- University College Cork
+--             'openorgs____::38d7097854736583dde879d12dacafca',	-- Brown University
+--             'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
+--             'openorgs____::2530baca8a15936ba2e3297f2bce2e7e',	-- University of Cape Town
+--             'openorgs____::d11f981828c485cd23d93f7f24f24db1',  -- Technological University Dublin
+--             'openorgs____::5e6bf8962665cdd040341171e5c631d8',  -- Delft University of Technology
+--             'openorgs____::846cb428d3f52a445f7275561a7beb5d',  -- University of Manitoba
+--             'openorgs____::eb391317ed0dc684aa81ac16265de041',	-- Universitat Rovira i Virgili
+--             'openorgs____::66aa9fc2fceb271423dfabcc38752dc0',  -- Lund University
+--             'openorgs____::3cff625a4370d51e08624cc586138b2f'	-- IMT Atlantique
+--        ) )) foo;
+--
+--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
+
+-- Expose the following SOURCE tables unchanged as views in TARGET.
+-- Existing views are left untouched (IF NOT EXISTS).
+CREATE VIEW IF NOT EXISTS TARGET.category AS SELECT * FROM SOURCE.category;
+CREATE VIEW IF NOT EXISTS TARGET.concept AS SELECT * FROM SOURCE.concept;
+CREATE VIEW IF NOT EXISTS TARGET.context AS SELECT * FROM SOURCE.context;
+CREATE VIEW IF NOT EXISTS TARGET.country AS SELECT * FROM SOURCE.country;
+CREATE VIEW IF NOT EXISTS TARGET.countrygdp AS SELECT * FROM SOURCE.countrygdp;
+CREATE VIEW IF NOT EXISTS TARGET.creation_date AS SELECT * FROM SOURCE.creation_date;
+CREATE VIEW IF NOT EXISTS TARGET.funder AS SELECT * FROM SOURCE.funder;
+CREATE VIEW IF NOT EXISTS TARGET.fundref AS SELECT * FROM SOURCE.fundref;
+-- NOTE(review): the source table below is spelled rndexpediture while the view is rndexpenditure - confirm the spelling matches the source schema
+CREATE VIEW IF NOT EXISTS TARGET.rndexpenditure AS SELECT * FROM SOURCE.rndexpediture;
+CREATE VIEW IF NOT EXISTS TARGET.rndgdpexpenditure AS SELECT * FROM SOURCE.rndgdpexpenditure;
+CREATE VIEW IF NOT EXISTS TARGET.doctoratestudents AS SELECT * FROM SOURCE.doctoratestudents;
+CREATE VIEW IF NOT EXISTS TARGET.totalresearchers AS SELECT * FROM SOURCE.totalresearchers;
+CREATE VIEW IF NOT EXISTS TARGET.totalresearchersft AS SELECT * FROM SOURCE.totalresearchersft;
+CREATE VIEW IF NOT EXISTS TARGET.hrrst AS SELECT * FROM SOURCE.hrrst;
+--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
+
+-- Result satellite tables.
+-- Unless noted otherwise, each table keeps only the SOURCE rows whose id matches a row of TARGET.result.
+-- TARGET.result itself is not created in this script and must already exist when it runs.
+CREATE TABLE TARGET.result_citations STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_citations orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_references_oc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_references_oc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_citations_oc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_citations_oc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_classifications STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_classifications orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_apc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_apc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_concepts STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_concepts orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_datasources STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_datasources orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_fundercount STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_fundercount orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_gold STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_gold orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_greenoa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_greenoa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_languages STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_languages orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_licenses STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_licenses orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
+
+-- licenses_normalized is copied in full, without the result filter
+CREATE TABLE TARGET.licenses_normalized STORED AS PARQUET AS
+    SELECT * FROM SOURCE.licenses_normalized;
+--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_oids STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_oids orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_organization STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_organization orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_peerreviewed STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_peerreviewed orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_pids STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_pids orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_projectcount STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_projectcount orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_projects STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_projects orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_refereed STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_refereed orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_sources STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_sources orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_topics STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_topics orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_fos STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_fos orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_accessroute STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_accessroute orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_instance STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_instance orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+CREATE TABLE TARGET.result_orcid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_orcid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+
+-- Result-to-result links: keep every SOURCE link whose source or target id is in TARGET.result.
+-- The two halves are staged as the scratch views foo1 and foo2, merged with duplicates removed,
+-- and the scratch views are dropped afterwards.
+CREATE VIEW TARGET.foo1 AS
+    SELECT * FROM SOURCE.result_result rr WHERE rr.source IN (SELECT id FROM TARGET.result);
+CREATE VIEW TARGET.foo2 AS
+    SELECT * FROM SOURCE.result_result rr WHERE rr.target IN (SELECT id FROM TARGET.result);
+CREATE TABLE TARGET.result_result STORED AS PARQUET AS
+    SELECT DISTINCT * FROM (SELECT * FROM TARGET.foo1 UNION ALL SELECT * FROM TARGET.foo2) foufou;
+DROP VIEW TARGET.foo1;
+DROP VIEW TARGET.foo2;
+--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
+
+-- datasources
+-- Datasource tables are exposed as plain views over SOURCE.
+CREATE VIEW IF NOT EXISTS TARGET.datasource AS SELECT * FROM SOURCE.datasource;
+CREATE VIEW IF NOT EXISTS TARGET.datasource_oids AS SELECT * FROM SOURCE.datasource_oids;
+CREATE VIEW IF NOT EXISTS TARGET.datasource_organizations AS SELECT * FROM SOURCE.datasource_organizations;
+CREATE VIEW IF NOT EXISTS TARGET.datasource_sources AS SELECT * FROM SOURCE.datasource_sources;
+
+-- datasource_results is TARGET.result_datasources with its two columns swapped and renamed
+CREATE TABLE TARGET.datasource_results STORED AS PARQUET AS
+    SELECT id AS result, datasource AS id FROM TARGET.result_datasources;
+--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
+
+-- organizations
+-- Organization tables are exposed as plain views over SOURCE.
+CREATE VIEW IF NOT EXISTS TARGET.organization AS SELECT * FROM SOURCE.organization;
+CREATE VIEW IF NOT EXISTS TARGET.organization_datasources AS SELECT * FROM SOURCE.organization_datasources;
+CREATE VIEW IF NOT EXISTS TARGET.organization_pids AS SELECT * FROM SOURCE.organization_pids;
+CREATE VIEW IF NOT EXISTS TARGET.organization_projects AS SELECT * FROM SOURCE.organization_projects;
+CREATE VIEW IF NOT EXISTS TARGET.organization_sources AS SELECT * FROM SOURCE.organization_sources;
+
+-- projects
+-- Project tables are exposed as plain views over SOURCE.
+CREATE VIEW IF NOT EXISTS TARGET.project AS SELECT * FROM SOURCE.project;
+CREATE VIEW IF NOT EXISTS TARGET.project_oids AS SELECT * FROM SOURCE.project_oids;
+CREATE VIEW IF NOT EXISTS TARGET.project_organizations AS SELECT * FROM SOURCE.project_organizations;
+CREATE VIEW IF NOT EXISTS TARGET.project_resultcount AS SELECT * FROM SOURCE.project_resultcount;
+CREATE VIEW IF NOT EXISTS TARGET.project_classification AS SELECT * FROM SOURCE.project_classification;
+CREATE VIEW IF NOT EXISTS TARGET.project_organization_contribution AS SELECT * FROM SOURCE.project_organization_contribution;
+
+-- project_results is TARGET.result_projects with its two columns swapped and renamed
+CREATE TABLE TARGET.project_results STORED AS PARQUET AS
+    SELECT id AS result, project AS id FROM TARGET.result_projects;
+--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
+
+-- indicators
+-- Indicator tables keep only the SOURCE rows whose id matches a row of TARGET.result.
+-- Indicators created as views are exposed unfiltered.
+-- Sprint 1 ----
+CREATE TABLE TARGET.indi_pub_green_oa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_green_oa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_grey_lit STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_grey_lit orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_doi_from_crossref STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_doi_from_crossref orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
+-- Sprint 2 ----
+CREATE TABLE TARGET.indi_result_has_cc_licence STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_has_cc_licence orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_result_has_cc_licence_url STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_has_cc_licence_url orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_has_abstract STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_has_abstract orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_result_with_orcid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_with_orcid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
+---- Sprint 3 ----
+CREATE TABLE TARGET.indi_funded_result_with_fundref STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_funded_result_with_fundref orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
+CREATE VIEW TARGET.indi_result_org_collab AS SELECT * FROM SOURCE.indi_result_org_collab;
+CREATE VIEW TARGET.indi_result_org_country_collab AS SELECT * FROM SOURCE.indi_result_org_country_collab;
+CREATE VIEW TARGET.indi_project_collab_org AS SELECT * FROM SOURCE.indi_project_collab_org;
+CREATE VIEW TARGET.indi_project_collab_org_country AS SELECT * FROM SOURCE.indi_project_collab_org_country;
+CREATE VIEW TARGET.indi_funder_country_collab AS SELECT * FROM SOURCE.indi_funder_country_collab;
+CREATE VIEW TARGET.indi_result_country_collab AS SELECT * FROM SOURCE.indi_result_country_collab;
+---- Sprint 4 ----
+-- Each table below keeps only the SOURCE rows that match a row of TARGET.result.
+-- The match is on orig.id, except for the download tables which match on orig.result_id.
+CREATE TABLE TARGET.indi_pub_diamond STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_diamond orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_in_transformative STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_in_transformative orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_closed_other_open STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_closed_other_open orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
+---- Sprint 5 ----
+CREATE TABLE TARGET.indi_result_no_of_copies STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_no_of_copies orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
+---- Sprint 6 ----
+CREATE TABLE TARGET.indi_pub_hybrid_oa_with_cc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_hybrid_oa_with_cc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_bronze_oa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_bronze_oa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads_datasource STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads_datasource orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads_year STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads_year orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads_datasource_year STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads_datasource_year orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
+---- Sprint 7 ----
+-- Tables below keep only the SOURCE rows that match a row of TARGET.result.
+-- Organization-level indicators are exposed as unfiltered views.
+CREATE TABLE TARGET.indi_pub_gold_oa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_gold_oa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_hybrid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_hybrid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
+CREATE VIEW TARGET.indi_org_fairness AS SELECT * FROM SOURCE.indi_org_fairness;
+CREATE VIEW TARGET.indi_org_fairness_pub_pr AS SELECT * FROM SOURCE.indi_org_fairness_pub_pr;
+CREATE VIEW TARGET.indi_org_fairness_pub_year AS SELECT * FROM SOURCE.indi_org_fairness_pub_year;
+CREATE VIEW TARGET.indi_org_fairness_pub AS SELECT * FROM SOURCE.indi_org_fairness_pub;
+CREATE VIEW TARGET.indi_org_fairness_year AS SELECT * FROM SOURCE.indi_org_fairness_year;
+CREATE VIEW TARGET.indi_org_findable_year AS SELECT * FROM SOURCE.indi_org_findable_year;
+CREATE VIEW TARGET.indi_org_findable AS SELECT * FROM SOURCE.indi_org_findable;
+CREATE VIEW TARGET.indi_org_openess AS SELECT * FROM SOURCE.indi_org_openess;
+CREATE VIEW TARGET.indi_org_openess_year AS SELECT * FROM SOURCE.indi_org_openess_year;
+CREATE TABLE TARGET.indi_pub_has_preprint STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_has_preprint orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_in_subscribed STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_in_subscribed orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_result_with_pid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_with_pid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_impact_measures STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_impact_measures orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_interdisciplinarity STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_interdisciplinarity orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
+CREATE TABLE TARGET.result_apc_affiliations STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_apc_affiliations orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS;
+-- indi_is_project_result_after is matched on result_id rather than id
+CREATE TABLE TARGET.indi_is_project_result_after STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_is_project_result_after orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+CREATE TABLE TARGET.indi_is_funder_plan_s STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_is_funder_plan_s orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql
@ -0,0 +1,297 @@
+-- Start from a clean slate: remove any existing TARGET database together with
+-- all of its objects, then create it again empty.
+DROP DATABASE IF EXISTS TARGET CASCADE;
+CREATE DATABASE IF NOT EXISTS TARGET;
+
+-- Reference tables are exposed as plain views over the SOURCE database (no data is copied).
+-- NOTE(review): the rndexpenditure view reads SOURCE.rndexpediture (different spelling) - confirm this is the real name of the source table.
+CREATE VIEW IF NOT EXISTS TARGET.category AS SELECT * FROM SOURCE.category;
+CREATE VIEW IF NOT EXISTS TARGET.concept AS SELECT * FROM SOURCE.concept;
+CREATE VIEW IF NOT EXISTS TARGET.context AS SELECT * FROM SOURCE.context;
+CREATE VIEW IF NOT EXISTS TARGET.country AS SELECT * FROM SOURCE.country;
+CREATE VIEW IF NOT EXISTS TARGET.countrygdp AS SELECT * FROM SOURCE.countrygdp;
+CREATE VIEW IF NOT EXISTS TARGET.creation_date AS SELECT * FROM SOURCE.creation_date;
+CREATE VIEW IF NOT EXISTS TARGET.funder AS SELECT * FROM SOURCE.funder;
+CREATE VIEW IF NOT EXISTS TARGET.fundref AS SELECT * FROM SOURCE.fundref;
+CREATE VIEW IF NOT EXISTS TARGET.rndexpenditure AS SELECT * FROM SOURCE.rndexpediture;
+CREATE VIEW IF NOT EXISTS TARGET.rndgdpexpenditure AS SELECT * FROM SOURCE.rndgdpexpenditure;
+CREATE VIEW IF NOT EXISTS TARGET.doctoratestudents AS SELECT * FROM SOURCE.doctoratestudents;
+CREATE VIEW IF NOT EXISTS TARGET.totalresearchers AS SELECT * FROM SOURCE.totalresearchers;
+CREATE VIEW IF NOT EXISTS TARGET.totalresearchersft AS SELECT * FROM SOURCE.totalresearchersft;
+CREATE VIEW IF NOT EXISTS TARGET.hrrst AS SELECT * FROM SOURCE.hrrst;
+-- Disabled view, kept for reference:
+--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
+
+-- TARGET.result: the de-duplicated union of the SOURCE.result rows that
+--   (1) are linked to a project (result_projects joined with project), or
+--   (2) have at least one entry in result_concepts, or
+--   (3) are linked through result_organization to one of the organizations listed below.
+-- Each organization id appears only once in the list.
+create table TARGET.result stored as parquet as
+    select distinct * from (
+        select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
+        union all
+        select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
+        union all
+        select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
+             'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC)
+             'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', -- National Research Council
+             'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', -- ??? Not exists ??
+             'openorgs____::d169c7407dd417152596908d48c11460', -- Masaryk University
+             'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', -- University of Belgrade
+             'openorgs____::0ae431b820e4c33db8967fbb2b919150', -- University of Helsinki
+             'openorgs____::759d59f05d77188faee99b7493b46805', -- University of Minho
+             'openorgs____::cad284878801b9465fa51a95b1d779db', -- Universidad Politécnica de Madrid
+             'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- University of Göttingen
+             'openorgs____::c0286313e36479eff8676dba9b724b40', -- National and Kapodistrian University of Athens
+             -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
+             'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', -- Bielefeld University
+             'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', -- University of Southern Denmark
+             'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', -- Humboldt-Universität zu Berlin
+             'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', -- TU Darmstadt
+             'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', -- KU Leuven
+             'openorgs____::4c6f119632adf789746f0a057ed73e90', -- University of the Western Cape
+             'openorgs____::ec3665affa01aeafa28b7852c4176dbd', -- Rudjer Boskovic Institute
+             'openorgs____::5f31346d444a7f06a28c880fb170b0f6', -- Ghent University
+             'openorgs____::2dbe47117fd5409f9c61620813456632', -- University of Luxembourg
+             'openorgs____::6445d7758d3a40c4d997953b6632a368', -- National Institute of Informatics (NII)
+             'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
+             'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
+             'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
+             'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
+             'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
+             'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
+             'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
+             'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
+             'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
+             'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
+             'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
+             'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
+             'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
+             'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
+             'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
+             'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
+             'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
+             'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
+             'openorgs____::57784c9e047e826fefdb1ef816120d92', -- Arts et Métiers ParisTech
+             'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
+             'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
+             'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
+             'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
+             'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
+             'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
+             'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
+             'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
+             'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam
+             'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
+             'openorgs____::ab4ac74c35fa5dada770cf08e5110fab', -- Universidade Católica Portuguesa
+             'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University
+             'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
+             'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
+             'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
+             'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
+             'openorgs____::a6340e6ecf60f6bba163659df985b0f2'  -- TU Dresden
+        ))) foo;
+
+--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
+
+-- The reference-table views (category, concept, context, country, countrygdp,
+-- creation_date, funder, fundref, rndexpenditure, rndgdpexpenditure,
+-- doctoratestudents, totalresearchers, totalresearchersft, hrrst) are already
+-- created right after the database is re-created at the top of this script.
+-- The identical create-view-if-not-exists statements that used to be repeated
+-- here were no-ops and have been removed.
+
+-- Per-result detail tables. Unless noted otherwise, each table is a parquet copy
+-- of the SOURCE table of the same name, restricted to the rows whose id is
+-- present in TARGET.result. The ANALYZE statements are kept disabled.
+CREATE TABLE TARGET.result_citations STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_citations orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_references_oc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_references_oc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_citations_oc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_citations_oc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_classifications STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_classifications orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_apc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_apc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_concepts STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_concepts orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_datasources STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_datasources orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_fundercount STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_fundercount orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_gold STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_gold orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_greenoa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_greenoa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_languages STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_languages orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_licenses STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_licenses orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
+
+-- licenses_normalized is copied in full, without any filter on TARGET.result.
+CREATE TABLE TARGET.licenses_normalized STORED AS PARQUET AS
+    SELECT * FROM SOURCE.licenses_normalized;
+--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_oids STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_oids orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_organization STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_organization orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_peerreviewed STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_peerreviewed orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_pids STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_pids orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_projectcount STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_projectcount orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_projects STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_projects orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_refereed STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_refereed orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_sources STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_sources orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_topics STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_topics orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_fos STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_fos orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_accessroute STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_accessroute orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
+
+CREATE TABLE TARGET.result_instance STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_instance orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+CREATE TABLE TARGET.result_orcid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_orcid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+
+
+-- result_result: keep every SOURCE.result_result row whose source or target
+-- column matches an id in TARGET.result. Two helper views collect each side and
+-- are dropped once the de-duplicated union has been written.
+CREATE VIEW TARGET.foo1 AS
+    SELECT * FROM SOURCE.result_result rr WHERE rr.source IN (SELECT id FROM TARGET.result);
+CREATE VIEW TARGET.foo2 AS
+    SELECT * FROM SOURCE.result_result rr WHERE rr.target IN (SELECT id FROM TARGET.result);
+CREATE TABLE TARGET.result_result STORED AS PARQUET AS
+    SELECT DISTINCT * FROM (
+        SELECT * FROM TARGET.foo1
+        UNION ALL
+        SELECT * FROM TARGET.foo2
+    ) foufou;
+DROP VIEW TARGET.foo1;
+DROP VIEW TARGET.foo2;
+--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
+
+-- Datasources: the entity tables are exposed as views over SOURCE.
+CREATE VIEW IF NOT EXISTS TARGET.datasource AS SELECT * FROM SOURCE.datasource;
+CREATE VIEW IF NOT EXISTS TARGET.datasource_oids AS SELECT * FROM SOURCE.datasource_oids;
+CREATE VIEW IF NOT EXISTS TARGET.datasource_organizations AS SELECT * FROM SOURCE.datasource_organizations;
+CREATE VIEW IF NOT EXISTS TARGET.datasource_sources AS SELECT * FROM SOURCE.datasource_sources;
+
+-- datasource_results is TARGET.result_datasources with the two columns swapped
+-- in role: the datasource becomes id and the result id becomes result.
+CREATE TABLE TARGET.datasource_results STORED AS PARQUET AS
+    SELECT id AS result, datasource AS id FROM TARGET.result_datasources;
+--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
+
+-- Organizations: every table is exposed as a view over SOURCE.
+CREATE VIEW IF NOT EXISTS TARGET.organization AS SELECT * FROM SOURCE.organization;
+CREATE VIEW IF NOT EXISTS TARGET.organization_datasources AS SELECT * FROM SOURCE.organization_datasources;
+CREATE VIEW IF NOT EXISTS TARGET.organization_pids AS SELECT * FROM SOURCE.organization_pids;
+CREATE VIEW IF NOT EXISTS TARGET.organization_projects AS SELECT * FROM SOURCE.organization_projects;
+CREATE VIEW IF NOT EXISTS TARGET.organization_sources AS SELECT * FROM SOURCE.organization_sources;
+
+-- Projects: the entity tables are exposed as views over SOURCE.
+CREATE VIEW IF NOT EXISTS TARGET.project AS SELECT * FROM SOURCE.project;
+CREATE VIEW IF NOT EXISTS TARGET.project_oids AS SELECT * FROM SOURCE.project_oids;
+CREATE VIEW IF NOT EXISTS TARGET.project_organizations AS SELECT * FROM SOURCE.project_organizations;
+CREATE VIEW IF NOT EXISTS TARGET.project_resultcount AS SELECT * FROM SOURCE.project_resultcount;
+CREATE VIEW IF NOT EXISTS TARGET.project_classification AS SELECT * FROM SOURCE.project_classification;
+CREATE VIEW IF NOT EXISTS TARGET.project_organization_contribution AS SELECT * FROM SOURCE.project_organization_contribution;
+
+-- project_results is TARGET.result_projects with the two columns swapped in
+-- role: the project becomes id and the result id becomes result.
+CREATE TABLE TARGET.project_results STORED AS PARQUET AS
+    SELECT id AS result, project AS id FROM TARGET.result_projects;
+--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
+
+-- Indicators.
+-- Sprint 1: each indicator table is copied from SOURCE, keeping only the rows
+-- whose id is present in TARGET.result.
+CREATE TABLE TARGET.indi_pub_green_oa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_green_oa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_grey_lit STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_grey_lit orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_doi_from_crossref STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_doi_from_crossref orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
+-- Sprint 2: indicator tables copied from SOURCE, keeping only the rows whose
+-- id is present in TARGET.result.
+CREATE TABLE TARGET.indi_result_has_cc_licence STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_has_cc_licence orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_result_has_cc_licence_url STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_has_cc_licence_url orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_has_abstract STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_has_abstract orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_result_with_orcid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_with_orcid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
+-- Sprint 3: one filtered table (rows whose id is present in TARGET.result)
+-- followed by the collaboration indicators, which are unfiltered views over SOURCE.
+CREATE TABLE TARGET.indi_funded_result_with_fundref STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_funded_result_with_fundref orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
+CREATE VIEW TARGET.indi_result_org_collab AS SELECT * FROM SOURCE.indi_result_org_collab;
+CREATE VIEW TARGET.indi_result_org_country_collab AS SELECT * FROM SOURCE.indi_result_org_country_collab;
+CREATE VIEW TARGET.indi_project_collab_org AS SELECT * FROM SOURCE.indi_project_collab_org;
+CREATE VIEW TARGET.indi_project_collab_org_country AS SELECT * FROM SOURCE.indi_project_collab_org_country;
+CREATE VIEW TARGET.indi_funder_country_collab AS SELECT * FROM SOURCE.indi_funder_country_collab;
+CREATE VIEW TARGET.indi_result_country_collab AS SELECT * FROM SOURCE.indi_result_country_collab;
+-- Sprint 4: indicator tables copied from SOURCE, keeping only the rows whose
+-- id is present in TARGET.result.
+CREATE TABLE TARGET.indi_pub_diamond STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_diamond orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_in_transformative STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_in_transformative orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_closed_other_open STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_closed_other_open orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
+-- Sprint 5
+CREATE TABLE TARGET.indi_result_no_of_copies STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_no_of_copies orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
+-- Sprint 6: indicator tables copied from SOURCE and restricted to TARGET.result.
+-- The first two tables are matched on their id column, while the download
+-- tables are matched on their result_id column.
+CREATE TABLE TARGET.indi_pub_hybrid_oa_with_cc STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_hybrid_oa_with_cc orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_bronze_oa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_bronze_oa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads_datasource STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads_datasource orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads_year STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads_year orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_downloads_datasource_year STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_downloads_datasource_year orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.result_id);
+--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
+-- Sprint 7: result-level indicators are copied as tables restricted to the ids
+-- present in TARGET.result, while the organization-level indicators are plain
+-- views over SOURCE.
+CREATE TABLE TARGET.indi_pub_gold_oa STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_gold_oa orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_hybrid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_hybrid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
+CREATE VIEW TARGET.indi_org_fairness AS SELECT * FROM SOURCE.indi_org_fairness;
+CREATE VIEW TARGET.indi_org_fairness_pub_pr AS SELECT * FROM SOURCE.indi_org_fairness_pub_pr;
+CREATE VIEW TARGET.indi_org_fairness_pub_year AS SELECT * FROM SOURCE.indi_org_fairness_pub_year;
+CREATE VIEW TARGET.indi_org_fairness_pub AS SELECT * FROM SOURCE.indi_org_fairness_pub;
+CREATE VIEW TARGET.indi_org_fairness_year AS SELECT * FROM SOURCE.indi_org_fairness_year;
+CREATE VIEW TARGET.indi_org_findable_year AS SELECT * FROM SOURCE.indi_org_findable_year;
+CREATE VIEW TARGET.indi_org_findable AS SELECT * FROM SOURCE.indi_org_findable;
+CREATE VIEW TARGET.indi_org_openess AS SELECT * FROM SOURCE.indi_org_openess;
+CREATE VIEW TARGET.indi_org_openess_year AS SELECT * FROM SOURCE.indi_org_openess_year;
+CREATE TABLE TARGET.indi_pub_has_preprint STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_has_preprint orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_in_subscribed STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_in_subscribed orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_result_with_pid STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_result_with_pid orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_impact_measures STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_impact_measures orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
+CREATE TABLE TARGET.indi_pub_interdisciplinarity STORED AS PARQUET AS
+    SELECT * FROM SOURCE.indi_pub_interdisciplinarity orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
+CREATE TABLE TARGET.result_apc_affiliations STORED AS PARQUET AS
+    SELECT * FROM SOURCE.result_apc_affiliations orig WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
+--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS;
+-- The statements below are disabled in this script and kept verbatim for reference.
+--create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+--create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+--create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
+--create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
+--create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
+--create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
+--create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
+--create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql
@ -0,0 +1,67 @@
+-- Start from a clean slate: remove any existing TARGET database together with
+-- all of its objects, then create it again empty.
+DROP DATABASE IF EXISTS TARGET CASCADE;
+CREATE DATABASE IF NOT EXISTS TARGET;
+
+-- TARGET.result: the de-duplicated set of SOURCE.result rows that are linked
+-- through result_organization to one of the organizations listed below.
+-- Each organization id appears only once in the list.
+create table TARGET.result stored as parquet as
+    select distinct * from (
+        select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
+             'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC)
+             'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', -- National Research Council
+             'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', -- ??? Not exists ??
+             'openorgs____::d169c7407dd417152596908d48c11460', -- Masaryk University
+             'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', -- University of Belgrade
+             'openorgs____::0ae431b820e4c33db8967fbb2b919150', -- University of Helsinki
+             'openorgs____::759d59f05d77188faee99b7493b46805', -- University of Minho
+             'openorgs____::cad284878801b9465fa51a95b1d779db', -- Universidad Politécnica de Madrid
+             'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- University of Göttingen
+             'openorgs____::c0286313e36479eff8676dba9b724b40', -- National and Kapodistrian University of Athens
+             -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
+             'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', -- Bielefeld University
+             'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', -- University of Southern Denmark
+             'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', -- Humboldt-Universität zu Berlin
+             'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', -- TU Darmstadt
+             'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', -- KU Leuven
+             'openorgs____::4c6f119632adf789746f0a057ed73e90', -- University of the Western Cape
+             'openorgs____::ec3665affa01aeafa28b7852c4176dbd', -- Rudjer Boskovic Institute
+             'openorgs____::5f31346d444a7f06a28c880fb170b0f6', -- Ghent University
+             'openorgs____::2dbe47117fd5409f9c61620813456632', -- University of Luxembourg
+             'openorgs____::6445d7758d3a40c4d997953b6632a368', -- National Institute of Informatics (NII)
+             'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
+             'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
+             'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
+             'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
+             'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
+             'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
+             'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
+             'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
+             'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
+             'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
+             'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
+             'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
+             'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
+             'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
+             'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
+             'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
+             'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
+             'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
+             'openorgs____::57784c9e047e826fefdb1ef816120d92', -- Arts et Métiers ParisTech
+             'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
+             'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
+             'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
+             'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
+             'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
+             'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
+             'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
+             'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
+             'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam
+             'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
+             'openorgs____::ab4ac74c35fa5dada770cf08e5110fab', -- Universidade Católica Portuguesa
+             'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University
+             'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
+             'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
+             'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
+             'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
+             'openorgs____::a6340e6ecf60f6bba163659df985b0f2'  -- TU Dresden
+        )))  foo;
+
+--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/workflow.xml
@ -0,0 +1,111 @@
+<workflow-app name="Stats Monitor Update" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>stats_db_name</name>
+            <description>the target stats database name</description>
+        </property>
+        <property>
+            <name>monitor_db_name</name>
+            <description>the target monitor db name</description>
+        </property>
+        <property>
+            <name>monitor_db_shadow_name</name>
+            <description>the name of the shadow monitor db</description>
+        </property>
+        <property>
+            <name>hive_metastore_uris</name>
+            <description>hive server metastore URIs</description>
+        </property>
+        <property>
+            <name>hive_jdbc_url</name>
+            <description>hive server jdbc url</description>
+        </property>
+        <property>
+            <name>hive_timeout</name>
+            <description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
+        </property>
+        <property>
+            <name>hadoop_user_name</name>
+            <description>user name of the wf owner</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>hive.metastore.uris</name>
+                <value>${hive_metastore_uris}</value>
+            </property>
+            <property>
+            	<name>hive.txn.timeout</name>
+            	<value>${hive_timeout}</value>
+            </property>
+	<property>
+	    <name>mapred.job.queue.name</name>
+	    <value>analytics</value>
+	</property>
+        </configuration>
+    </global>
+
+    <start to="resume_from"/>
+    <decision name="resume_from">
+        <switch>
+            <case to="Step1-updateMonitorDB">${wf:conf('resumeFrom') eq 'Step1-updateMonitorDB'}</case>
+            <case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
+            <case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
+            <default to="Step1-updateMonitorDB"/>
+        </switch>
+    </decision>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <!-- Step 1: runs monitor.sh, passing the stats db name, the monitor db name, the shadow monitor db name
+         and the three SQL scripts located under the "scripts" folder of the workflow application path.
+         NOTE(review): on success this action transitions straight to "End", so Step2 and Step3 are only
+         executed when the workflow is started with the matching 'resumeFrom' value. Confirm this is intended. -->
+    <action name="Step1-updateMonitorDB">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>monitor.sh</exec>
+            <argument>${stats_db_name}</argument>
+            <argument>${monitor_db_name}</argument>
+            <argument>${monitor_db_shadow_name}</argument>
+            <argument>${wf:appPath()}/scripts/updateMonitorDB_institutions.sql</argument>
+            <argument>${wf:appPath()}/scripts/updateMonitorDB.sql</argument>
+            <argument>${wf:appPath()}/scripts/updateMonitorDBAll.sql</argument>
+            <file>monitor.sh</file>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Step 2: runs copyDataToImpalaCluster.sh with the monitor db name and the hadoop user name as arguments.
+         It is only reachable through the 'resumeFrom' decision node and ends the workflow on success. -->
+    <action name="Step2-copyDataToImpalaCluster">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>copyDataToImpalaCluster.sh</exec>
+            <argument>${monitor_db_name}</argument>
+            <argument>${hadoop_user_name}</argument>
+            <file>copyDataToImpalaCluster.sh</file>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Step 3: runs finalizeImpalaCluster.sh with the monitor db name, the production monitor db name and
+         the shadow monitor db name as arguments. It is only reachable through the 'resumeFrom' decision node.
+         NOTE(review): "monitor_db_prod_name" is not declared in the parameters section of this workflow,
+         so it has to be provided by the job configuration. Confirm, or declare it with the other parameters. -->
+    <action name="Step3-finalizeImpalaCluster">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>finalizeImpalaCluster.sh</exec>
+            <argument>${monitor_db_name}</argument>
+            <argument>${monitor_db_prod_name}</argument>
+            <argument>${monitor_db_shadow_name}</argument>
+            <file>finalizeImpalaCluster.sh</file>
+        </shell>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-stats-update/pom.xml
+++ b/dhp-workflows/dhp-stats-update/pom.xml
@ -8,6 +8,11 @@
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-stats-update</artifactId>
    <dependencies>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-common</artifactId>
+            <version>${project.version}</version>
+        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@ -35,12 +35,20 @@ export HADOOP_USER="oozie"
 export HADOOP_USER_NAME="oozie"

 echo "Creating and populating impala tables"
-hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
-hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
-hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
-hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
-hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
-hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.context_csv (id string, name string) row format delimited fields terminated by ','"
+hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context_csv"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.context stored as parquet as select * from ${TARGET_DB}.context_csv"
+hive $HIVE_OPTS -e "drop table ${TARGET_DB}.context_csv purge"
+
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.category_csv (context string, id string, name string) row format delimited fields terminated by ','"
+hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category_csv"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.category stored as parquet as select * from ${TARGET_DB}.category_csv"
+hive $HIVE_OPTS -e "drop table ${TARGET_DB}.category_csv purge"
+
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept_csv (category string, id string, name string) row format delimited fields terminated by ','"
+hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept_csv"
+hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept stored as parquet as select * from ${TARGET_DB}.concept_csv"
+hive $HIVE_OPTS -e "drop table ${TARGET_DB}.concept_csv purge"

 echo "Cleaning up"
 rm concepts.csv
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@ -6,68 +6,215 @@ then
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
 fi

+
+# Set the active HDFS node of OCEAN and IMPALA cluster.
+OCEAN_HDFS_NODE='hdfs://nameservice1'
+echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
+
+# Detect the Impala HDFS node to use: probe the two candidate name-nodes (mn1 first, then mn2) for the existence
+# of "/tmp", for up to 3 rounds with a 1-second pause after each failed round. The first node that answers is used.
+# If none answers, the script prints an error and exits with status 1.
+IMPALA_HDFS_NODE=''
+COUNTER=0
+while [ $COUNTER -lt 3 ]; do
+  if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
+      break
+  elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
+      IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
+      break
+  else
+      IMPALA_HDFS_NODE=''
+      sleep 1
+  fi
+  ((COUNTER++))
+done
+if [ -z "$IMPALA_HDFS_NODE" ]; then
+    echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
+    exit 1
+fi
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
+
+IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
+IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
+
+IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
+
+# Set sed arguments.
+LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
+
+# Set the SED command arguments for column-names with reserved words:
+DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
+DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'  # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
+DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
+
+HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
+HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
+HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
+
+LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
+LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
+LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+
+
 export HADOOP_USER_NAME=$6
 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
+
+
 function copydb() {
  db=$1
-  FILE=("hive_wf_tmp_"$RANDOM)
-  hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
-  # copy the databases from ocean to impala
+  echo -e "\nStart processing db: '${db}'..\n"

-  echo "copying $db"
-  hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
+  # Delete the old DB from Impala cluster (if exists).
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
+  log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+  if [ -n "$log_errors" ]; then
+    echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
+    rm -f error.log
+    return 1
+  fi

-  # change ownership to impala
-  hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
+  # Make Impala aware of the deletion of the old DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"

-  # drop tables from db
-  for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited  -q "show tables"`;
-    do
-        `impala-shell  -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
-    done
+  echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
+  # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
+  # Using max memory of: 50 * 6144 = 300 Gb
+  # Using 1MB as a buffer-size.
+  # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+  # The "ug" args cannot be used as we get a "User does not belong to hive" error.
+  # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
+  hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
+                -numListstatusThreads 40 \
+                -copybuffersize 1048576 \
+                -strategy dynamic \
+                -pb \
+                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

-  # drop views from db
-  for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited  -q "show tables"`;
-    do
-        `impala-shell  -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
-    done
+  # Capture the exit status of the "hadoop distcp" command right away: "$?" is overwritten by every
+  # subsequent command, including the "[ ... ]" test below, so reading it again inside the "else" branch
+  # would report the status of the test (always 1) instead of the real distcp status.
+  distcp_exit_status=$?
+  if [ $distcp_exit_status -eq 0 ]; then
+    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  else
+    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
+    rm -f error.log
+    return 2
+  fi

-  # delete the database
-  impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
+  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db

-  # create the databases
-  impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
+  echo -e "\nCreating schema for db: '${db}'\n"

-  impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
-  echo "creating schema for ${db}"
-  for ((  k  = 0;  k  < 5;  k ++ )); do
-  for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited  -q "show tables"`;
-    do
-      impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited  -q "show create table $i";
-    done |  sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
+  # create the new database (with the same name)
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
+
+  # Make Impala aware of the creation of the new DB immediately.
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+  # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
+  # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
+
+  all_create_view_statements=()
+
+  entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
+  for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
+    create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
+
+    create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+    if [ -n "$create_view_statement_test" ]; then
+      echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
+      create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
+        | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
+        | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
+        | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+      all_create_view_statements+=("$create_view_statement")
+    else
+      echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+      CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' |  head -1`
+      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
+          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+      else
+        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
+        log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
+        if [ -n "$log_errors" ]; then
+          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+        fi
+      fi
+    fi
  done

-#  for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited  -q "show tables"`;
-#    do
-#      impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited  -q "show create table $i";
-#    done |  sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
-#
-#  # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
-#  for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited  -q "show tables"`;
-#    do
-#      impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited  -q "show create table $i";
-#    done |  sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
+  echo -e "\nAll tables have been created, going to create the views..\n"

-  # load the data from /tmp in the respective tables
-  echo "copying data in tables and computing stats"
-  for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited  -q "show tables"`;
-      do
-        impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
-        impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
-      done
+  # Time to loop through the views and create them.
+  # At this point all table-schemas should have been created.

-  # deleting the remaining directory from hdfs
-hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
+  # Count the ELEMENTS of the array: without the "[@]" subscript, "${#array}" is the string-length of its first element.
+  previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+  if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
+    echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n"  # DEBUG
+    # Make Impala aware of the new tables, so it knows them when creating the views.
+    sleep 1
+    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+    sleep 1
+  else
+    echo -e "\nDB '${db}' does not contain any views.\n"
+  fi
+
+  # Create the views level by level: each pass tries every pending statement and collects the ones that failed
+  # because of a not-yet-existing dependency. The loop ends when nothing is pending, or aborts when a whole pass made no progress.
+  level_counter=0
+  while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    ((level_counter++))
+    # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
+    # In this case, we should retry creating this particular view again.
+    should_retry_create_view_statements=()
+
+    for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
+      specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
+      if [ -n "$specific_errors" ]; then
+        echo -e "\nspecific_errors: ${specific_errors}\n"
+        echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
+        should_retry_create_view_statements+=("$create_view_statement")
+      else
+          sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
+      fi
+    done
+
+    # Again, count the elements ("[@]"), not the characters of the first statement.
+    new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
+    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
+      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
+      rm -f error.log # Clean up, like the other error-returns of this function do.
+      return 3
+    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
+      echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
+      previous_num_of_views_to_retry=$new_num_of_views_to_retry
+    else
+      echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
+    fi
+    # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+    # The array name must match the one filled above (with the trailing "s"), otherwise the pending views are silently dropped.
+    all_create_view_statements=("${should_retry_create_view_statements[@]}")
+  done
+
+  sleep 1
+  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
+  sleep 1
+
+  echo -e "\nComputing stats for tables..\n"
+  entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
+  for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
+    create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"`  # This grep works here, as we do not want to match multiple-lines.
+    if [ -z "$create_view_statement" ]; then  # If it's a table, then go load the data to it.
+      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
+    fi
+  done
+
+  if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
+    echo -e "\nAll entities have been copied to Impala cluster.\n"
+  else
+    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
+    rm -f error.log
+    return 4
+  fi
+
+  rm -f error.log
+  echo -e "\n\nFinished processing db: ${db}\n\n"
 }

 STATS_DB=$1
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh
@ -85,12 +85,12 @@ hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/
 hive -f foo
 echo "Updated shadow monitor funded database"

-echo "Updating shadow monitor insitutions database"
+echo "Updating shadow monitor institutions database"
 hive -e "drop database if exists ${SHADOW}_institutions cascade"
 hive -e "create database if not exists ${SHADOW}_institutions"
 hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
 hive -f foo
-echo "Shadow db monitor insitutions ready!"
+echo "Shadow db monitor institutions ready!"

 echo "Updating shadow monitor RIs database"
 for i in $contexts
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
@ -69,7 +69,7 @@ SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
 DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
-select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
+select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
 from (
    SELECT substr(res.id, 4) as id, auth_pid.value as orcid
    FROM ${openaire_db_name}.result res
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
@ -7,32 +7,76 @@
 ------------------------------------------------------

 DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
-
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
+-- One row per publication id: peerReviewed wins when any instance carries it, otherwise nonPeerReviewed is kept.
+-- Instances with any other refereed classname are not emitted.
+with peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
+non_peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
+select distinct *
+from (
+    select peer_reviewed.* from peer_reviewed
+    union all
+    select non_peer_reviewed.* from non_peer_reviewed
+    left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
+    where peer_reviewed.id is null) pr;

 DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
-
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
+-- One row per dataset id: peerReviewed wins when any instance carries it, otherwise nonPeerReviewed is kept.
+-- Instances with any other refereed classname are not emitted.
+with peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
+non_peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
+select distinct *
+from (
+    select peer_reviewed.* from peer_reviewed
+    union all
+    select non_peer_reviewed.* from non_peer_reviewed
+    left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
+    where peer_reviewed.id is null) pr;

 DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
-
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
+-- One row per software id: peerReviewed wins when any instance carries it, otherwise nonPeerReviewed is kept.
+-- Instances with any other refereed classname are not emitted.
+with peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
+non_peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
+select distinct *
+from (
+    select peer_reviewed.* from peer_reviewed
+    union all
+    select non_peer_reviewed.* from non_peer_reviewed
+    left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
+    where peer_reviewed.id is null) pr;

 DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
-
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
-select substr(r.id, 4) as id, inst.refereed.classname as refereed
-from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
-where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
+-- One row per otherresearchproduct id: peerReviewed wins when any instance carries it, otherwise nonPeerReviewed is kept.
+-- Instances with any other refereed classname are not emitted.
+with peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
+non_peer_reviewed as (
+    select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
+    from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
+    where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
+select distinct *
+from (
+    select peer_reviewed.* from peer_reviewed
+    union all
+    select non_peer_reviewed.* from non_peer_reviewed
+    left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
+    where peer_reviewed.id is null) pr;

 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
 select * from ${stats_db_name}.publication_refereed
@ -60,4 +104,4 @@ rel.properties[1].value apc_currency
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.organization o on o.id=rel.source
 join ${openaire_db_name}.result r on r.id=rel.target
-where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
+where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql
@ -50,13 +50,13 @@ select distinct r.*
 from (
         select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
                substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
-         from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
+         from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
 join ${stats_db_name}.result res on res.id=r.id;

 DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;

 create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
-select r.id, r.amount, r.currency
+select distinct r.id, r.amount, r.currency
 from (
         select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
         from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@ -180,4 +180,12 @@ create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_opene
 create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
 create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
 create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
-create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
+create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
+
+create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql
@ -80,8 +80,12 @@ create table TARGET.result stored as parquet as
             'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9',  -- Cardiff University
             'openorgs____::8839b55dae0c84d56fd533f52d5d483a',   -- Leibniz Institute of Ecological Urban and Regional Development
             'openorgs____::526468206bca24c1c90da6a312295cf4',	-- Cyprus University of Technology
-             'openorgs____::b5ca9d4340e26454e367e2908ef3872f'	-- Alma Mater Studiorum University of Bologna
-
+             'openorgs____::b5ca9d4340e26454e367e2908ef3872f',	-- Alma Mater Studiorum University of Bologna
+             'openorgs____::a6340e6ecf60f6bba163659df985b0f2',  -- TU Dresden
+             'openorgs____::64badd35233ba2cd4946368ef2f4cf57',  --	University of Vienna
+             'openorgs____::7501d66d2297a963ebfb075c43fff88e',  -- Royal Institute of Technology
+             'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf',  -- Sorbonne University
+             'openorgs____::b316f25380d106aac402f5ae8653910d'  --	Centre for Research on Ecology and Forestry Applications
        ) )) foo;

 create view if not exists TARGET.category as select * from SOURCE.category;
@ -264,4 +268,11 @@ create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
 create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
 create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;

+create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+
+create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);

--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql
@ -60,5 +60,10 @@ create table TARGET.result stored as parquet as
             'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9',  -- Cardiff University
             'openorgs____::8839b55dae0c84d56fd533f52d5d483a',   -- Leibniz Institute of Ecological Urban and Regional Development
             'openorgs____::526468206bca24c1c90da6a312295cf4',	-- Cyprus University of Technology
-             'openorgs____::b5ca9d4340e26454e367e2908ef3872f'	-- Alma Mater Studiorum University of Bologna
+             'openorgs____::b5ca9d4340e26454e367e2908ef3872f',	-- Alma Mater Studiorum University of Bologna
+             'openorgs____::a6340e6ecf60f6bba163659df985b0f2',	-- TU Dresden
+             'openorgs____::64badd35233ba2cd4946368ef2f4cf57',  -- University of Vienna
+             'openorgs____::7501d66d2297a963ebfb075c43fff88e',  -- Royal Institute of Technology
+             'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf',	-- Sorbonne University
+             'openorgs____::b316f25380d106aac402f5ae8653910d'   -- Centre for Research on Ecology and Forestry Applications
        )))  foo;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
@ -2,9 +2,8 @@ create table ${observatory_db_name}.result_cc_licence stored as parquet as
 select r.id, coalesce(rln.count, 0) > 0 as cc_licence
 from ${stats_db_name}.result r
         left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
+    select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
    from ${stats_db_name}.result_licenses rl
-        left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license
    group by rl.id
 ) rln on rln.id=r.id;

--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql
@ -95,7 +95,10 @@ DROP TABLE IF EXISTS ${stats_db_name}.funder purge;
+-- One row per distinct funder found in the project funding trees. Every column, country
+-- included, is read from the exploded element so that it describes that same funder.
 create table ${stats_db_name}.funder STORED AS PARQUET as
 select distinct xpath_string(fund, '//funder/id')        as id,
                 xpath_string(fund, '//funder/name')      as name,
-                xpath_string(fund, '//funder/shortname') as shortname
+                xpath_string(fund, '//funder/shortname') as shortname,
+                xpath_string(fund, '//funder/jurisdiction') as country
 from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;

 DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge;
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@ -64,6 +64,26 @@
            <name>hadoop_user_name</name>
            <description>user name of the wf owner</description>
        </property>
+
+        <property>
+            <name>sparkSqlWarehouseDir</name>
+        </property>
+        <!-- General oozie workflow properties -->
+        <property>
+            <name>sparkClusterOpts</name>
+            <value>--conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
+            <description>spark cluster-wide options</description>
+        </property>
+        <property>
+            <name>sparkResourceOpts</name>
+            <value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
+            <description>spark resource options</description>
+        </property>
+        <property>
+            <name>sparkApplicationOpts</name>
+            <value>--conf spark.sql.shuffle.partitions=3840</value>
+            <description>spark application options</description>
+        </property>
    </parameters>

    <global>
@ -75,17 +95,21 @@
                <value>${hive_metastore_uris}</value>
            </property>
            <property>
-            	<name>hive.txn.timeout</name>
-            	<value>${hive_timeout}</value>
+                <name>hive.txn.timeout</name>
+                <value>${hive_timeout}</value>
            </property>
            <property>
                <name>hive.mapjoin.followby.gby.localtask.max.memory.usage</name>
                <value>0.80</value>
            </property>
-	<property>
-	    <name>mapred.job.queue.name</name>
-	    <value>analytics</value>
-	</property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+            <property>
+                <name>mapred.job.queue.name</name>
+                <value>analytics</value>
+            </property>
        </configuration>
    </global>

@ -133,164 +157,164 @@
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step1.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step2"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step2">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step2.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step3"/>
        <error to="Kill"/>
    </action>
-        
+
    <action name="Step3">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step3.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step4"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step4">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step4.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step5"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step5">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step5.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step6"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step6">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step6.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step7"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step7">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step7.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step8"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step8">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step8.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step9"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step9">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step9.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step10"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step10">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step10.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
-			<param>external_stats_db_name=${external_stats_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
+            <param>external_stats_db_name=${external_stats_db_name}</param>
        </hive2>
        <ok to="Step11"/>
        <error to="Kill"/>
-    </action>    
+    </action>

    <action name="Step11">
-    	<hive2 xmlns="uri:oozie:hive2-action:0.1">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step11.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
-			<param>external_stats_db_name=${external_stats_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
+            <param>external_stats_db_name=${external_stats_db_name}</param>
        </hive2>
        <ok to="Step12"/>
        <error to="Kill"/>
-    </action>  
-    
+    </action>
+
    <action name="Step12">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step12.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step13"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step13">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step13.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step14"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step14">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step14.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step15"/>
        <error to="Kill"/>
    </action>
-    
+
    <action name="Step15">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <jdbc-url>${hive_jdbc_url}</jdbc-url>
            <script>scripts/step15.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>openaire_db_name=${openaire_db_name}</param>
+            <param>stats_db_name=${stats_db_name}</param>
+            <param>openaire_db_name=${openaire_db_name}</param>
        </hive2>
        <ok to="Step15_5"/>
        <error to="Kill"/>
@ -321,13 +345,35 @@
        <error to="Kill"/>
    </action>

+<!--    <action name="Step16-createIndicatorsTables">-->
+<!--        <hive2 xmlns="uri:oozie:hive2-action:0.1">-->
+<!--            <jdbc-url>${hive_jdbc_url}</jdbc-url>-->
+<!--            <script>scripts/step16-createIndicatorsTables.sql</script>-->
+<!--            <param>stats_db_name=${stats_db_name}</param>-->
+<!--            <param>external_stats_db_name=${external_stats_db_name}</param>-->
+<!--        </hive2>-->
+<!--        <ok to="Step16_1-definitions"/>-->
+<!--        <error to="Kill"/>-->
+<!--    </action>-->
+
    <action name="Step16-createIndicatorsTables">
-        <hive2 xmlns="uri:oozie:hive2-action:0.1">
-            <jdbc-url>${hive_jdbc_url}</jdbc-url>
-            <script>scripts/step16-createIndicatorsTables.sql</script>
-            <param>stats_db_name=${stats_db_name}</param>
-            <param>external_stats_db_name=${external_stats_db_name}</param>
-        </hive2>
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Step16-createIndicatorsTables</name>
+            <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
+            <jar>dhp-stats-update-${projectVersion}.jar</jar>
+            <spark-opts>
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                ${sparkClusterOpts}
+                ${sparkResourceOpts}
+                ${sparkApplicationOpts}
+            </spark-opts>
+            <arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
+            <arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
+            <arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
+            <arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
+        </spark>
        <ok to="Step16_1-definitions"/>
        <error to="Kill"/>
    </action>
@ -387,18 +433,18 @@
        <error to="Kill"/>
    </action>

-<!--    <action name="step20-createMonitorDB-post">-->
-<!--        <shell xmlns="uri:oozie:shell-action:0.1">-->
-<!--            <job-tracker>${jobTracker}</job-tracker>-->
-<!--            <name-node>${nameNode}</name-node>-->
-<!--            <exec>monitor-post.sh</exec>-->
-<!--            <argument>${monitor_db_name}</argument>-->
-<!--            <argument>${monitor_db_shadow_name}</argument>-->
-<!--            <file>monitor-post.sh</file>-->
-<!--        </shell>-->
-<!--        <ok to="step21-createObservatoryDB-pre"/>-->
-<!--        <error to="Kill"/>-->
-<!--    </action>-->
+    <!--    <action name="step20-createMonitorDB-post">-->
+    <!--        <shell xmlns="uri:oozie:shell-action:0.1">-->
+    <!--            <job-tracker>${jobTracker}</job-tracker>-->
+    <!--            <name-node>${nameNode}</name-node>-->
+    <!--            <exec>monitor-post.sh</exec>-->
+    <!--            <argument>${monitor_db_name}</argument>-->
+    <!--            <argument>${monitor_db_shadow_name}</argument>-->
+    <!--            <file>monitor-post.sh</file>-->
+    <!--        </shell>-->
+    <!--        <ok to="step21-createObservatoryDB-pre"/>-->
+    <!--        <error to="Kill"/>-->
+    <!--    </action>-->

    <action name="step21-createObservatoryDB-pre">
        <shell xmlns="uri:oozie:shell-action:0.1">
@ -443,8 +489,8 @@
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <exec>copyDataToImpalaCluster.sh</exec>
-<!--            <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
-<!--            <argument>${external_stats_db_name}</argument>-->
+            <!--            <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
+            <!--            <argument>${external_stats_db_name}</argument>-->
            <argument>${stats_db_name}</argument>
            <argument>${monitor_db_name}</argument>
            <argument>${observatory_db_name}</argument>
@ -505,4 +551,4 @@
    </action>

    <end name="End"/>
-</workflow-app>
+</workflow-app>
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@ -31,6 +31,10 @@
        <module>dhp-enrichment</module>
        <module>dhp-graph-provision</module>
        <module>dhp-blacklist</module>
+        <module>dhp-stats-actionsets</module>
+        <module>dhp-stats-hist-snaps</module>
+        <module>dhp-stats-monitor-irish</module>
+        <module>dhp-stats-monitor-update</module>
        <module>dhp-stats-update</module>
        <module>dhp-stats-promote</module>
        <module>dhp-usage-stats-build</module>
Author	SHA1	Message	Date
Claudio Atzori	795e1b2629	Merge pull request '[graph indexing] sets spark memoryOverhead in the join operations to the same value used for the memory executor' (#426 ) from provision_memoryOverhead into master Reviewed-on: #426	2024-04-19 16:59:45 +02:00
Claudio Atzori	0c05abe50b	[graph indexing] sets spark memoryOverhead in the join operations to the same value used for the memory executor	2024-04-19 16:57:55 +02:00
Claudio Atzori	8fdd0244ad	Merge pull request 'Various fixes for the stats DB update workflow, step16-createIndicatorsTables.sql' (#425 ) from stats_step16_fix into master Reviewed-on: #425	2024-04-18 11:25:24 +02:00
Claudio Atzori	18fdaaf548	integrating suggestion from #9699 to improve the result_country table construction	2024-04-18 11:23:43 +02:00
Claudio Atzori	43e123c624	added column alias	2024-04-17 16:40:29 +02:00
Claudio Atzori	62a07b7add	added missing end of statement /EOS/	2024-04-17 15:13:28 +02:00
Claudio Atzori	96bddcc921	revised query implementation for indi_pub_gold_oa	2024-04-17 15:06:50 +02:00
Miriam Baglioni	0486cea4c4	removed the funder id : 100011062 Asian Spinal Cord Network, wrongly associated to Ireland	2024-04-16 15:36:40 +02:00
Claudio Atzori	013935c593	Merge pull request 'Improvements to copying data from ocean to impala' (#420 ) from antonis.lempesis/dnet-hadoop:beta into master Reviewed-on: #420	2024-04-16 14:17:47 +02:00
Lampros Smyrnaios	d7da4f814b	Minor updates to the copying operation to Impala Cluster: - Improve logging. - Code optimization/polishing.	2024-04-12 18:12:06 +03:00
Lampros Smyrnaios	14719dcd62	Miscellaneous updates to the copying operation to Impala Cluster: - Update the algorithm for creating views that depend on other views. - Add check for successful execution of the "hadoop distcp" command. - Add a check for successful copy operation of all entities. - Upon facing an error in a DB, exit the method, instead of the whole script. - Improve logging. - Code polishing.	2024-04-12 15:36:13 +03:00
Lampros Smyrnaios	22745027c8	Use the "HADOOP_USER_NAME" value from the "workflow-property", in "copyDataToImpalaCluster.sh", in "stats-monitor-updates".	2024-04-11 17:46:33 +03:00
Lampros Smyrnaios	abf0b69f29	Upgrade the copying operation to Impala Cluster: - Use only hive commands in the Ocean Cluster, as the "impala-shell" will be removed from there to free-up resources. - Hugely improve the performance in every aspect of the copying process: a) speedup file-transferring and DB-deletion, b) eliminate permissions-assignment, "load" operations and "use $db" queries, c) retry only the "create view" statements and only as long as they depend on other non-created views, instead of trying to recreate all tables and views 5 consecutive times. - Add error-checks for the creation of tables and views.	2024-04-11 17:12:12 +03:00
Claudio Atzori	6132bd028e	Merge pull request 'Extend Crossref-funders mapping and datacite hostedbymap' (#417 ) from CrossrefFundersMap into master Reviewed-on: #417	2024-04-09 10:30:53 +02:00
Miriam Baglioni	519db1ddef	Extended mapping of funder from crossref (#9169 , #9277 ) and change the correspondece files for the irish fundrs (#9635 ). Extended the datacite map to include the association between metadata and the EBRAINS datasource (SciLake)	2024-04-09 09:33:09 +02:00
Claudio Atzori	5add51f38c	Merge pull request 'fixed the result_country definition and updated the stats DB copy procedure' (#412 ) from antonis.lempesis/dnet-hadoop:beta into master Reviewed-on: #412	2024-04-03 12:34:17 +02:00
Lampros Smyrnaios	b7c8acc563	- Update the code which acquires the "IMPALA_HDFS_NODE", to test the "tmp"-dir, instead of the base-dir and introduce retries, to overcome potential file-system failures. This change was suggested by "Sebastian Tymkow" and "Grzegorz Bakalarski". - Fix typos.	2024-04-03 13:15:37 +03:00
Antonis Lempesis	df6e3bda04	added new orgs in monitor	2024-04-01 22:45:29 +03:00
Antonis Lempesis	573b081f1d	added new orgs in monitor	2024-04-01 22:24:46 +03:00
Antonis Lempesis	0bf2a7a359	fixed the result_country definition	2024-04-01 15:23:22 +03:00
Claudio Atzori	f01390702e	Merge pull request 'fixed typo in indicator query' (#410 ) from antonis.lempesis/dnet-hadoop:beta into master Reviewed-on: #410	2024-03-27 13:42:07 +01:00
Antonis Lempesis	9ff44eed96	fixed typo in indicator query added more institutions	2024-03-27 14:39:01 +02:00
Claudio Atzori	5592ccc37a	Merge pull request 'added missing EOS, Generate tables with parquet-files, instead of csv in the contexts.sh script' (#408 ) from antonis.lempesis/dnet-hadoop:beta into master Reviewed-on: #408	2024-03-27 12:02:57 +01:00
Antonis Lempesis	1fee4124e0	added missing EOS	2024-03-27 12:58:25 +02:00
Claudio Atzori	d16c15da8d	adjusted pom files	2024-03-26 14:00:44 +01:00
Lampros Smyrnaios	036ba03fcd	Generate tables with parquet-files, instead of csv, in "dhp-stats-update/.../contexts.sh" script.	2024-03-26 13:29:04 +02:00
Claudio Atzori	09a6d17059	Merge pull request '[Stats wf] #372 , #405 to production' (#406 ) from antonis.lempesis/dnet-hadoop:beta into master Reviewed-on: #406	2024-03-26 12:18:26 +01:00
Claudio Atzori	d70793847d	resolving conflicts on step16-createIndicatorsTables.sql	2024-03-26 12:17:52 +01:00
Lampros Smyrnaios	bc8c97182d	Automatically select the ACTIVE HDFS NODE for Impala cluster, in all "copyDataToImpalaCluster.sh" scripts.	2024-03-26 13:01:12 +02:00
Lampros Smyrnaios	92cc27e7eb	Use the ACTIVE HDFS NODE for Impala cluster, in "copyDataToImpalaCluster.sh" script.	2024-03-26 12:34:11 +02:00
Michele De Bonis	f6601ea7d1	default parameters for openorgs updated	2024-03-25 13:07:04 +01:00
Michele De Bonis	cd4c3c934d	openorgs wf updated	2024-03-22 15:42:37 +01:00
Antonis Lempesis	4c40c96e30	code cleanup	2024-03-22 10:16:49 +02:00
Antonis Lempesis	459167ac2f	Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into beta	2024-03-21 12:44:58 +02:00
Antonis Lempesis	07f634a46d	code cleanup	2024-03-21 12:44:30 +02:00
Antonis Lempesis	9521625a07	code cleanup	2024-03-21 11:45:08 +02:00
Antonis Lempesis	67a5aa0a38	Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into beta	2024-03-19 11:24:54 +02:00
dimitrispie	a3a570e9a0	Commit monitor-updates-wf	2024-03-19 09:42:21 +02:00
Michele Artini	a99942f7cf	filter by base types	2024-03-13 12:12:42 +01:00
Michele Artini	7f7083f53e	updated sql query for filtering BASE records	2024-03-13 11:57:26 +01:00
Michele Artini	d9b23a76c5	comments	2024-03-12 14:53:34 +01:00
Michele Artini	841ca92246	Merge pull request 'new plugin to collect from a dump of BASE' (#400 ) from base-collector-plugin into master Reviewed-on: #400	2024-03-12 12:22:42 +01:00
Michele Artini	3bcfc40293	new plugin to collect from a dump of BASE	2024-03-12 12:17:58 +01:00
Antonis Lempesis	f74c7e8689	selecting distinct peer_reviewed	2024-03-12 02:13:04 +02:00
Antonis Lempesis	3c79720342	fixed the irish result subset	2024-03-07 14:08:57 +02:00
Antonis Lempesis	5ae4b4286c	Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into beta	2024-03-07 12:15:19 +02:00
Antonis Lempesis	316d585c8a	using distinct apcs per publication to avoid huge sums	2024-03-07 02:07:59 +02:00
Giambattista Bloisi	3067ea390d	Use SparkSQL in place of Hive for executing step16-createIndicatorsTables.sql of stats update wf	2024-03-04 11:13:34 +01:00
Miriam Baglioni	c94d94035c	[BulkTagging] added check to verify if field is present in the pathMap	2024-02-28 09:41:42 +01:00
Michele Artini	4374d7449e	mapping of project PIDs	2024-02-22 14:44:35 +01:00
Claudio Atzori	07d009007b	Merge pull request 'Fixed problem on missing author in crossref Mapping' (#384 ) from crossref_missing_author_fix_master into master Reviewed-on: #384	2024-02-15 15:06:17 +01:00
Claudio Atzori	071d044971	Merge branch 'master' into crossref_missing_author_fix_master	2024-02-15 15:04:19 +01:00
Claudio Atzori	b3ddbaed58	fixed import of ORPs stored on HDFS in the internal graph format (e.g. Datacite)	2024-02-15 15:02:48 +01:00
Claudio Atzori	1416f16b35	[graph raw] fixed mapping of the original resource type from the Datacite format	2024-02-09 10:19:53 +01:00
Giambattista Bloisi	ba1a0e7b4f	Merge pull request 'Set deletedbyinference =true to dedup aliases, created when a dedup in a previous build has been merged in a new dedup' (#392 ) from fix_dedupaliases_deletedbyinference into master Reviewed-on: #392	2024-02-08 15:29:29 +01:00
Giambattista Bloisi	079085286c	Merge branch 'master' into fix_dedupaliases_deletedbyinference	2024-02-08 15:29:13 +01:00
Giambattista Bloisi	8dd666aedd	Dedup aliases, created when a dedup in a previous build has been merged in a new dedup, need to be marked as "deletedbyinference", since they are "merged" in the new dedup	2024-02-08 15:27:57 +01:00
Claudio Atzori	f21133229a	Merge pull request 'Support for the PromoteAction strategy [master]' (#391 ) from promote_actions_join_type_master into master Reviewed-on: #391	2024-02-08 15:12:16 +01:00
Claudio Atzori	d86b909db2	[actiosets] fixed join type	2024-02-08 15:10:55 +01:00
Claudio Atzori	08162902ab	[actiosets] introduced support for the PromoteAction strategy	2024-02-08 15:10:40 +01:00
Antonis Lempesis	dd4c27f4f3	added 2 new institutions in monitor	2024-02-08 12:57:57 +02:00
Claudio Atzori	e8630a6d03	[graph cleaning] rule out datasources without an officialname	2024-02-05 14:59:06 +02:00
Antonis Lempesis	a512ead447	changed orcid ids to all capital	2024-01-30 16:54:47 +02:00
Antonis Lempesis	bb10a22290	merged changes from dnet-hadoop	2024-01-29 21:51:47 +02:00
Claudio Atzori	f804c58bc7	Merge pull request 'Use SparkSQL in place of Hive for executing step16-createIndicatorsTables.sql of stats update wf' (#386 ) from stats_with_spark_sql into beta Reviewed-on: #386	2024-01-29 09:11:59 +01:00
Claudio Atzori	926903b06b	Merge branch 'beta' into stats_with_spark_sql	2024-01-29 09:11:45 +01:00
Giambattista Bloisi	078df0b4d1	Use SparkSQL in place of Hive for executing step16-createIndicatorsTables.sql of stats update wf	2024-01-26 21:56:55 +01:00
Sandro La Bruzzo	3c8c88bdd3	Fixed problem on missing author in crossref Mapping	2024-01-26 12:29:30 +01:00
Antonis Lempesis	c548796463	Changed step16-createIndicatorsTables to use a spark oozie action instead of hive	2024-01-26 02:04:48 +02:00
Antonis Lempesis	fd43b0e84a	max mem of joins (hive.mapjoin.followby.gby.localtask.max.memory.usage) now 80%, up from 55%.	2024-01-25 15:06:34 +01:00
Antonis Lempesis	e024718f73	creating result_instances even when no pids exist for the instance	2024-01-10 22:25:50 +01:00
dimitrispie	b920307bdd	Changes to indicators	2024-01-09 00:47:09 +02:00
dimitrispie	8b2cbb611e	Changes to beta db names	2024-01-09 00:40:56 +02:00
Antonis Lempesis	2e4cab026c	fixed the result_country definition	2024-01-08 16:01:26 +02:00
dimitrispie	6b823100ae	Update buildIrishMonitorDB.sql New indicators added	2024-01-07 22:54:39 +02:00
dimitrispie	75bfde043c	Historical Snapshots Workflow Create historical snapshots db with parameters: hist_db_name=openaire_beta_historical_snapshots_xxx hist_db_name_prev=openaire_beta_historical_snapshots_xxx (previous run of wf) stats_db_name=openaire_beta_stats_xxx stats_irish_db_name=openaire_beta_stats_monitor_ie_xxx monitor_db_name=openaire_beta_stats_monitor_xxx monitor_db_prod_name=openaire_beta_stats_monitor monitor_irish_db_name=openaire_beta_stats_monitor_ie_xxx monitor_irish_db_prod_name=openaire_beta_stats_monitor_ie hist_db_prod_name=openaire_beta_historical_snapshots hist_db_shadow_name=openaire_beta_historical_snapshots_shadow hist_date=122023 hive_timeout=150000 hadoop_user_name=xxx resumeFrom=CreateDB	2024-01-04 15:11:04 +02:00
dimitrispie	ffdd03d2f4	Monitor Irish Stats WF Parameters (with examples): stats_db_name=openaire_beta_stats_20231208 monitor_irish_db_name=openaire_beta_stats_monitor_ie_20231208b monitor_irish_db_prod_name=openaire_beta_stats_monitor_ie graph_db_name=openaire_beta_20231208 monitor_irish_db_shadow_name=openaire_beta_stats_monitor_ie_shadow hive_timeout=150000 hadoop_user_name=dnet.beta resumeFrom=Step1-buildIrishMonitorDB	2023-12-22 11:05:24 +02:00
dimitrispie	40b98d8182	Changes to indicators and funders definition - Changes result_refereed definition - Added result_country indicator - Added indi_pub_green_with_license indicator - Added country from jurisdiction to funders	2023-12-22 10:29:20 +02:00