merged from master

2020-11-19 14:34:54 +01:00 · 2020-11-19 14:34:54 +01:00 · 3f34757c63
parent 9b0fb9e958 e503271abe
commit 3f34757c63
52 changed files with 1644 additions and 1528 deletions
--- a/dhp-build/dhp-code-style/pom.xml
+++ b/dhp-build/dhp-code-style/pom.xml
@ -15,12 +15,12 @@
        <snapshotRepository>
            <id>dnet45-snapshots</id>
            <name>DNet45 Snapshots</name>
-            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
+            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
            <layout>default</layout>
        </snapshotRepository>
        <repository>
            <id>dnet45-releases</id>
-            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
+            <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
        </repository>
    </distributionManagement>

--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -104,11 +104,6 @@
 			<artifactId>dnet-pace-core</artifactId>
 		</dependency>

-		<dependency>
-			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>dhp-schemas</artifactId>
-			<version>${project.version}</version>
-		</dependency>
 	</dependencies>

 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
@ -1,9 +1,7 @@

-package eu.dnetlib.dhp.schema.oaf;
+package eu.dnetlib.dhp.oa.graph.clean;

-import java.util.LinkedHashMap;
-import java.util.Objects;
-import java.util.Optional;
+import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;

@ -12,12 +10,19 @@ import org.apache.commons.lang3.StringUtils;
 import com.clearspring.analytics.util.Lists;

 import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.*;

 public class CleaningFunctions {

 	public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
 	public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
-	public static final String NONE = "none";
+
+	public static final Set<String> PID_BLACKLIST = new HashSet<>();
+
+	static {
+		PID_BLACKLIST.add("none");
+		PID_BLACKLIST.add("na");
+	}

 	public static <T extends Oaf> T fixVocabularyNames(T value) {
 		if (value instanceof Datasource) {
@ -71,7 +76,7 @@ public class CleaningFunctions {
 		return value;
 	}

-	public static <T extends Oaf> T fixDefaults(T value) {
+	protected static <T extends Oaf> T fixDefaults(T value) {
 		if (value instanceof Datasource) {
 			// nothing to clean here
 		} else if (value instanceof Project) {
@ -114,7 +119,7 @@ public class CleaningFunctions {
 							.stream()
 							.filter(Objects::nonNull)
 							.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
-							.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
+							.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
 							.filter(sp -> Objects.nonNull(sp.getQualifier()))
 							.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
 							.map(CleaningFunctions::normalizePidValue)
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java
@ -3,6 +3,10 @@ package eu.dnetlib.dhp.schema.oaf;

 public class ModelHardLimits {

+	public static final String LAYOUT = "index";
+	public static final String INTERPRETATION = "openaire";
+	public static final String SEPARATOR = "-";
+
 	public static final int MAX_EXTERNAL_ENTITIES = 50;
 	public static final int MAX_AUTHORS = 200;
 	public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
@ -11,4 +15,8 @@ public class ModelHardLimits {
 	public static final int MAX_ABSTRACT_LENGTH = 150000;
 	public static final int MAX_INSTANCES = 10;

+	/**
+	 * Builds a collection name by joining the given format, {@link #LAYOUT} and {@link #INTERPRETATION}, in this
+	 * order, using {@link #SEPARATOR} as delimiter.
+	 *
+	 * @param format the leading token of the collection name
+	 * @return the joined name, i.e. {@code <format>-index-openaire}
+	 */
+	public static String getCollectionName(String format) {
+		return String.join(SEPARATOR, format, LAYOUT, INTERPRETATION);
+	}
+
 }
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java
@ -2,7 +2,6 @@
 package eu.dnetlib.dhp.schema.oaf;

 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;

 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
@ -13,10 +12,43 @@ import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;

 import eu.dnetlib.dhp.schema.common.LicenseComparator;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.utils.DHPUtils;

 public class OafMapperUtils {

+	/**
+	 * Merges two {@link Oaf} objects, dispatching on the type of the first one; the second object is cast to the
+	 * same type as the first.
+	 *
+	 * @param o1 the object driving the dispatch and, except for results, receiving the merge
+	 * @param o2 the object merged into {@code o1}
+	 * @return for results the object chosen by {@link #mergeResults(Result, Result)}, otherwise {@code o1} after
+	 *         {@code o1.mergeFrom(o2)}
+	 * @throws RuntimeException when {@code o1} is neither a supported entity subtype nor a relation
+	 */
+	public static Oaf merge(final Oaf o1, final Oaf o2) {
+		if (!ModelSupport.isSubClass(o1, OafEntity.class)) {
+			// not an entity: the only other supported type is the relation
+			if (!ModelSupport.isSubClass(o1, Relation.class)) {
+				throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
+			}
+			((Relation) o1).mergeFrom((Relation) o2);
+			return o1;
+		}
+
+		if (ModelSupport.isSubClass(o1, Result.class)) {
+			// for results the surviving instance is not necessarily o1
+			return mergeResults((Result) o1, (Result) o2);
+		}
+
+		if (ModelSupport.isSubClass(o1, Datasource.class)) {
+			((Datasource) o1).mergeFrom((Datasource) o2);
+		} else if (ModelSupport.isSubClass(o1, Organization.class)) {
+			((Organization) o1).mergeFrom((Organization) o2);
+		} else if (ModelSupport.isSubClass(o1, Project.class)) {
+			((Project) o1).mergeFrom((Project) o2);
+		} else {
+			throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
+		}
+		return o1;
+	}
+
+	/**
+	 * Merges two results, letting {@link ResultTypeComparator} decide which one absorbs the other: when
+	 * {@code r1} compares lower than {@code r2}, {@code r2} is merged into {@code r1}; otherwise {@code r1} is
+	 * merged into {@code r2}.
+	 *
+	 * @param r1 the first result
+	 * @param r2 the second result
+	 * @return the result that received the merge
+	 */
+	public static Result mergeResults(Result r1, Result r2) {
+		final boolean firstWins = new ResultTypeComparator().compare(r1, r2) < 0;
+		final Result survivor = firstWins ? r1 : r2;
+		final Result absorbed = firstWins ? r2 : r1;
+		survivor.mergeFrom(absorbed);
+		return survivor;
+	}
+
 	public static KeyValue keyValue(final String k, final String v) {
 		final KeyValue kv = new KeyValue();
 		kv.setKey(k);
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
@ -3,12 +3,10 @@ package eu.dnetlib.dhp.schema.oaf.utils;

 import java.io.Serializable;
 import java.util.Objects;
-import java.util.Optional;
-import java.util.regex.Pattern;

 import org.apache.commons.lang.StringUtils;

-import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
+import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.dhp.utils.DHPUtils;
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
@ -5,6 +5,7 @@ import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
+import java.util.List;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;

@ -15,9 +16,15 @@ import org.apache.commons.codec.binary.Hex;
 import com.jayway.jsonpath.JsonPath;

 import net.minidev.json.JSONArray;
+import scala.collection.JavaConverters;
+import scala.collection.Seq;

 public class DHPUtils {

+	/**
+	 * Converts a Java list of strings into a Scala {@link Seq}, going through the list iterator.
+	 *
+	 * @param list the Java list to convert
+	 * @return a Scala sequence holding the elements produced by the list iterator
+	 */
+	public static Seq<String> toSeq(List<String> list) {
+		final scala.collection.Iterator<String> scalaIterator = JavaConverters
+			.asScalaIteratorConverter(list.iterator())
+			.asScala();
+		return scalaIterator.toSeq();
+	}
+
 	public static String md5(final String s) {
 		try {
 			final MessageDigest md = MessageDigest.getInstance("MD5");
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="update broker notifications" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
@ -6,7 +6,9 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Optional;

 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@ -20,7 +22,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@ -68,12 +71,12 @@ public class CleanGraphSparkJob {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
-				removeOutputDir(spark, outputPath);
-				fixGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
+				HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
+				cleanGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
 			});
 	}

-	private static <T extends Oaf> void fixGraphTable(
+	private static <T extends Oaf> void cleanGraphTable(
 		SparkSession spark,
 		VocabularyGroup vocs,
 		String inputPath,
@ -99,13 +102,15 @@ public class CleanGraphSparkJob {
 		return spark
 			.read()
 			.textFile(inputEntityPath)
+			.filter((FilterFunction<String>) s -> isEntityType(s, clazz))
+			.map((MapFunction<String, String>) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING())
 			.map(
 				(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
 				Encoders.bean(clazz));
 	}

-	private static void removeOutputDir(SparkSession spark, String path) {
-		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+	/**
+	 * Tells whether the text preceding the first {@code |} in the given line is equal to the fully qualified name
+	 * of the given class.
+	 *
+	 * @param s the input line; presumably shaped as {@code <class name>|<json>} - verify against the job producing it
+	 * @param clazz the class whose name is compared with the line prefix
+	 * @return true when the prefix equals {@code clazz.getName()}
+	 */
+	private static <T extends Oaf> boolean isEntityType(final String s, final Class<T> clazz) {
+		return StringUtils.substringBefore(s, "|").equals(clazz.getName());
 	}

 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GroupEntitiesAndRelationsSparkJob.java
@ -0,0 +1,206 @@
+
+package eu.dnetlib.dhp.oa.graph.clean;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.expressions.Aggregator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.Configuration;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+import com.jayway.jsonpath.Option;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import scala.Tuple2;
+
+/**
+ * Groups the graph content by entity identifier to ensure ID uniqueness
+ */
+public class GroupEntitiesAndRelationsSparkJob {
+
+	private static final Logger log = LoggerFactory.getLogger(GroupEntitiesAndRelationsSparkJob.class);
+
+	private final static String ID_JPATH = "$.id";
+
+	private final static String SOURCE_JPATH = "$.source";
+
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	/**
+	 * Job entry point: parses the arguments described by group_graph_entities_parameters.json, configures Kryo
+	 * serialization for the Oaf model classes, removes the output path and runs the grouping.
+	 *
+	 * @param args the command line arguments: isSparkSessionManaged (optional, defaults to true), graphInputPath,
+	 *             outputPath
+	 * @throws Exception when the argument parsing or the Spark job fails
+	 */
+	public static void main(String[] args) throws Exception {
+
+		final String parametersJson = IOUtils
+			.toString(
+				GroupEntitiesAndRelationsSparkJob.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json"));
+		final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(parametersJson);
+		argumentParser.parseArgument(args);
+
+		// when the flag is not provided the session is considered managed by this job
+		final String managedFlag = argumentParser.get("isSparkSessionManaged");
+		final Boolean isSparkSessionManaged = managedFlag == null ? Boolean.TRUE : Boolean.valueOf(managedFlag);
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = argumentParser.get("graphInputPath");
+		log.info("graphInputPath: {}", inputPath);
+
+		final String targetPath = argumentParser.get("outputPath");
+		log.info("outputPath: {}", targetPath);
+
+		final SparkConf sparkConf = new SparkConf()
+			.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+			.registerKryoClasses(ModelSupport.getOafModelClasses());
+
+		runWithSparkSession(sparkConf, isSparkSessionManaged, spark -> {
+			// start from a clean output location
+			HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
+			groupEntitiesAndRelations(spark, inputPath, targetPath);
+		});
+	}
+
+	/**
+	 * Reads the JSON records found under the input path, groups them by the identifier given by
+	 * {@code ModelSupport.idFn()}, merges each group into a single object and writes one gzip-compressed text line
+	 * per group, shaped as {@code <fully qualified class name>|<json>}.
+	 *
+	 * @param spark the Spark session
+	 * @param inputPath the path whose files are read
+	 * @param outputPath the path where the grouped records are written
+	 */
+	private static void groupEntitiesAndRelations(
+		SparkSession spark,
+		String inputPath,
+		String outputPath) {
+
+		final TypedColumn<Oaf, Oaf> mergeById = new GroupingAggregator().toColumn();
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		// pipeline stages, named after what they do
+		final MapFunction<String, Oaf> fromJson = GroupEntitiesAndRelationsSparkJob::parseOaf;
+		final FilterFunction<Oaf> hasIdentifier = oaf -> StringUtils.isNotBlank(ModelSupport.idFn().apply(oaf));
+		final MapFunction<Oaf, String> identifierOf = oaf -> ModelSupport.idFn().apply(oaf);
+		final MapFunction<Tuple2<String, Oaf>, String> toTypedJson = t -> t._2().getClass().getName() +
+			"|" + OBJECT_MAPPER.writeValueAsString(t._2());
+
+		spark
+			.read()
+			.textFile(toSeq(listPaths(inputPath, sc)))
+			.map(fromJson, Encoders.kryo(Oaf.class))
+			.filter(hasIdentifier)
+			.groupByKey(identifierOf, Encoders.STRING())
+			.agg(mergeById)
+			.map(toTypedJson, Encoders.STRING())
+			.write()
+			.option("compression", "gzip")
+			.mode(SaveMode.Overwrite)
+			.text(outputPath);
+	}
+
+	/**
+	 * Aggregator folding the {@link Oaf} objects of a group into a single one by means of
+	 * {@link OafMapperUtils#merge(Oaf, Oaf)}. The zero value is null and null operands are skipped; buffer and
+	 * output are both Kryo-encoded.
+	 */
+	public static class GroupingAggregator extends Aggregator<Oaf, Oaf, Oaf> {
+
+		@Override
+		public Oaf zero() {
+			return null;
+		}
+
+		@Override
+		public Oaf reduce(Oaf b, Oaf a) {
+			return mergeAndGet(b, a);
+		}
+
+		@Override
+		public Oaf merge(Oaf b, Oaf a) {
+			return mergeAndGet(b, a);
+		}
+
+		@Override
+		public Oaf finish(Oaf j) {
+			return j;
+		}
+
+		@Override
+		public Encoder<Oaf> bufferEncoder() {
+			return Encoders.kryo(Oaf.class);
+		}
+
+		@Override
+		public Encoder<Oaf> outputEncoder() {
+			return Encoders.kryo(Oaf.class);
+		}
+
+		/**
+		 * Merges the two operands when both are available, otherwise returns the one that is not null
+		 * (or null when both are missing).
+		 */
+		private Oaf mergeAndGet(Oaf b, Oaf a) {
+			if (a == null) {
+				return b;
+			}
+			if (b == null) {
+				return a;
+			}
+			return OafMapperUtils.merge(b, a);
+		}
+
+	}
+
+	/**
+	 * Parses a JSON record into the matching {@link Oaf} subclass.
+	 * <p>
+	 * A record with a non-blank {@code $.id} is parsed according to the id prefix, i.e. the text before the first
+	 * {@code |}: 10 for {@link Datasource}, 20 for {@link Organization}, 40 for {@link Project}, 50 for results,
+	 * whose concrete class is chosen by {@code $.resulttype.classid}. A record without id but with a non-blank
+	 * {@code $.source} is parsed as {@link Relation}.
+	 *
+	 * @param s the JSON record
+	 * @return the parsed object
+	 * @throws IllegalArgumentException when the id prefix is not supported, when the result type is missing or not
+	 *         supported, or when the record has neither an id nor a source
+	 */
+	private static Oaf parseOaf(String s) {
+
+		// with SUPPRESS_EXCEPTIONS a missing path is read as null instead of raising an exception
+		DocumentContext dc = JsonPath
+			.parse(s, Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS));
+		final String id = dc.read(ID_JPATH);
+		if (StringUtils.isNotBlank(id)) {
+
+			String prefix = StringUtils.substringBefore(id, "|");
+			switch (prefix) {
+				case "10":
+					return parse(s, Datasource.class);
+				case "20":
+					return parse(s, Organization.class);
+				case "40":
+					return parse(s, Project.class);
+				case "50":
+					String resultType = dc.read("$.resulttype.classid");
+					if (resultType == null) {
+						// switching on a null string would raise a NullPointerException: report the bad record instead
+						throw new IllegalArgumentException(String.format("missing resultType for result: '%s'", id));
+					}
+					switch (resultType) {
+						case "publication":
+							return parse(s, Publication.class);
+						case "dataset":
+							return parse(s, eu.dnetlib.dhp.schema.oaf.Dataset.class);
+						case "software":
+							return parse(s, Software.class);
+						case "other":
+							return parse(s, OtherResearchProduct.class);
+						default:
+							throw new IllegalArgumentException(String.format("invalid resultType: '%s'", resultType));
+					}
+				default:
+					throw new IllegalArgumentException(String.format("invalid id prefix: '%s'", prefix));
+			}
+		} else {
+			// no identifier: the record is accepted only if it looks like a relation
+			String source = dc.read(SOURCE_JPATH);
+			if (StringUtils.isNotBlank(source)) {
+				return parse(s, Relation.class);
+			} else {
+				throw new IllegalArgumentException(String.format("invalid oaf: '%s'", s));
+			}
+		}
+	}
+
+	private static <T extends Oaf> Oaf parse(String s, Class<T> clazz) {
+		try {
+			return OBJECT_MAPPER.readValue(s, clazz);
+		} catch (IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Returns the paths reported by {@code HdfsSupport.listFiles} for the given input path, copied into a new list.
+	 * NOTE(review): what exactly is listed (recursion, files vs. directories) depends on HdfsSupport.listFiles,
+	 * which is not visible here - confirm against its implementation.
+	 *
+	 * @param inputPath the path to inspect
+	 * @param sc the Spark context providing the Hadoop configuration
+	 * @return the listed paths
+	 */
+	private static List<String> listPaths(String inputPath, JavaSparkContext sc) {
+		return HdfsSupport
+			.listFiles(inputPath, sc.hadoopConfiguration())
+			.stream()
+			.collect(Collectors.toList());
+	}
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java
@ -33,9 +33,9 @@ import scala.Tuple2;
 * are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined
 * by eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
 */
-public class MergeGraphSparkJob {
+public class MergeGraphTableSparkJob {

-	private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
+	private static final Logger log = LoggerFactory.getLogger(MergeGraphTableSparkJob.class);

 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@ -258,8 +258,8 @@ public abstract class AbstractMdRecordToOafMapper {

 		r.setCollectedfrom(Arrays.asList(collectedFrom));
 		r.setPid(prepareResultPids(doc, info));
-		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
-		r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
+		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection"));
+		r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation"));
 		r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
 		r.setOaiprovenance(prepareOAIprovenance(doc));
 		r.setAuthor(prepareAuthors(doc, info));
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@ -4,9 +4,11 @@ package eu.dnetlib.dhp.oa.graph.raw;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

 import java.io.IOException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
@ -18,7 +20,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -68,7 +69,7 @@ public class GenerateEntitiesApplication {

 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
-			removeOutputDir(spark, targetPath);
+			HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
 			generateEntities(spark, vocs, sourcePaths, targetPath);
 		});
 	}
@ -82,7 +83,7 @@ public class GenerateEntitiesApplication {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		final List<String> existingSourcePaths = Arrays
 			.stream(sourcePaths.split(","))
-			.filter(p -> exists(sc, p))
+			.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
 			.collect(Collectors.toList());

 		log.info("Generate entities from files:");
@ -103,7 +104,7 @@ public class GenerateEntitiesApplication {

 		inputRdd
 			.mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
-			.reduceByKey((o1, o2) -> merge(o1, o2))
+			.reduceByKey((o1, o2) -> OafMapperUtils.merge(o1, o2))
 			.map(Tuple2::_2)
 			.map(
 				oaf -> oaf.getClass().getSimpleName().toLowerCase()
@ -112,38 +113,6 @@ public class GenerateEntitiesApplication {
 			.saveAsTextFile(targetPath, GzipCodec.class);
 	}

-	private static Oaf merge(final Oaf o1, final Oaf o2) {
-		if (ModelSupport.isSubClass(o1, OafEntity.class)) {
-			if (ModelSupport.isSubClass(o1, Result.class)) {
-
-				return mergeResults((Result) o1, (Result) o2);
-			} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
-				((Datasource) o1).mergeFrom((Datasource) o2);
-			} else if (ModelSupport.isSubClass(o1, Organization.class)) {
-				((Organization) o1).mergeFrom((Organization) o2);
-			} else if (ModelSupport.isSubClass(o1, Project.class)) {
-				((Project) o1).mergeFrom((Project) o2);
-			} else {
-				throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
-			}
-		} else if (ModelSupport.isSubClass(o1, Relation.class)) {
-			((Relation) o1).mergeFrom((Relation) o2);
-		} else {
-			throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
-		}
-		return o1;
-	}
-
-	protected static Result mergeResults(Result r1, Result r2) {
-		if (new ResultTypeComparator().compare(r1, r2) < 0) {
-			r1.mergeFrom(r2);
-			return r1;
-		} else {
-			r2.mergeFrom(r1);
-			return r2;
-		}
-	}
-
 	private static List<Oaf> convertToListOaf(
 		final String id,
 		final String s,
@ -192,17 +161,4 @@ public class GenerateEntitiesApplication {
 		}
 	}

-	private static boolean exists(final JavaSparkContext context, final String pathToFile) {
-		try {
-			final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration());
-			final Path path = new Path(pathToFile);
-			return hdfs.exists(path);
-		} catch (final IOException e) {
-			throw new RuntimeException(e);
-		}
-	}
-
-	private static void removeOutputDir(final SparkSession spark, final String path) {
-		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
-	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java
@ -1,6 +1,7 @@

 package eu.dnetlib.dhp.oa.graph.raw;

+import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
@ -9,25 +10,20 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;

 import java.io.Closeable;
 import java.io.IOException;
@ -442,26 +438,22 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 			createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");

 		try {
-			final String targetType = rs.getString(TARGET_TYPE);
 			if (rs.getString(SOURCE_TYPE).equals("context")) {
 				final Result r;

-				switch (targetType) {
-					case "dataset":
-						r = new Dataset();
-						break;
-					case "software":
-						r = new Software();
-						break;
-					case "other":
-						r = new OtherResearchProduct();
-						break;
-					case "publication":
-					default:
-						r = new Publication();
-						break;
+				if (rs.getString(TARGET_TYPE).equals("dataset")) {
+					r = new Dataset();
+					r.setResulttype(DATASET_DEFAULT_RESULTTYPE);
+				} else if (rs.getString(TARGET_TYPE).equals("software")) {
+					r = new Software();
+					r.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
+				} else if (rs.getString(TARGET_TYPE).equals("other")) {
+					r = new OtherResearchProduct();
+					r.setResulttype(ORP_DEFAULT_RESULTTYPE);
+				} else {
+					r = new Publication();
+					r.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
 				}
-
 				r.setId(createOpenaireId(50, rs.getString("target_id"), false));
 				r.setLastupdatetimestamp(lastUpdateTimestamp);
 				r.setContext(prepareContext(rs.getString("source_id"), info));
@ -471,7 +463,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 				return Arrays.asList(r);
 			} else {
 				final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
-				final String targetId = createOpenaireId(targetType, rs.getString("target_id"), false);
+				final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);

 				final Relation r1 = new Relation();
 				final Relation r2 = new Relation();
@ -527,9 +519,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
 		final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
 		final String inferenceprovenance = rs.getString("inferenceprovenance");
 		final Boolean inferred = rs.getBoolean("inferred");
-		final String trust = rs.getString("trust");
+
+		final double trust = rs.getDouble("trust");
+
 		return dataInfo(
-			deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
+			deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION,
+			String.format("%.3f", trust));
 	}

 	private Qualifier prepareQualifierSplitting(final String s) {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
@ -2,9 +2,7 @@
 package eu.dnetlib.dhp.oa.graph.raw;

 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;

 import java.util.ArrayList;
 import java.util.List;
@ -18,9 +16,9 @@ import org.dom4j.Node;
 import com.google.common.collect.Lists;

 import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;

 public class OafToOafMapper extends AbstractMdRecordToOafMapper {

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@ -2,15 +2,9 @@
 package eu.dnetlib.dhp.oa.graph.raw;

 import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;

-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
 import java.util.stream.Collectors;

 import org.apache.commons.lang3.StringUtils;
@ -18,9 +12,9 @@ import org.dom4j.Document;
 import org.dom4j.Node;

 import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;

 public class OdfToOafMapper extends AbstractMdRecordToOafMapper {

--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -50,12 +50,36 @@
        </property>
    </parameters>

-	<start to="fork_clean_graph"/>
+	<start to="group_entities"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

+    <action name="group_entities">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>group graph entities and relations</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.GroupEntitiesAndRelationsSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--graphInputPath</arg><arg>${graphInputPath}</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/grouped_entities</arg>
+        </spark>
+        <ok to="fork_clean_graph"/>
+        <error to="Kill"/>
+    </action>
+
    <fork name="fork_clean_graph">
        <path start="clean_publication"/>
        <path start="clean_dataset"/>
@ -84,7 +108,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -110,7 +134,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -136,7 +160,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -162,7 +186,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -188,7 +212,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -214,7 +238,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -240,7 +264,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -266,7 +290,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
+            <arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json
@ -0,0 +1,20 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "gin",
+    "paramLongName": "graphInputPath",
+    "paramDescription": "the graph root path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "out",
+    "paramLongName": "outputPath",
+    "paramDescription": "the output merged graph root path",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml
@ -2,11 +2,11 @@

    <parameters>
        <property>
-            <name>betaInputGgraphPath</name>
+            <name>betaInputGraphPath</name>
            <description>the beta graph root path</description>
        </property>
        <property>
-            <name>prodInputGgraphPath</name>
+            <name>prodInputGraphPath</name>
            <description>the production graph root path</description>
        </property>
        <property>
@ -76,7 +76,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge publications</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -88,8 +88,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/publication</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/publication</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/publication</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/publication</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -103,7 +103,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge datasets</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -115,8 +115,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/dataset</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/dataset</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/dataset</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -130,7 +130,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge otherresearchproducts</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -142,8 +142,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/otherresearchproduct</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/otherresearchproduct</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/otherresearchproduct</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -157,7 +157,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge softwares</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -169,8 +169,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/software</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/software</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/software</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/software</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -184,7 +184,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge datasources</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -196,8 +196,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/datasource</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/datasource</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/datasource</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/datasource</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -211,7 +211,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge organizations</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -223,8 +223,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/organization</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/organization</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/organization</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/organization</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -238,7 +238,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge projects</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -250,8 +250,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/project</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/project</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/project</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/project</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--priority</arg><arg>${priority}</arg>
@ -265,7 +265,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge relations</name>
-            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
@ -277,8 +277,8 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
-            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/relation</arg>
-            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/relation</arg>
+            <arg>--betaInputPath</arg><arg>${betaInputGraphPath}/relation</arg>
+            <arg>--prodInputPath</arg><arg>${prodInputGraphPath}/relation</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--priority</arg><arg>${priority}</arg>
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
@ -19,7 +19,10 @@ import org.mockito.junit.jupiter.MockitoExtension;
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJobTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJobTest.java
@ -15,7 +15,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.schema.oaf.Datasource;

-public class MergeGraphSparkJobTest {
+public class MergeGraphTableSparkJobTest {

 	private ObjectMapper mapper;

@ -28,7 +28,7 @@ public class MergeGraphSparkJobTest {
 	public void testMergeDatasources() throws IOException {
 		assertEquals(
 			"openaire-cris_1.1",
-			MergeGraphSparkJob
+			MergeGraphTableSparkJob
 				.mergeDatasource(
 					d("datasource_cris.json"),
 					d("datasource_UNKNOWN.json"))
@ -36,7 +36,7 @@ public class MergeGraphSparkJobTest {
 				.getClassid());
 		assertEquals(
 			"openaire-cris_1.1",
-			MergeGraphSparkJob
+			MergeGraphTableSparkJob
 				.mergeDatasource(
 					d("datasource_UNKNOWN.json"),
 					d("datasource_cris.json"))
@ -44,7 +44,7 @@ public class MergeGraphSparkJobTest {
 				.getClassid());
 		assertEquals(
 			"driver-openaire2.0",
-			MergeGraphSparkJob
+			MergeGraphTableSparkJob
 				.mergeDatasource(
 					d("datasource_native.json"),
 					d("datasource_driver-openaire2.0.json"))
@ -52,7 +52,7 @@ public class MergeGraphSparkJobTest {
 				.getClassid());
 		assertEquals(
 			"driver-openaire2.0",
-			MergeGraphSparkJob
+			MergeGraphTableSparkJob
 				.mergeDatasource(
 					d("datasource_driver-openaire2.0.json"),
 					d("datasource_native.json"))
@ -60,7 +60,7 @@ public class MergeGraphSparkJobTest {
 				.getClassid());
 		assertEquals(
 			"openaire4.0",
-			MergeGraphSparkJob
+			MergeGraphTableSparkJob
 				.mergeDatasource(
 					d("datasource_notCompatible.json"),
 					d("datasource_openaire4.0.json"))
@ -68,7 +68,7 @@ public class MergeGraphSparkJobTest {
 				.getClassid());
 		assertEquals(
 			"notCompatible",
-			MergeGraphSparkJob
+			MergeGraphTableSparkJob
 				.mergeDatasource(
 					d("datasource_notCompatible.json"),
 					d("datasource_UNKNOWN.json"))
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
@ -70,7 +70,7 @@ public class GenerateEntitiesApplicationTest {

 	protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
 		String resultType) {
-		final Result merge = GenerateEntitiesApplication.mergeResults(publication, dataset);
+		final Result merge = OafMapperUtils.mergeResults(publication, dataset);
 		assertTrue(clazz.isAssignableFrom(merge.getClass()));
 		assertEquals(resultType, merge.getResulttype().getClassid());
 	}
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@ -72,6 +72,8 @@ public class MappersTest {
 		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
 		assertFalse(p.getDataInfo().getInvisible());
 		assertTrue(p.getSource().size() == 1);
+		assertTrue(StringUtils.isNotBlank(p.getDateofcollection()));
+		assertTrue(StringUtils.isNotBlank(p.getDateoftransformation()));

 		assertTrue(p.getAuthor().size() > 0);
 		final Optional<Author> author = p
@ -317,7 +319,7 @@ public class MappersTest {
 	@Test
 	void testODFRecord() throws IOException {
 		final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml"));
-		List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
+		final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
 		System.out.println("***************");
 		System.out.println(new ObjectMapper().writeValueAsString(list));
 		System.out.println("***************");
@ -328,6 +330,22 @@ public class MappersTest {
 		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
 	}

+	@Test
+	void testTextGrid() throws IOException { // maps the textgrid.xml fixture with OdfToOafMapper and checks the first mapped entity
+		final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml"));
+		final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
+
+		System.out.println("***************"); // dump all mapped entities as JSON between two marker lines
+		System.out.println(new ObjectMapper().writeValueAsString(list));
+		System.out.println("***************");
+
+		final Dataset p = (Dataset) list.get(0); // the cast throws, failing the test, if the first entity is not a Dataset
+		assertValidId(p.getId());
+		assertValidId(p.getCollectedfrom().get(0).getKey());
+		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); // only the first title is checked
+		System.out.println(p.getTitle().get(0).getValue());
+	}
+
 	private void assertValidId(final String id) {
 		assertEquals(49, id.length());
 		assertEquals('|', id.charAt(2));
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
@ -28,13 +28,7 @@ import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.Datasource;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
-import eu.dnetlib.dhp.schema.oaf.Organization;
-import eu.dnetlib.dhp.schema.oaf.Project;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.*;

@ExtendWith(MockitoExtension.class)
 public class MigrateDbEntitiesApplicationTest {
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasourceorganization_resultset_entry.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasourceorganization_resultset_entry.json
@ -31,8 +31,8 @@
 	},
 	{
 		"field": "trust",
-		"type": "string",
-		"value": "0.9"
+		"type": "double",
+		"value": 0.9
 	},
 	{
 		"field": "inferenceprovenance",
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/datasources_resultset_entry.json
@ -114,8 +114,8 @@
 	},
 	{
 		"field": "trust",
-		"type": "string",
-		"value": "0.9"
+		"type": "double",
+		"value": 0.9
 	},
 	{
 		"field": "inferenceprovenance",
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml
@ -7,13 +7,12 @@
  <header xmlns="http://namespace.openaire.eu/">
    <dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
    <dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
-    <dri:dateOfCollection/>
    <dri:mdFormat/>
    <dri:mdFormatInterpretation/>
    <dri:repositoryId/>
    <dr:objectIdentifier/>
-    <dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
-    <dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
+    <dri:dateOfCollection>2020-03-23T00:20:51.392Z</dri:dateOfCollection>
+    <dri:dateOfTransformation>2020-03-23T00:26:59.078Z</dri:dateOfTransformation>
    <oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
  </header>
  <metadata xmlns="http://namespace.openaire.eu/">
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/organizations_resultset_entry.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/organizations_resultset_entry.json
@ -96,8 +96,8 @@
 	},
 	{
 		"field": "trust",
-		"type": "string",
-		"value": "0.9"
+		"type": "double",
+		"value": 0.9
 	},
 	{
 		"field": "inferenceprovenance",
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json
@ -41,8 +41,8 @@
 	},
 	{
 		"field": "trust",
-		"type": "string",
-		"value": "0.9"
+		"type": "double",
+		"value": 0.9
 	},
 	{
 		"field": "inferenceprovenance",
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projects_resultset_entry.json
@ -86,8 +86,8 @@
 	},
 	{
 		"field": "trust",
-		"type": "string",
-		"value": "0.9"
+		"type": "double",
+		"value": 0.9
 	},
 	{
 		"field": "inferenceprovenance",
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
+        xmlns:oaf="http://namespace.openaire.eu/oaf"
+        xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <oai:header xmlns="http://namespace.openaire.eu/"
+                xmlns:dc="http://purl.org/dc/elements/1.1/"
+                xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
+        <dri:objIdentifier>r3f52792889d::000051aa1f61d77d2c0b340091f8024e</dri:objIdentifier>
+        <dri:recordIdentifier>textgrid:q9cv.0</dri:recordIdentifier>
+        <dri:dateOfCollection>2020-11-17T09:34:11.128+01:00</dri:dateOfCollection>
+        <oaf:datasourceprefix>r3f52792889d</oaf:datasourceprefix>
+        <identifier xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:q9cv.0</identifier>
+        <datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2012-01-21T13:35:20Z</datestamp>
+        <dr:dateOfTransformation>2020-11-17T09:46:21.551+01:00</dr:dateOfTransformation>
+    </oai:header>
+    <metadata>
+        <datacite:resource xmlns="http://www.openarchives.org/OAI/2.0/"
+                           xmlns:datacite="http://datacite.org/schema/kernel-3"
+                           xmlns:dc="http://purl.org/dc/elements/1.1/"
+                           xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
+            <datacite:identifier identifierType="Handle">hdl:11858/00-1734-0000-0003-7664-F</datacite:identifier>
+            <datacite:creators>
+                <datacite:creator>
+                    <datacite:creatorName>Hoffmann von Fallersleben, August Heinrich</datacite:creatorName>
+                    <datacite:nameIdentifier nameIdentifierScheme="pnd" schemeURI="https://de.dariah.eu/pnd-service">118552589</datacite:nameIdentifier>
+                </datacite:creator>
+            </datacite:creators>
+            <datacite:titles>
+                <datacite:title titleType="Other">Mailied</datacite:title>
+                <datacite:title titleType="Other">August Heinrich Hoffmann von Fallersleben: Unpolitische Lieder von Hoffmann von Fallersleben, 1. + 2. Theil, 1. Theil, Hamburg: Hoffmann und Campe, 1841.</datacite:title>
+            </datacite:titles>
+            <datacite:publisher>TextGrid</datacite:publisher>
+            <datacite:publicationYear>2012</datacite:publicationYear>
+            <datacite:contributors>
+                <datacite:contributor contributorType="DataManager">
+                    <datacite:contributorName>tvitt@textgrid.de</datacite:contributorName>
+                </datacite:contributor>
+                <datacite:contributor contributorType="Other">
+                    <datacite:contributorName>Digitale Bibliothek</datacite:contributorName>
+                    <datacite:nameIdentifier nameIdentifierScheme="textgrid">TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c</datacite:nameIdentifier>
+                </datacite:contributor>
+            </datacite:contributors>
+            <datacite:dates>
+                <datacite:date dateType="Created">2012-01-21T13:35:20Z</datacite:date>
+                <datacite:date dateType="Issued">2012-01-21T13:35:20Z</datacite:date>
+                <datacite:date dateType="Updated">2012-01-21T13:35:20Z</datacite:date>
+            </datacite:dates>
+            <datacite:resourceType resourceTypeGeneral="Dataset"/>
+            <alternateIdentifiers>
+                <datacite:alternateIdentifier alternateIdentifierType="URI">textgrid:q9cv.0</datacite:alternateIdentifier>
+                <alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/hdl:11858/00-1734-0000-0003-7664-F</alternateIdentifier>
+            </alternateIdentifiers>
+            <datacite:relatedIdentifiers>
+                <datacite:relatedIdentifier relatedIdentifierType="Handle" relationType="IsPartOf">hdl:11858/00-1734-0000-0003-7666-B</datacite:relatedIdentifier>
+            </datacite:relatedIdentifiers>
+            <datacite:sizes>
+                <datacite:size>527 Bytes</datacite:size>
+            </datacite:sizes>
+            <datacite:formats>
+                <datacite:format>text/tg.edition+tg.aggregation+xml</datacite:format>
+            </datacite:formats>
+            <datacite:version>0</datacite:version>
+            <datacite:rightsList>
+                <datacite:rights rightsURI="http://creativecommons.org/licenses/by/3.0/de/legalcode"> Der annotierte Datenbestand der Digitalen Bibliothek inklusive
+                    Metadaten sowie davon einzeln zugängliche Teile sind eine Abwandlung
+                    des Datenbestandes von www.editura.de durch TextGrid und werden
+                    unter der Lizenz Creative Commons Namensnennung 3.0 Deutschland
+                    Lizenz (by-Nennung TextGrid) veröffentlicht. Die Lizenz bezieht sich
+                    nicht auf die der Annotation zu Grunde liegenden allgemeinfreien
+                    Texte (Siehe auch Punkt 2 der Lizenzbestimmungen).</datacite:rights>
+                <datacite:rights rightsURI="info:eu-repo/semantics/openAccess"/>
+            </datacite:rightsList>
+            <datacite:descriptions>
+                <datacite:description descriptionType="Abstract"/>
+            </datacite:descriptions>
+            <datacite:geoLocations>
+                <datacite:geoLocation>
+                    <datacite:geoLocationPlace
+                            xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Hamburg</datacite:geoLocationPlace>
+                </datacite:geoLocation>
+            </datacite:geoLocations>
+        </datacite:resource>
+        <oaf:identifier identifierType="handle">hdl:11858/00-1734-0000-0003-7664-F</oaf:identifier>
+        <dr:CobjCategory type="dataset">0021</dr:CobjCategory>
+        <oaf:refereed>0002</oaf:refereed>
+        <oaf:dateAccepted>2012-01-01</oaf:dateAccepted>
+        <oaf:accessrights>OPEN</oaf:accessrights>
+        <oaf:license>http://creativecommons.org/licenses/by/3.0/de/legalcode</oaf:license>
+        <oaf:language>und</oaf:language>
+        <oaf:hostedBy id="re3data_____::r3d100011365" name="TextGrid Repository"/>
+        <oaf:collectedFrom id="re3data_____::r3d100011365" name="TextGrid Repository"/>
+    </metadata>
+    <about xmlns:dc="http://purl.org/dc/elements/1.1/"
+           xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
+        <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
+            <originDescription altered="true" harvestDate="2020-11-17T09:34:11.128+01:00">
+                <baseURL>https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai</baseURL>
+                <identifier>textgrid:q9cv.0</identifier>
+                <datestamp>2012-01-21T13:35:20Z</datestamp>
+                <metadataNamespace>http://schema.datacite.org/oai/oai-1.0/</metadataNamespace>
+            </originDescription>
+        </provenance>
+        <oaf:datainfo>
+            <oaf:inferred>false</oaf:inferred>
+            <oaf:deletedbyinference>false</oaf:deletedbyinference>
+            <oaf:trust>0.9</oaf:trust>
+            <oaf:inferenceprovenance/>
+            <oaf:provenanceaction classid="sysimport:crosswalk"
+                                  classname="sysimport:crosswalk"
+                                  schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
+        </oaf:datainfo>
+    </about>
+</record>
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@ -22,6 +22,12 @@
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-api</artifactId>
+                </exclusion>
+            </exclusions>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
@ -82,9 +88,6 @@
                    <groupId>org.codehaus.woodstox</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
-
-
-
                <exclusion>
                    <groupId>com.github.ben-manes.caffeine</groupId>
                    <artifactId>*</artifactId>
@ -109,11 +112,10 @@
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
-
-
-
-
-
+                <exclusion>
+                    <groupId>org.apache.zookeeper</groupId>
+                    <artifactId>zookeeper</artifactId>
+                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+/**
+ * Constants and naming helpers shared by the provision jobs.
+ */
+public class ProvisionConstants {
+
+	/** Layout name; it is also the middle segment of a collection name. */
+	public static final String LAYOUT = "index";
+
+	/** Interpretation name; it is the last segment of a collection name. */
+	public static final String INTERPRETATION = "openaire";
+
+	/** Delimiter placed between the segments of a collection name. */
+	public static final String SEPARATOR = "-";
+
+	/**
+	 * Builds the collection name associated to a metadata format.
+	 *
+	 * @param format the metadata format name, used as the first segment
+	 * @return the name in the form {@code <format>-index-openaire}
+	 */
+	public static String getCollectionName(String format) {
+		return String.join(SEPARATOR, format, LAYOUT, INTERPRETATION);
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
@ -14,11 +14,12 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.ZkServers;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

-public class SolrAdminApplication extends SolrApplication implements Closeable {
+public class SolrAdminApplication implements Closeable {

 	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);

@ -54,12 +55,12 @@ public class SolrAdminApplication extends SolrApplication implements Closeable {
 			.orElse(false);
 		log.info("commit: {}", commit);

-		final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
+		final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));

-		final String zkHost = getZkHost(isLookup);
+		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);

-		final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
+		final String collection = ProvisionConstants.getCollectionName(format);
 		log.info("collection: {}", collection);

 		try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrApplication.java
@ -1,40 +0,0 @@
-
-package eu.dnetlib.dhp.oa.provision;
-
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-
-public abstract class SolrApplication {
-
-	private static final Logger log = LoggerFactory.getLogger(SolrApplication.class);
-
-	protected static final String LAYOUT = "index";
-	protected static final String INTERPRETATION = "openaire";
-	protected static final String SEPARATOR = "-";
-	protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
-
-	/**
-	 * Method retrieves from the information system the zookeeper quorum of the Solr server
-	 *
-	 * @param isLookup
-	 * @return the zookeeper quorum of the Solr server
-	 * @throws ISLookUpException
-	 */
-	protected static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
-		return doLookup(
-			isLookup,
-			"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
-	}
-
-	protected static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
-		log.info(String.format("running xquery: %s", xquery));
-		final String res = isLookup.getResourceProfileByQuery(xquery);
-		log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
-		return res;
-	}
-
-}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java
@ -2,12 +2,11 @@
 package eu.dnetlib.dhp.oa.provision;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;

-import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
-import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
@ -28,13 +27,11 @@ import com.google.common.collect.Maps;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.oa.provision.model.*;
+import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
+import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
 import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
 import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
-import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;
-import scala.collection.JavaConverters;
-import scala.collection.Seq;

 /**
 * XmlConverterJob converts the JoinedEntities as XML records
@ -43,8 +40,6 @@ public class XmlConverterJob {

 	private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);

-	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-
 	public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";

 	public static void main(String[] args) throws Exception {
@ -129,10 +124,6 @@ public class XmlConverterJob {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}

-	private static Seq<String> toSeq(List<String> list) {
-		return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
-	}
-
 	private static Map<String, LongAccumulator> prepareAccumulators(SparkContext sc) {
 		Map<String, LongAccumulator> accumulators = Maps.newHashMap();
 		accumulators
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
@ -20,27 +20,42 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.lucidworks.spark.util.SolrSupport;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

-public class XmlIndexingJob extends SolrApplication {
+public class XmlIndexingJob {

 	private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class);

 	private static final Integer DEFAULT_BATCH_SIZE = 1000;

+	/**
+	 * {@link SimpleDateFormat} pattern used for the record datestamp. The hour field is {@code HH} (hour of day, 0-23):
+	 * {@code hh} is the 12-hour clock and, with no am/pm marker in the pattern, cannot distinguish morning from
+	 * afternoon. NOTE(review): {@code 'Z'} is a quoted literal, so the value is UTC only if the formatter's time zone
+	 * is UTC - confirm where the formatter is created.
+	 */
+	protected static final String DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";
+
+	private String inputPath;
+
+	private String format;
+
+	private int batchSize;
+
+	private String outputPath;
+
+	private SparkSession spark;
+
 	public static void main(String[] args) throws Exception {

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -60,27 +75,53 @@ public class XmlIndexingJob extends SolrApplication {
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);

-		final String isLookupUrl = parser.get("isLookupUrl");
-		log.info("isLookupUrl: {}", isLookupUrl);
-
 		final String format = parser.get("format");
 		log.info("format: {}", format);

+		final String outputPath = Optional
+			.ofNullable(parser.get("outputPath"))
+			.orElse(null);
+		log.info("outputPath: {}", outputPath);
+
 		final Integer batchSize = parser.getObjectMap().containsKey("batchSize")
 			? Integer.valueOf(parser.get("batchSize"))
 			: DEFAULT_BATCH_SIZE;
 		log.info("batchSize: {}", batchSize);

-		final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
-		final String fields = getLayoutSource(isLookup, format);
+		final SparkConf conf = new SparkConf();
+		conf.registerKryoClasses(new Class[] {
+			SerializableSolrInputDocument.class
+		});
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				final String isLookupUrl = parser.get("isLookupUrl");
+				log.info("isLookupUrl: {}", isLookupUrl);
+				final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
+				new XmlIndexingJob(spark, inputPath, format, batchSize, outputPath).run(isLookup);
+			});
+	}
+
+	/**
+	 * Creates an indexing job bound to the given Spark session.
+	 *
+	 * @param spark the Spark session used to read the input records and, when an output path is set, to write the documents
+	 * @param inputPath path of the sequence file holding the XML records
+	 * @param format the metadata format name, used for the information system lookups and to derive the collection name
+	 * @param batchSize size of the batch of documents sent to Solr; {@code null} selects the default batch size
+	 * @param outputPath when not blank, the documents are stored under this path instead of being sent to Solr
+	 */
+	public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize, String outputPath) {
+		this.spark = spark;
+		this.inputPath = inputPath;
+		this.format = format;
+		// the field is a primitive int: unboxing a null Integer would throw, so fall back to the default
+		this.batchSize = batchSize != null ? batchSize : DEFAULT_BATCH_SIZE;
+		this.outputPath = outputPath;
+	}
+
+	public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
+		final String fields = isLookup.getLayoutSource(format);
 		log.info("fields: {}", fields);

-		final String xslt = getLayoutTransformer(isLookup);
+		final String xslt = isLookup.getLayoutTransformer();

-		final String dsId = getDsId(format, isLookup);
+		final String dsId = isLookup.getDsId(format);
 		log.info("dsId: {}", dsId);

-		final String zkHost = getZkHost(isLookup);
+		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);

 		final String version = getRecordDatestamp();
@ -88,24 +129,26 @@ public class XmlIndexingJob extends SolrApplication {
 		final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
 		log.info("indexRecordTransformer {}", indexRecordXslt);

-		final SparkConf conf = new SparkConf();
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+		JavaRDD<SolrInputDocument> docs = sc
+			.sequenceFile(inputPath, Text.class, Text.class)
+			.map(t -> t._2().toString())
+			.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
+			.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s));

-				RDD<SolrInputDocument> docs = sc
-					.sequenceFile(inputPath, Text.class, Text.class)
-					.map(t -> t._2().toString())
-					.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
-					.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
-					.rdd();
-
-				final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
-				SolrSupport.indexDocs(zkHost, collection, batchSize, docs);
-			});
+		if (StringUtils.isNotBlank(outputPath)) {
+			spark
+				.createDataset(
+					docs.map(s -> new SerializableSolrInputDocument(s)).rdd(),
+					Encoders.kryo(SerializableSolrInputDocument.class))
+				.write()
+				.mode(SaveMode.Overwrite)
+				.parquet(outputPath);
+		} else {
+			final String collection = ProvisionConstants.getCollectionName(format);
+			SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
+		}
 	}

 	protected static String toIndexRecord(Transformer tr, final String record) {
@ -151,56 +194,4 @@ public class XmlIndexingJob extends SolrApplication {
 		return new SimpleDateFormat(DATE_FORMAT).format(new Date());
 	}

-	/**
-	 * Method retrieves from the information system the list of fields associated to the given MDFormat name
-	 *
-	 * @param isLookup the ISLookup service stub
-	 * @param format the Metadata format name
-	 * @return the string representation of the list of fields to be indexed
-	 * @throws ISLookUpDocumentNotFoundException
-	 * @throws ISLookUpException
-	 */
-	private static String getLayoutSource(final ISLookUpService isLookup, final String format)
-		throws ISLookUpDocumentNotFoundException, ISLookUpException {
-		return doLookup(
-			isLookup,
-			String
-				.format(
-					"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
-					format, LAYOUT));
-	}
-
-	/**
-	 * Method retrieves from the information system the openaireLayoutToRecordStylesheet
-	 *
-	 * @param isLookup the ISLookup service stub
-	 * @return the string representation of the XSLT contained in the transformation rule profile
-	 * @throws ISLookUpDocumentNotFoundException
-	 * @throws ISLookUpException
-	 */
-	private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
-		return doLookup(
-			isLookup,
-			"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
-				+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
-	}
-
-	/**
-	 * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
-	 *
-	 * @param format
-	 * @param isLookup
-	 * @return the IndexDS identifier
-	 * @throws ISLookUpException
-	 */
-	private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
-		return doLookup(
-			isLookup,
-			String
-				.format(
-					"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
-						+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
-					format));
-	}
-
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SerializableSolrInputDocument.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SerializableSolrInputDocument.java
@ -0,0 +1,23 @@
+
+package eu.dnetlib.dhp.oa.provision.model;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+
+/**
+ * {@link SolrInputDocument} subclass needed to make the documents compatible with the Kryo serialization mechanism. On
+ * top of the parent class it only declares a no-argument constructor and a constructor accepting the field map.
+ */
+public class SerializableSolrInputDocument extends SolrInputDocument {
+
+	/**
+	 * Creates a document without fields, backed by a new {@link HashMap}.
+	 */
+	public SerializableSolrInputDocument() {
+		this(new HashMap<String, SolrInputField>());
+	}
+
+	/**
+	 * Creates a document on top of the given fields.
+	 *
+	 * @param fields the fields, by name, handed over to the parent constructor
+	 */
+	public SerializableSolrInputDocument(Map<String, SolrInputField> fields) {
+		super(fields);
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ISLookupClient.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ISLookupClient.java
@ -0,0 +1,95 @@
+
+package eu.dnetlib.dhp.oa.provision.utils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+/**
+ * Thin client around an {@link ISLookUpService}: each public lookup builds one XQuery, runs it through the service and
+ * returns the raw string result.
+ */
+public class ISLookupClient {
+
+	private static final Logger log = LoggerFactory.getLogger(ISLookupClient.class);
+
+	private ISLookUpService isLookup;
+
+	/**
+	 * @param isLookup the ISLookup service stub used to run the queries
+	 */
+	public ISLookupClient(ISLookUpService isLookup) {
+		this.isLookup = isLookup;
+	}
+
+	/**
+	 * Method retrieves from the information system the list of fields associated to the given MDFormat name
+	 *
+	 * @param format the Metadata format name; it is placed in the query as is, without escaping
+	 * @return the string representation of the list of fields to be indexed
+	 * @throws ISLookUpDocumentNotFoundException
+	 * @throws ISLookUpException
+	 */
+	public String getLayoutSource(final String format)
+		throws ISLookUpDocumentNotFoundException, ISLookUpException {
+		return doLookup(
+			String
+				.format(
+					"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
+					format, ProvisionConstants.LAYOUT));
+	}
+
+	/**
+	 * Method retrieves from the information system the openaireLayoutToRecordStylesheet
+	 *
+	 * @return the string representation of the XSLT contained in the transformation rule profile
+	 * @throws ISLookUpException
+	 */
+	public String getLayoutTransformer() throws ISLookUpException {
+		return doLookup(
+			"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
+				+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
+	}
+
+	/**
+	 * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
+	 *
+	 * @param format the Metadata format name; it is placed in the query as is, without escaping
+	 * @return the IndexDS identifier
+	 * @throws ISLookUpException
+	 */
+	public String getDsId(String format) throws ISLookUpException {
+		return doLookup(
+			String
+				.format(
+					"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
+						+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
+					format));
+	}
+
+	/**
+	 * Method retrieves from the information system the zookeeper quorum of the Solr server
+	 *
+	 * @return the zookeeper quorum of the Solr server
+	 * @throws ISLookUpException
+	 */
+	public String getZkHost() throws ISLookUpException {
+		return doLookup(
+			"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
+	}
+
+	/**
+	 * Runs the given XQuery against the lookup service, logging the query and the first 100 characters of the response.
+	 *
+	 * @param xquery the query to run
+	 * @return the response of the lookup service, unchanged
+	 * @throws ISLookUpException
+	 */
+	private String doLookup(String xquery) throws ISLookUpException {
+		log.info("running xquery: {}", xquery);
+		final String res = getIsLookup().getResourceProfileByQuery(xquery);
+		// the response is truncated in the log only, the caller receives it in full
+		log.info("got response (100 chars): {} ...", StringUtils.left(res, 100));
+		return res;
+	}
+
+	public ISLookUpService getIsLookup() {
+		return isLookup;
+	}
+
+	public void setIsLookup(ISLookUpService isLookup) {
+		this.isLookup = isLookup;
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java
@ -46,11 +46,6 @@ public class StreamingInputDocumentFactory {

 	private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";

-	private static final String outFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
-
-	private static final List<String> dateFormats = Arrays
-		.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
-
 	private static final String DEFAULTDNETRESULT = "dnetResult";

 	private static final String TARGETFIELDS = "targetFields";
@ -125,13 +120,12 @@ public class StreamingInputDocumentFactory {
 			}

 			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
-				indexDocument.clear();
-				System.err.println("missing indexrecord id:\n" + inputDocument);
+				throw new IllegalStateException("cannot extract record ID from: " + inputDocument);
 			}

 			return indexDocument;
 		} catch (XMLStreamException e) {
-			return new SolrInputDocument();
+			throw new IllegalStateException(e);
 		}
 	}

--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json
@ -22,5 +22,11 @@
    "paramLongName": "batchSize",
    "paramDescription": "size of the batch of documents sent to solr",
    "paramRequired": false
+  },
+  {
+    "paramName": "o",
+    "paramLongName": "outputPath",
+    "paramDescription": "path on hdfs activating an alternative output for the SolrInputDocuments",
+    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -638,6 +638,7 @@
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--format</arg><arg>${format}</arg>
            <arg>--batchSize</arg><arg>${batchSize}</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="commit_solr_collection"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
@ -1,107 +1,18 @@

 package eu.dnetlib.dhp.oa.provision;

-import java.io.File;
-import java.nio.file.Path;
-
-import org.apache.solr.client.solrj.SolrResponse;
-import org.apache.solr.client.solrj.embedded.JettyConfig;
-import org.apache.solr.client.solrj.impl.CloudSolrClient;
-import org.apache.solr.client.solrj.impl.XMLResponseParser;
-import org.apache.solr.client.solrj.request.CollectionAdminRequest;
-import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
-import org.apache.solr.client.solrj.request.QueryRequest;
-import org.apache.solr.client.solrj.request.RequestWriter;
-import org.apache.solr.client.solrj.response.CollectionAdminResponse;
-import org.apache.solr.client.solrj.response.ConfigSetAdminResponse;
 import org.apache.solr.client.solrj.response.SolrPingResponse;
 import org.apache.solr.client.solrj.response.UpdateResponse;
-import org.apache.solr.cloud.MiniSolrCloudCluster;
-import org.apache.solr.common.params.CollectionParams;
-import org.apache.solr.common.params.CoreAdminParams;
-import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.common.util.NamedList;
-import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import junit.framework.Assert;

-public class SolrAdminApplicationTest {
-
-	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplicationTest.class);
-	public static final String DEFAULT_COLLECTION = "testCollection";
-	public static final String CONFIG_NAME = "testConfig";
-
-	private static MiniSolrCloudCluster miniCluster;
-	private static CloudSolrClient cloudSolrClient;
-
-	@TempDir
-	public static Path tempDir;
-
-	@BeforeAll
-	public static void setup() throws Exception {
-
-		// random unassigned HTTP port
-		final int jettyPort = 0;
-
-		final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
-
-		// create a MiniSolrCloudCluster instance
-		miniCluster = new MiniSolrCloudCluster(2, tempDir, jettyConfig);
-
-		// Upload Solr configuration directory to ZooKeeper
-		String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
-		File configDir = new File(solrZKConfigDir);
-
-		miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
-
-		// override settings in the solrconfig include
-		System.setProperty("solr.tests.maxBufferedDocs", "100000");
-		System.setProperty("solr.tests.maxIndexingThreads", "-1");
-		System.setProperty("solr.tests.ramBufferSizeMB", "100");
-
-		// use non-test classes so RandomizedRunner isn't necessary
-		System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
-		System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
-
-		cloudSolrClient = miniCluster.getSolrClient();
-		cloudSolrClient.setRequestWriter(new RequestWriter());
-		cloudSolrClient.setParser(new XMLResponseParser());
-		cloudSolrClient.setDefaultCollection(DEFAULT_COLLECTION);
-		cloudSolrClient.connect();
-
-		log.info(new ConfigSetAdminRequest.List().process(cloudSolrClient).toString());
-		log.info(CollectionAdminRequest.ClusterStatus.getClusterStatus().process(cloudSolrClient).toString());
-
-		createCollection(cloudSolrClient, DEFAULT_COLLECTION, 2, 1, CONFIG_NAME);
-	}
-
-	@AfterAll
-	public static void shutDown() throws Exception {
-		miniCluster.shutdown();
-	}
-
-	protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
-		int replicationFactor, String configName) throws Exception {
-		ModifiableSolrParams modParams = new ModifiableSolrParams();
-		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
-		modParams.set("name", name);
-		modParams.set("numShards", numShards);
-		modParams.set("replicationFactor", replicationFactor);
-		modParams.set("collection.configName", configName);
-		QueryRequest request = new QueryRequest(modParams);
-		request.setPath("/admin/collections");
-		return client.request(request);
-	}
+public class SolrAdminApplicationTest extends SolrTest {

 	@Test
 	public void testPing() throws Exception {
-		SolrPingResponse pingResponse = cloudSolrClient.ping();
+		SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
 		log.info("pingResponse: '{}'", pingResponse.getStatus());
 		Assert.assertTrue(pingResponse.getStatus() == 0);
 	}
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrTest.java
@ -0,0 +1,109 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.File;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.client.solrj.embedded.JettyConfig;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.cloud.MiniSolrCloudCluster;
+import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.params.CoreAdminParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.io.TempDir;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Base class for the tests that need a Solr instance: before the tests of a class run it starts a
+ * {@link MiniSolrCloudCluster} under a temporary working directory, uploads the test configuration and creates the
+ * default collection; afterwards it shuts the cluster down and removes the working directory.
+ */
+public abstract class SolrTest {
+
+	protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
+
+	protected static final String FORMAT = "test";
+	protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
+	protected static final String CONFIG_NAME = "testConfig";
+
+	protected static MiniSolrCloudCluster miniCluster;
+
+	@TempDir
+	public static Path workingDir;
+
+	/**
+	 * Starts the cluster, uploads the configuration set and creates {@link #DEFAULT_COLLECTION}.
+	 *
+	 * @throws Exception when the cluster cannot be started or configured
+	 */
+	@BeforeAll
+	public static void setup() throws Exception {
+
+		// random unassigned HTTP port
+		final int jettyPort = 0;
+		final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
+
+		log.info("working directory: {}", workingDir);
+		System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
+
+		// create a MiniSolrCloudCluster instance
+		miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
+
+		// Upload Solr configuration directory to ZooKeeper
+		String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
+		File configDir = new File(solrZKConfigDir);
+
+		miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
+
+		// override settings in the solrconfig include
+		System.setProperty("solr.tests.maxBufferedDocs", "100000");
+		System.setProperty("solr.tests.maxIndexingThreads", "-1");
+		System.setProperty("solr.tests.ramBufferSizeMB", "100");
+
+		// use non-test classes so RandomizedRunner isn't necessary
+		System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
+		System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
+		System.setProperty("solr.lock.type", "single");
+
+		log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
+		logClusterStatus();
+
+		NamedList<Object> res = createCollection(
+			miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
+		res.forEach(o -> log.info(o.toString()));
+
+		miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
+
+		// logged a second time to show the state once the collection exists
+		logClusterStatus();
+
+	}
+
+	/**
+	 * Stops the cluster and deletes the working directory.
+	 *
+	 * @throws Exception when the shutdown or the cleanup fails
+	 */
+	@AfterAll
+	public static void shutDown() throws Exception {
+		try {
+			// setup() may have failed before the cluster was created
+			if (miniCluster != null) {
+				miniCluster.shutdown();
+			}
+		} finally {
+			// clean up even when the shutdown fails
+			FileUtils.deleteDirectory(workingDir.toFile());
+		}
+	}
+
+	/**
+	 * Sends a collection CREATE request to the collections admin endpoint.
+	 *
+	 * @param client the client used to send the request
+	 * @param name value of the {@code name} request parameter
+	 * @param numShards value of the {@code numShards} request parameter
+	 * @param replicationFactor value of the {@code replicationFactor} request parameter
+	 * @param maxShardsPerNode value of the {@code maxShardsPerNode} request parameter
+	 * @param configName value of the {@code collection.configName} request parameter
+	 * @return the response returned by the client
+	 * @throws Exception when the request fails
+	 */
+	protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
+		int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
+		ModifiableSolrParams modParams = new ModifiableSolrParams();
+		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
+		modParams.set("name", name);
+		modParams.set("numShards", numShards);
+		modParams.set("replicationFactor", replicationFactor);
+		modParams.set("collection.configName", configName);
+		modParams.set("maxShardsPerNode", maxShardsPerNode);
+		QueryRequest request = new QueryRequest(modParams);
+		request.setPath("/admin/collections");
+		return client.request(request);
+	}
+
+	/**
+	 * Logs the cluster status as reported by the cluster's Solr client.
+	 */
+	private static void logClusterStatus() throws Exception {
+		log
+			.info(
+				CollectionAdminRequest.ClusterStatus
+					.getClusterStatus()
+					.process(miniCluster.getSolrClient())
+					.toString());
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java
@ -0,0 +1,147 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.io.SAXReader;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+/**
+ * Runs {@link XmlIndexingJob} with a local Spark session and a mocked {@link ISLookupClient},
+ * using the {@code miniCluster} and {@code workingDir} inherited from {@link SolrTest}.
+ */
+@ExtendWith(MockitoExtension.class)
+public class XmlIndexingJobTest extends SolrTest {
+
+	protected static SparkSession spark;
+
+	private static final Integer batchSize = 100;
+
+	@Mock
+	private ISLookUpService isLookUpService;
+
+	@Mock
+	private ISLookupClient isLookupClient;
+
+	/**
+	 * Stubs the lookup client so that the job receives the ZooKeeper address of the mini cluster
+	 * and the index layout / transformation read from the test resources.
+	 *
+	 * @throws ISLookUpException declared by the stubbed lookup methods
+	 * @throws IOException if one of the test resources cannot be read
+	 */
+	@BeforeEach
+	public void prepareMocks() throws ISLookUpException, IOException {
+		isLookupClient.setIsLookup(isLookUpService);
+
+		// the value returned by getZkHost() is built from the ZooKeeper server address of the mini cluster
+		int zkPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
+
+		Mockito
+			.when(isLookupClient.getDsId(Mockito.anyString()))
+			.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
+		Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", zkPort));
+		Mockito
+			.when(isLookupClient.getLayoutSource(Mockito.anyString()))
+			.thenReturn(readResource("fields.xml"));
+		Mockito
+			.when(isLookupClient.getLayoutTransformer())
+			.thenReturn(readResource("layoutToRecordTransformer.xsl"));
+	}
+
+	/**
+	 * Reads a resource located next to this class as UTF-8 text, closing the stream afterwards.
+	 *
+	 * @param name the resource name, resolved relative to this class
+	 * @return the resource content
+	 * @throws IOException if the resource does not exist or cannot be read
+	 */
+	private String readResource(final String name) throws IOException {
+		try (InputStream stream = getClass().getResourceAsStream(name)) {
+			if (stream == null) {
+				// fail with an explicit message rather than an NPE raised while reading
+				throw new IOException("test resource not found: " + name);
+			}
+			// explicit charset: the result must not depend on the platform default encoding
+			return IOUtils.toString(stream, StandardCharsets.UTF_8);
+		}
+	}
+
+	/**
+	 * Creates the local, single-threaded Spark session shared by all the tests of this class.
+	 */
+	@BeforeAll
+	public static void before() {
+
+		SparkConf conf = new SparkConf();
+		conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
+		conf.registerKryoClasses(new Class[] {
+			SerializableSolrInputDocument.class
+		});
+
+		conf.setMaster("local[1]");
+		conf.set("spark.driver.host", "localhost");
+		conf.set("hive.metastore.local", "true");
+		conf.set("spark.ui.enabled", "false");
+		conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(XmlIndexingJobTest.class.getSimpleName())
+			.config(conf)
+			.getOrCreate();
+	}
+
+	/**
+	 * Stops the Spark session, if it was created.
+	 */
+	@AfterAll
+	public static void tearDown() {
+		if (spark != null) {
+			spark.stop();
+		}
+	}
+
+	/**
+	 * Runs the job without an output path and checks that Solr reports as many documents
+	 * as there are records in the input sequence file.
+	 */
+	@Test
+	public void testXmlIndexingJob_onSolr() throws Exception {
+
+		String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
+
+		long nRecord = JavaSparkContext
+			.fromSparkContext(spark.sparkContext())
+			.sequenceFile(inputPath, Text.class, Text.class)
+			.count();
+
+		new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, null).run(isLookupClient);
+
+		Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
+
+		QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*"));
+
+		Assertions
+			.assertEquals(
+				nRecord, rsp.getResults().getNumFound(),
+				"the number of indexed records should be equal to the number of input records");
+	}
+
+	/**
+	 * Runs the job with an output path and checks that the stored documents carry as many
+	 * distinct identifiers as the input records do.
+	 */
+	@Test
+	public void testXmlIndexingJob_saveOnHDFS() throws Exception {
+		final String ID_XPATH = "//header/*[local-name()='objIdentifier']";
+
+		String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
+
+		final JavaPairRDD<Text, Text> xmlRecords = JavaSparkContext
+			.fromSparkContext(spark.sparkContext())
+			.sequenceFile(inputPath, Text.class, Text.class);
+		long nRecord = xmlRecords.count();
+		long xmlIdUnique = xmlRecords
+			.map(t -> t._2().toString())
+			.map(s -> new SAXReader().read(new StringReader(s)).valueOf(ID_XPATH))
+			.distinct()
+			.count();
+		Assertions.assertEquals(nRecord, xmlIdUnique, "IDs should be unique among input records");
+
+		final String outputPath = workingDir.resolve("outputPath").toAbsolutePath().toString();
+		new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, outputPath).run(isLookupClient);
+
+		final Dataset<SerializableSolrInputDocument> solrDocs = spark
+			.read()
+			.load(outputPath)
+			.as(Encoders.kryo(SerializableSolrInputDocument.class));
+		long docIdUnique = solrDocs.map((MapFunction<SerializableSolrInputDocument, String>) doc -> {
+			final SolrInputField id = doc.getField("__indexrecordidentifier");
+			return id.getFirstValue().toString();
+		}, Encoders.STRING())
+			.distinct()
+			.count();
+		Assertions.assertEquals(xmlIdUnique, docIdUnique, "IDs should be unique among the output records");
+
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml
@ -105,7 +105,7 @@
        <FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
        <FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
        <FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
-        <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
+        <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
        <FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
        <FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
        <FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
@ -123,7 +123,8 @@
        <FIELD indexable="true" name="relfundername" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@name)"/>
        <FIELD indexable="true" name="relfunderjurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@jurisdiction)"/><!-- Collected from of the related entity. Available for result-result relationships -->
        <FIELD indexable="true" name="relcollectedfromid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@id)"/>
-        <FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/><!-- COMMON FIELDS -->
+        <FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/>
+        <FIELD indexable="true" name="semrelid" result="false" stat="false" tokenizable="false" value="concat(./to/text(), '||', ./to/@class/string())" xpath="//*[local-name()='entity']//rel"/><!-- COMMON FIELDS -->
        <FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="pdate" value="//header/*[local-name()='dateOfCollection']"/>
        <FIELD indexable="true" name="collectedfrom" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*/*[local-name()='collectedfrom'] | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']"/>
        <FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/elevate.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/elevate.xml
@ -0,0 +1,31 @@
+Unless required by applicable law or agreed to in writing, software
+        distributed under the License is distributed on an "AS IS" BASIS,
+        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+        See the License for the specific language governing permissions and
+        limitations under the License.
+        -->
+
+        <!-- If this file is found in the config directory, it will only be
+             loaded once at startup.  If it is found in Solr's data
+             directory, it will be re-loaded every commit.
+
+           See http://wiki.apache.org/solr/QueryElevationComponent for more info
+
+        -->
+<elevate>
+    <!-- Query elevation examples
+     <query text="foo bar">
+       <doc id="1" />
+       <doc id="2" />
+       <doc id="3" />
+     </query>
+
+   for use with techproducts example
+
+     <query text="ipod">
+       <doc id="MA147LL/A" />  put the actual ipod at the top
+       <doc id="IW-02" exclude="true" /> exclude this cable
+     </query>
+   -->
+
+</elevate>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/managed-schema
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/managed-schema
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/solrconfig.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/solrconfig.xml
@ -83,6 +83,7 @@

  <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
+
  <!-- an exact 'path' can be used instead of a 'dir' to specify a
       specific jar file.  This will cause a serious error to be logged
       if it can't be loaded.
@ -112,7 +113,8 @@
       One can force a particular implementation via solr.MMapDirectoryFactory,
       solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.

-       solr.RAMDirectoryFactory is memory based and not persistent.
+       solr.RAMDirectoryFactory is memory based, not
+       persistent, and doesn't work with replication.
    -->
  <directoryFactory name="DirectoryFactory"
                    class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
@ -204,7 +206,7 @@
         More details on the nuances of each LockFactory...
         http://wiki.apache.org/lucene-java/AvailableLockFactories
    -->
-    <lockType>${solr.lock.type:single}</lockType>
+    <lockType>${solr.lock.type:native}</lockType>

    <!-- Commit Deletion Policy
         Custom deletion policies can be specified here. The class must
@ -331,6 +333,29 @@
         postCommit - fired after every commit or optimize command
         postOptimize - fired after every optimize command
      -->
+    <!-- The RunExecutableListener executes an external command from a
+         hook such as postCommit or postOptimize.
+
+         exe - the name of the executable to run
+         dir - dir to use as the current working directory. (default=".")
+         wait - the calling thread waits until the executable returns.
+                (default="true")
+         args - the arguments to pass to the program.  (default is none)
+         env - environment variables to set.  (default is none)
+      -->
+    <!-- This example shows how RunExecutableListener could be used
+         with the script based replication...
+         http://wiki.apache.org/solr/CollectionDistribution
+      -->
+    <!--
+       <listener event="postCommit" class="solr.RunExecutableListener">
+         <str name="exe">solr/bin/snapshooter</str>
+         <str name="dir">.</str>
+         <bool name="wait">true</bool>
+         <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
+         <arr name="env"> <str>MYVAR=val1</str> </arr>
+       </listener>
+      -->

  </updateHandler>

@ -366,14 +391,22 @@
       Query section - these settings control query time things like caches
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
  <query>
+    <!-- Max Boolean Clauses
+
+         Maximum number of clauses in each BooleanQuery,  an exception
+         is thrown if exceeded.
+
+         ** WARNING **
+
+         This option actually modifies a global Lucene property that
+         will affect all SolrCores.  If multiple solrconfig.xml files
+         disagree on this property, the value at any given moment will
+         be based on the last SolrCore to be initialized.

-    <!-- Maximum number of clauses in each BooleanQuery,  an exception
-         is thrown if exceeded.  It is safe to increase or remove this setting,
-         since it is purely an arbitrary limit to try and catch user errors where
-         large boolean queries may not be the best implementation choice.
      -->
    <maxBooleanClauses>1024</maxBooleanClauses>

+
    <!-- Solr Internal Query Caches

         There are two implementations of cache available for Solr,
@ -575,8 +608,21 @@
       This section contains instructions for how the SolrDispatchFilter
       should behave when processing requests for this SolrCore.

+       handleSelect is a legacy option that affects the behavior of requests
+       such as /select?qt=XXX
+
+       handleSelect="true" will cause the SolrDispatchFilter to process
+       the request and dispatch the query to a handler specified by the
+       "qt" param, assuming "/select" isn't already registered.
+
+       handleSelect="false" will cause the SolrDispatchFilter to
+       ignore "/select" requests, resulting in a 404 unless a handler
+       is explicitly registered with the name "/select"
+
+       handleSelect="true" is not recommended for new users, but is the default
+       for backwards compatibility
    -->
-  <requestDispatcher>
+  <requestDispatcher handleSelect="false" >
    <!-- Request Parsing

         These settings indicate how Solr Requests may be parsed, and
@ -602,14 +648,15 @@
         plugins.

         *** WARNING ***
-         Before enabling remote streaming, you should make sure your
-         system has authentication enabled.
+         The settings below authorize Solr to fetch remote files. You
+         should make sure your system has some authentication before
+         using enableRemoteStreaming="true".

-    <requestParsers enableRemoteStreaming="false"
-                    multipartUploadLimitInKB="-1"
-                    formdataUploadLimitInKB="-1"
-                    addHttpRequestToContext="false"/>
      -->
+    <requestParsers enableRemoteStreaming="true"
+                    multipartUploadLimitInKB="2048000"
+                    formdataUploadLimitInKB="2048"
+                    addHttpRequestToContext="false"/>

    <!-- HTTP Caching

@ -673,6 +720,14 @@
       Incoming queries will be dispatched to a specific handler by name
       based on the path specified in the request.

+       Legacy behavior: If the request path uses "/select" but no Request
+       Handler has that name, and if handleSelect="true" has been specified in
+       the requestDispatcher, then the Request Handler is dispatched based on
+       the qt parameter.  Handlers without a leading '/' are accessed this way
+       like so: http://host/app/[core/]select?qt=name  If no qt is
+       given, then the requestHandler that declares default="true" will be
+       used or the one named "standard".
+
       If a Request Handler is declared with startup="lazy", then it will
       not be initialized until the first request that uses it.

@ -692,13 +747,9 @@
      -->
    <lst name="defaults">
      <str name="echoParams">explicit</str>
+      <str name="q.op">AND</str>
      <int name="rows">10</int>
-      <!-- Default search field
-         <str name="df">text</str> 
-        -->
-      <!-- Change from JSON to XML format (the default prior to Solr 7.0)
-         <str name="wt">xml</str> 
-        -->
+      <!-- <str name="df">text</str> -->
    </lst>
    <!-- In addition to defaults, "appends" params can be specified
         to identify values which should be appended to the list of
@ -781,10 +832,18 @@

  <initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell,/browse">
    <lst name="defaults">
-      <str name="df">_text_</str>
+      <str name="df">__all</str>
    </lst>
  </initParams>

+  <!-- This enables schemaless mode
+  <initParams path="/update/**">
+    <lst name="defaults">
+      <str name="update.chain">add-unknown-fields-to-the-schema</str>
+    </lst>
+  </initParams>
+  -->
+
  <!-- Solr Cell Update Request Handler

       http://wiki.apache.org/solr/ExtractingRequestHandler
@ -796,10 +855,9 @@
    <lst name="defaults">
      <str name="lowernames">true</str>
      <str name="fmap.meta">ignored_</str>
-      <str name="fmap.content">_text_</str>
+      <str name="fmap.content">__all</str>
    </lst>
  </requestHandler>
-
  <!-- Search Components

       Search components are registered to SolrCore and used by
@ -861,7 +919,7 @@
    <!-- a spellchecker built from a field of the main index -->
    <lst name="spellchecker">
      <str name="name">default</str>
-      <str name="field">_text_</str>
+      <str name="field">__all</str>
      <str name="classname">solr.DirectSolrSpellChecker</str>
      <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
      <str name="distanceMeasure">internal</str>
@ -986,6 +1044,7 @@
  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
    <!-- pick a fieldType to analyze queries -->
    <str name="queryFieldType">string</str>
+    <str name="config-file">elevate.xml</str>
  </searchComponent>

  <!-- A request handler for demonstrating the elevator component -->
@ -1116,81 +1175,70 @@

  <!-- Add unknown fields to the schema

-       Field type guessing update processors that will
+       An example field type guessing update processor that will
       attempt to parse string-typed field values as Booleans, Longs,
       Doubles, or Dates, and then add schema fields with the guessed
-       field types. Text content will be indexed as "text_general" as
-       well as a copy to a plain string version in *_str.
+       field types.

-       These require that the schema is both managed and mutable, by
+       This requires that the schema is both managed and mutable, by
       declaring schemaFactory as ManagedIndexSchemaFactory, with
       mutable specified as true.

       See http://wiki.apache.org/solr/GuessingFieldTypes
    -->
-  <updateProcessor class="solr.UUIDUpdateProcessorFactory" name="uuid"/>
-  <updateProcessor class="solr.RemoveBlankFieldUpdateProcessorFactory" name="remove-blank"/>
-  <updateProcessor class="solr.FieldNameMutatingUpdateProcessorFactory" name="field-name-mutating">
-    <str name="pattern">[^\w-\.]</str>
-    <str name="replacement">_</str>
-  </updateProcessor>
-  <updateProcessor class="solr.ParseBooleanFieldUpdateProcessorFactory" name="parse-boolean"/>
-  <updateProcessor class="solr.ParseLongFieldUpdateProcessorFactory" name="parse-long"/>
-  <updateProcessor class="solr.ParseDoubleFieldUpdateProcessorFactory" name="parse-double"/>
-  <updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
-    <arr name="format">
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss</str>
-      <str>yyyy-MM-dd'T'HH:mmZ</str>
-      <str>yyyy-MM-dd'T'HH:mm</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd HH:mm:ssZ</str>
-      <str>yyyy-MM-dd HH:mm:ss</str>
-      <str>yyyy-MM-dd HH:mmZ</str>
-      <str>yyyy-MM-dd HH:mm</str>
-      <str>yyyy-MM-dd</str>
-    </arr>
-  </updateProcessor>
-  <updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields">
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.String</str>
-      <str name="fieldType">text_general</str>
-      <lst name="copyField">
-        <str name="dest">*_str</str>
-        <int name="maxChars">256</int>
+  <updateRequestProcessorChain name="add-unknown-fields-to-the-schema">
+    <!-- UUIDUpdateProcessorFactory will generate an id if none is present in the incoming document -->
+    <processor class="solr.UUIDUpdateProcessorFactory" />
+    <processor class="solr.RemoveBlankFieldUpdateProcessorFactory"/>
+    <processor class="solr.FieldNameMutatingUpdateProcessorFactory">
+      <str name="pattern">[^\w-\.]</str>
+      <str name="replacement">_</str>
+    </processor>
+    <processor class="solr.ParseBooleanFieldUpdateProcessorFactory"/>
+    <processor class="solr.ParseLongFieldUpdateProcessorFactory"/>
+    <processor class="solr.ParseDoubleFieldUpdateProcessorFactory"/>
+    <processor class="solr.ParseDateFieldUpdateProcessorFactory">
+      <arr name="format">
+        <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
+        <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss</str>
+        <str>yyyy-MM-dd'T'HH:mmZ</str>
+        <str>yyyy-MM-dd'T'HH:mm</str>
+        <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
+        <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
+        <str>yyyy-MM-dd HH:mm:ss.SSS</str>
+        <str>yyyy-MM-dd HH:mm:ss,SSS</str>
+        <str>yyyy-MM-dd HH:mm:ssZ</str>
+        <str>yyyy-MM-dd HH:mm:ss</str>
+        <str>yyyy-MM-dd HH:mmZ</str>
+        <str>yyyy-MM-dd HH:mm</str>
+        <str>yyyy-MM-dd</str>
+      </arr>
+    </processor>
+    <processor class="solr.AddSchemaFieldsUpdateProcessorFactory">
+      <str name="defaultFieldType">strings</str>
+      <lst name="typeMapping">
+        <str name="valueClass">java.lang.Boolean</str>
+        <str name="fieldType">booleans</str>
      </lst>
-      <!-- Use as default mapping instead of defaultFieldType -->
-      <bool name="default">true</bool>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.Boolean</str>
-      <str name="fieldType">booleans</str>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.util.Date</str>
-      <str name="fieldType">pdates</str>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.Long</str>
-      <str name="valueClass">java.lang.Integer</str>
-      <str name="fieldType">plongs</str>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.Number</str>
-      <str name="fieldType">pdoubles</str>
-    </lst>
-  </updateProcessor>
+      <lst name="typeMapping">
+        <str name="valueClass">java.util.Date</str>
+        <str name="fieldType">tdates</str>
+      </lst>
+      <lst name="typeMapping">
+        <str name="valueClass">java.lang.Long</str>
+        <str name="valueClass">java.lang.Integer</str>
+        <str name="fieldType">tlongs</str>
+      </lst>
+      <lst name="typeMapping">
+        <str name="valueClass">java.lang.Number</str>
+        <str name="fieldType">tdoubles</str>
+      </lst>
+    </processor>

-  <!-- The update.autoCreateFields property can be turned to false to disable schemaless mode -->
-  <updateRequestProcessorChain name="add-unknown-fields-to-the-schema" default="${update.autoCreateFields:true}"
-           processor="uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date,add-schema-fields">
    <processor class="solr.LogUpdateProcessorFactory"/>
    <processor class="solr.DistributedUpdateProcessorFactory"/>
    <processor class="solr.RunUpdateProcessorFactory"/>
@ -1313,7 +1361,7 @@

  <!-- Query Parsers

-       https://lucene.apache.org/solr/guide/query-syntax-and-parsing.html
+       https://cwiki.apache.org/confluence/display/solr/Query+Syntax+and+Parsing

       Multiple QParserPlugins can be registered by name, and then
       used in either the "defType" param for the QueryComponent (used
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/xml/part-00000
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/xml/part-00000
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@ -46,7 +46,7 @@
        </configuration>
    </global>

-    <start to="Step18"/>
+    <start to="Step1"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
--- a/pom.xml
+++ b/pom.xml
@ -50,7 +50,7 @@
 		<repository>
 			<id>dnet45-releases</id>
 			<name>D-Net 45 releases</name>
-			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
+			<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
 			<layout>default</layout>
 			<snapshots>
 				<enabled>false</enabled>
@ -70,6 +70,26 @@
 				<enabled>false</enabled>
 			</snapshots>
 		</repository>
+		<repository>
+			<id>dnet45-releases-old</id>
+			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
+			<releases>
+				<enabled>false</enabled>
+			</releases>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
+		</repository>
+		<repository>
+			<id>dnet45-snapshots-old</id>
+			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
+			<releases>
+				<enabled>false</enabled>
+			</releases>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
+		</repository>
 	</repositories>

 	<dependencies>
@ -639,12 +659,12 @@
 		<snapshotRepository>
 			<id>dnet45-snapshots</id>
 			<name>DNet45 Snapshots</name>
-			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
+			<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
 			<layout>default</layout>
 		</snapshotRepository>
 		<repository>
 			<id>dnet45-releases</id>
-			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
+			<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
 		</repository>
 	</distributionManagement>
 	<reporting>