From adcdd2d05e36a7e2b3997ec62b27a6bf770455d3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 1 Apr 2020 14:56:57 +0200 Subject: [PATCH 01/13] WIP: reimplementing the adjacency list construction process using spark Datasets --- .../dnetlib/dhp/oa/provision/GraphJoiner.java | 291 -------------- .../dhp/oa/provision/GraphJoiner_v2.java | 328 ++++++++++++++++ .../dhp/oa/provision/SparkXmlIndexingJob.java | 2 +- .../provision/SparkXmlRecordBuilderJob.java | 47 --- .../SparkXmlRecordBuilderJob_v2.java | 81 ++++ .../oa/provision/model/EntityRelEntity.java | 44 +-- .../dhp/oa/provision/model/JoinedEntity.java | 9 +- .../dhp/oa/provision/model/RelatedEntity.java | 362 ++++++++---------- .../provision/model/SortableRelationKey.java | 31 +- .../dhp/oa/provision/model/Tuple2.java | 11 +- .../dhp/oa/provision/model/TypedRow.java | 77 ++-- .../oa/provision/utils/GraphMappingUtils.java | 26 +- .../oa/provision/utils/XmlRecordFactory.java | 4 +- .../dhp/oa/provision/oozie_app/workflow.xml | 3 +- 14 files changed, 652 insertions(+), 664 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner.java deleted file mode 100644 index def757da31..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner.java +++ /dev/null @@ -1,291 +0,0 @@ -package eu.dnetlib.dhp.oa.provision; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; -import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; -import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapred.SequenceFileOutputFormat; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.util.LongAccumulator; -import scala.Tuple2; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Map; - -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntity; - -/** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
- * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). - * - * The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again - * by E, finally grouped by E.id; - * - * Different manipulations of the E and R sets are introduced to reduce the complexity of the operation - * 1) treat the object payload as string, extracting only the necessary information beforehand using json path, - * it seems that deserializing it with jackson's object mapper has higher memory footprint. - * - * 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false) - * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S - * and E_target = T. Objects in T are heavily pruned by all the unnecessary information - * - * 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ ] - */ -public class GraphJoiner implements Serializable { - - private Map accumulators = Maps.newHashMap(); - - public static final int MAX_RELS = 100; - - public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; - - private SparkSession spark; - - private ContextMapper contextMapper; - - private String inputPath; - - private String outPath; - - private String otherDsTypeId; - - public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) { - this.spark = spark; - this.contextMapper = contextMapper; - this.otherDsTypeId = otherDsTypeId; - this.inputPath = inputPath; - this.outPath = outPath; - - final SparkContext sc = spark.sparkContext(); - prepareAccumulators(sc); - } - - public GraphJoiner adjacencyLists() { - final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext()); - - // read each entity - JavaPairRDD datasource = readPathEntity(jsc, getInputPath(), "datasource"); - JavaPairRDD organization = readPathEntity(jsc, getInputPath(), "organization"); - JavaPairRDD project = readPathEntity(jsc, getInputPath(), "project"); - JavaPairRDD dataset = readPathEntity(jsc, getInputPath(), "dataset"); - JavaPairRDD otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct"); - JavaPairRDD software = readPathEntity(jsc, getInputPath(), "software"); - JavaPairRDD publication = readPathEntity(jsc, getInputPath(), "publication"); - - // create the union between all the entities - final String entitiesPath = getOutPath() + "/entities"; - datasource - .union(organization) - .union(project) - .union(dataset) - .union(otherresearchproduct) - .union(software) - .union(publication) - .map(e -> new EntityRelEntity().setSource(e._2())) - .map(GraphMappingUtils::serialize) - .saveAsTextFile(entitiesPath, GzipCodec.class); - - JavaPairRDD entities = jsc.textFile(entitiesPath) - .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) - .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); - - final String relationPath = getOutPath() + "/relation"; - // reads the relationships - final JavaPairRDD rels = readPathRelation(jsc, getInputPath()) - .filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted - .map(p -> new EntityRelEntity().setRelation(p)) - .mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p)); - rels - .groupByKey(new 
RelationPartitioner(rels.getNumPartitions())) - .map(p -> Iterables.limit(p._2(), MAX_RELS)) - .flatMap(p -> p.iterator()) - .map(s -> new ObjectMapper().writeValueAsString(s)) - .saveAsTextFile(relationPath, GzipCodec.class); - - final JavaPairRDD relation = jsc.textFile(relationPath) - .map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class)) - .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); - - final String bySourcePath = getOutPath() + "/join_by_source"; - relation - .join(entities - .filter(e -> !e._2().getSource().getDeleted()) - .mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2())))) - .map(s -> new EntityRelEntity() - .setRelation(s._2()._1().getRelation()) - .setTarget(s._2()._2().getSource())) - .map(j -> new ObjectMapper().writeValueAsString(j)) - .saveAsTextFile(bySourcePath, GzipCodec.class); - - JavaPairRDD bySource = jsc.textFile(bySourcePath) - .map(e -> getObjectMapper().readValue(e, EntityRelEntity.class)) - .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); - - final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); - entities - .union(bySource) - .groupByKey() // by source id - .map(l -> toJoinedEntity(l)) - .mapToPair(je -> new Tuple2<>( - new Text(je.getEntity().getId()), - new Text(recordFactory.build(je)))) - .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - - return this; - } - - public SparkSession getSpark() { - return spark; - } - - public String getInputPath() { - return inputPath; - } - - public String getOutPath() { - return outPath; - } - - // HELPERS - - private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) { - try { - switch (GraphMappingUtils.EntityType.valueOf(type)) { - case publication: - return mapper.readValue(json, Publication.class); - case dataset: - return mapper.readValue(json, Dataset.class); - case otherresearchproduct: - return mapper.readValue(json, OtherResearchProduct.class); - case software: - return mapper.readValue(json, Software.class); - case datasource: - return mapper.readValue(json, Datasource.class); - case organization: - return mapper.readValue(json, Organization.class); - case project: - return mapper.readValue(json, Project.class); - default: - throw new IllegalArgumentException("invalid type: " + type); - } - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - } - - private JoinedEntity toJoinedEntity(Tuple2> p) { - final ObjectMapper mapper = getObjectMapper(); - final JoinedEntity j = new JoinedEntity(); - final Links links = new Links(); - for(EntityRelEntity rel : p._2()) { - if (rel.hasMainEntity() & j.getEntity() == null) { - j.setType(rel.getSource().getType()); - j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper)); - } - if (rel.hasRelatedEntity()) { - try { - links.add( - new eu.dnetlib.dhp.oa.provision.model.Tuple2() - .setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class)) - .setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class))); - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - } - } - j.setLinks(links); - if (j.getEntity() == null) { - throw new IllegalStateException("missing main entity on '" + p._1() + "'"); - } - return j; - } - - /** - * Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file , - * extracts necessary information using json 
path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow - * @param sc - * @param inputPath - * @param type - * @return the JavaPairRDD indexed by entity identifier - */ - private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { - return sc.textFile(inputPath + "/" + type) - .mapToPair((PairFunction) s -> { - final DocumentContext json = JsonPath.parse(s); - final String id = json.read("$.id"); - return new Tuple2<>(id, new TypedRow() - .setSourceId(id) - .setDeleted(json.read("$.dataInfo.deletedbyinference")) - .setType(type) - .setOaf(s)); - }); - } - - /** - * Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file , - * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow - * @param sc - * @param inputPath - * @return the JavaRDD containing all the relationships - */ - private JavaRDD readPathRelation(final JavaSparkContext sc, final String inputPath) { - return sc.textFile(inputPath + "/relation") - .map(s -> { - final DocumentContext json = JsonPath.parse(s); - return new TypedRow() - .setSourceId(json.read("$.source")) - .setTargetId(json.read("$.target")) - .setDeleted(json.read("$.dataInfo.deletedbyinference")) - .setType("relation") - .setRelType("$.relType") - .setSubRelType("$.subRelType") - .setRelClass("$.relClass") - .setOaf(s); - }); - } - - private ObjectMapper getObjectMapper() { - return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - } - - private void prepareAccumulators(SparkContext sc) { - accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); - accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); - accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); - accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); - accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); - accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); - - accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); - accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo")); - accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); - accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); - - accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); - accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); - accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); - accumulators.put("organizationOrganization_dedup_isMergedIn", 
sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); - accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); - accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java new file mode 100644 index 0000000000..d67493f432 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java @@ -0,0 +1,328 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterators; +import com.google.common.collect.Maps; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.utils.*; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.types.*; +import org.apache.spark.util.LongAccumulator; +import scala.Tuple2; + +import java.io.IOException; +import java.io.Serializable; +import java.util.*; + +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntity; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again + * by E, finally grouped by E.id; + * + * Different manipulations of the E and R sets are introduced to reduce the complexity of the operation + * 1) treat the object payload as string, extracting only the necessary information beforehand using json path, + * it seems that deserializing it with jackson's object mapper has higher memory footprint. + * + * 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false) + * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S + * and E_target = T. 
Objects in T are heavily pruned by all the unnecessary information + * + * 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ ] + */ +public class GraphJoiner_v2 implements Serializable { + + public static final int LIMIT = 1000000; + private Map accumulators = Maps.newHashMap(); + + public static final int MAX_RELS = 100; + + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; + + private static final StructType KV_SCHEMA = StructType$.MODULE$.apply( + Arrays.asList( + StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()) + )); + + private static final StructType TYPED_ROW_SCHEMA = StructType$.MODULE$.apply( + Arrays.asList( + StructField$.MODULE$.apply("sourceId", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply("targetId", DataTypes.StringType, true, Metadata.empty()), + StructField$.MODULE$.apply("deleted", DataTypes.BooleanType, false, Metadata.empty()), + StructField$.MODULE$.apply("type", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply("relType", DataTypes.StringType, true, Metadata.empty()), + StructField$.MODULE$.apply("subRelType", DataTypes.StringType, true, Metadata.empty()), + StructField$.MODULE$.apply("relClass", DataTypes.StringType, true, Metadata.empty()), + StructField$.MODULE$.apply("oaf", DataTypes.BinaryType, false, Metadata.empty()) + )); + + private static final StructType ENTITY_REL_ENTITY_SCHEMA = StructType$.MODULE$.apply( + Arrays.asList( + StructField$.MODULE$.apply("source", TYPED_ROW_SCHEMA, false, Metadata.empty()), + StructField$.MODULE$.apply("relation", TYPED_ROW_SCHEMA, true, Metadata.empty()), + StructField$.MODULE$.apply("target", TYPED_ROW_SCHEMA, false, Metadata.empty()) + )); + + + private SparkSession spark; + + private ContextMapper contextMapper; + + private String inputPath; + + private String outPath; + + private String otherDsTypeId; + + public GraphJoiner_v2(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) { + this.spark = spark; + this.contextMapper = contextMapper; + this.otherDsTypeId = otherDsTypeId; + this.inputPath = inputPath; + this.outPath = outPath; + + final SparkContext sc = spark.sparkContext(); + prepareAccumulators(sc); + } + + public GraphJoiner_v2 adjacencyLists() throws IOException { + + final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(getSpark().sparkContext()); + + // read each entity + Dataset datasource = readPathEntity(jsc, getInputPath(), "datasource"); + Dataset organization = readPathEntity(jsc, getInputPath(), "organization"); + Dataset project = readPathEntity(jsc, getInputPath(), "project"); + Dataset dataset = readPathEntity(jsc, getInputPath(), "dataset"); + Dataset otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct"); + Dataset software = readPathEntity(jsc, getInputPath(), "software"); + Dataset publication = readPathEntity(jsc, getInputPath(), "publication"); + + // create the union between all the entities + Dataset> entities = + datasource + .union(organization) + .union(project) + .union(dataset) + .union(otherresearchproduct) + .union(software) + .union(publication) + .map((MapFunction>) value -> new Tuple2<>( + value.getId(), + value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) + .limit(LIMIT) + .cache(); + + System.out.println("Entities 
schema:"); + entities.printSchema(); + // reads the relationships + + Dataset rels = readPathRelation(jsc, getInputPath()) + .groupByKey((MapFunction) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class)) + .flatMapGroups((FlatMapGroupsFunction) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(Relation.class)) + .limit(LIMIT) + .cache(); + + System.out.println("Relation schema:"); + rels.printSchema(); + + Dataset> relsByTarget = rels + .map((MapFunction>) r -> new Tuple2<>(r.getTarget(), r), Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); + + System.out.println("Relation by target schema:"); + relsByTarget.printSchema(); + + Dataset> bySource = relsByTarget + .joinWith(entities, relsByTarget.col("_1").equalTo(entities.col("_1")), "inner") + .filter((FilterFunction, Tuple2>>) value -> value._2()._2().getDeleted() == false) + .map((MapFunction, Tuple2>, EntityRelEntity>) t -> { + EntityRelEntity e = new EntityRelEntity(); + e.setRelation(t._1()._2()); + e.setTarget(asRelatedEntity(t._2()._2())); + return e; + }, Encoders.bean(EntityRelEntity.class)) + .map((MapFunction>) e -> new Tuple2<>(e.getRelation().getSource(), e), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); + + System.out.println("bySource schema"); + bySource.printSchema(); + + Dataset joined = entities + .joinWith(bySource, entities.col("_1").equalTo(bySource.col("_1")), "left") + .map((MapFunction, Tuple2>, EntityRelEntity>) value -> { + EntityRelEntity re = new EntityRelEntity(); + re.setEntity(value._1()._2()); + Optional related = Optional.ofNullable(value._2()).map(Tuple2::_2); + if (related.isPresent()) { + re.setRelation(related.get().getRelation()); + re.setTarget(related.get().getTarget()); + } + return re; + }, Encoders.kryo(EntityRelEntity.class)); + + System.out.println("joined schema"); + joined.printSchema(); + //joined.write().json(getOutPath() + "/joined"); + + final Dataset grouped = joined + .groupByKey((MapFunction) e -> e.getEntity(), Encoders.kryo(TypedRow.class)) + .mapGroups((MapGroupsFunction) (key, values) -> toJoinedEntity(key, values), Encoders.kryo(JoinedEntity.class)); + + System.out.println("grouped schema"); + grouped.printSchema(); + + final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); + grouped + .map((MapFunction) value -> recordFactory.build(value), Encoders.STRING()) + .limit(LIMIT) + .write() + .text(getOutPath() + "/xml"); + /* + .javaRDD() + .mapToPair((PairFunction, String, String>) t -> new Tuple2<>(t._1(), t._2())) + .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + + */ + + return this; + } + + public SparkSession getSpark() { + return spark; + } + + public String getInputPath() { + return inputPath; + } + + public String getOutPath() { + return outPath; + } + + // HELPERS + + private JoinedEntity toJoinedEntity(TypedRow key, Iterator values) { + final ObjectMapper mapper = getObjectMapper(); + final JoinedEntity j = new JoinedEntity(); + j.setType(key.getType()); + j.setEntity(parseOaf(key.getOaf(), key.getType(), mapper)); + final Links links = new Links(); + values.forEachRemaining(rel -> links.add( + new eu.dnetlib.dhp.oa.provision.model.Tuple2( + rel.getRelation(), + rel.getTarget() + ))); + j.setLinks(links); + return j; + } + + private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) { + try { + switch 
(GraphMappingUtils.EntityType.valueOf(type)) { + case publication: + return mapper.readValue(json, Publication.class); + case dataset: + return mapper.readValue(json, eu.dnetlib.dhp.schema.oaf.Dataset.class); + case otherresearchproduct: + return mapper.readValue(json, OtherResearchProduct.class); + case software: + return mapper.readValue(json, Software.class); + case datasource: + return mapper.readValue(json, Datasource.class); + case organization: + return mapper.readValue(json, Organization.class); + case project: + return mapper.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + /** + * Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a new line delimited json file, + * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow + * @param sc + * @param inputPath + * @param type + * @return the JavaPairRDD indexed by entity identifier + */ + private Dataset readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { + RDD rdd = sc.textFile(inputPath + "/" + type) + .map((Function) s -> RowFactory.create("", s)) + .rdd(); + + return getSpark().createDataFrame(rdd, KV_SCHEMA) + .map((MapFunction) row -> { + final String s = row.getAs("value"); + final DocumentContext json = JsonPath.parse(s); + final TypedRow t = new TypedRow(); + t.setId(json.read("$.id")); + t.setDeleted(json.read("$.dataInfo.deletedbyinference")); + t.setType(type); + t.setOaf(s); + return t; + }, Encoders.bean(TypedRow.class)); + } + + /** + * Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file , + * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow + * @param sc + * @param inputPath + * @return the JavaRDD containing all the relationships + */ + private Dataset readPathRelation(final JavaSparkContext sc, final String inputPath) { + final RDD rdd = sc.textFile(inputPath + "/relation") + .map((Function) s -> RowFactory.create("", s)) + .rdd(); + + return getSpark().createDataFrame(rdd, KV_SCHEMA) + .map((MapFunction) value -> new ObjectMapper().readValue(value.getAs("value"), Relation.class), Encoders.bean(Relation.class)); + } + + private ObjectMapper getObjectMapper() { + return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + } + + private void prepareAccumulators(SparkContext sc) { + accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + + accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators.put("resultResult_relationship_isRelatedTo", 
sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + + accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java index cafbc86533..975ac75485 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java @@ -84,7 +84,7 @@ public class SparkXmlIndexingJob { return SparkSession .builder() .config(conf) - .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) + .appName(SparkXmlIndexingJob.class.getSimpleName()) .master(master) .getOrCreate(); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob.java deleted file mode 100644 index 0a898c0fcc..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob.java +++ /dev/null @@ -1,47 +0,0 @@ -package eu.dnetlib.dhp.oa.provision; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SparkSession; - -public class SparkXmlRecordBuilderJob { - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); - parser.parseArgument(args); - - final String master = parser.get("master"); - final SparkConf conf = new SparkConf() - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - - try(SparkSession spark = getSession(conf, master)) { - - final String inputPath = parser.get("sourcePath"); - final String outputPath = 
parser.get("outputPath"); - final String isLookupUrl = parser.get("isLookupUrl"); - final String otherDsTypeId = parser.get("otherDsTypeId"); - - final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - - new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) - .adjacencyLists(); - } - } - - private static SparkSession getSession(SparkConf conf, String master) { - return SparkSession - .builder() - .config(conf) - .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) - .master(master) - .getOrCreate(); - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java new file mode 100644 index 0000000000..3b119cebb4 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java @@ -0,0 +1,81 @@ +package eu.dnetlib.dhp.oa.provision; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; + +public class SparkXmlRecordBuilderJob_v2 { + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString( + SparkXmlRecordBuilderJob_v2.class.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); + parser.parseArgument(args); + + final String master = parser.get("master"); + try(SparkSession spark = getSession(master)) { + + final String inputPath = parser.get("sourcePath"); + final String outputPath = parser.get("outputPath"); + final String isLookupUrl = parser.get("isLookupUrl"); + final String otherDsTypeId = parser.get("otherDsTypeId"); + + new GraphJoiner_v2(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) + .adjacencyLists(); + } + } + + private static SparkSession getSession(String master) { + final SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.set("spark.sql.shuffle.partitions", "500"); + conf.registerKryoClasses(new Class[]{ + Author.class, + Context.class, + Country.class, + DataInfo.class, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + Datasource.class, + ExternalReference.class, + ExtraInfo.class, + Field.class, + GeoLocation.class, + Instance.class, + Journal.class, + KeyValue.class, + Oaf.class, + OafEntity.class, + OAIProvenance.class, + Organization.class, + OriginDescription.class, + OtherResearchProduct.class, + Project.class, + Publication.class, + Qualifier.class, + Relation.class, + Result.class, + Software.class, + StructuredProperty.class, + + TypedRow.class, + EntityRelEntity.class, + JoinedEntity.class, + SortableRelationKey.class, + Tuple2.class, + Links.class, + RelatedEntity.class + }); + return SparkSession + .builder() + .config(conf) + .appName(SparkXmlRecordBuilderJob_v2.class.getSimpleName()) + .master(master) + .getOrCreate(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java index 
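
The registerKryoClasses list above and the Encoders.bean / Encoders.kryo calls in GraphJoiner_v2 are two sides of the same choice. A minimal sketch of the difference, reusing two classes touched by this patch (illustrative only, not part of the change set):

    // Sketch: how the two encoder styles used in this patch behave.
    import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
    import eu.dnetlib.dhp.schema.oaf.Relation;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import java.util.Arrays;

    public class EncoderSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("encoders").getOrCreate();

            // Bean encoding: needs a no-arg constructor plus getX()/void setX() pairs and
            // produces a real columnar schema, so Spark can prune and shuffle per field.
            Dataset<Relation> rels = spark.createDataset(Arrays.asList(new Relation()), Encoders.bean(Relation.class));
            rels.printSchema(); // one column per bean property

            // Kryo encoding: the whole object is serialized into a single binary column; registering
            // the model classes up front (as in getSession() above) avoids storing the fully
            // qualified class name with every serialized record.
            Dataset<EntityRelEntity> rows = spark.createDataset(Arrays.asList(new EntityRelEntity()), Encoders.kryo(EntityRelEntity.class));
            rows.printSchema(); // a single binary "value" column

            spark.stop();
        }
    }
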
ba89eaa389..ddeec140b7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java @@ -1,54 +1,36 @@ package eu.dnetlib.dhp.oa.provision.model; +import eu.dnetlib.dhp.schema.oaf.Relation; + import java.io.Serializable; public class EntityRelEntity implements Serializable { - private TypedRow source; - private TypedRow relation; - private TypedRow target; + private TypedRow entity; + private Relation relation; + private RelatedEntity target; - public EntityRelEntity() { + public TypedRow getEntity() { + return entity; } - public EntityRelEntity(TypedRow source) { - this.source = source; + public void setEntity(TypedRow entity) { + this.entity = entity; } - //helpers - public Boolean hasMainEntity() { - return getSource() != null & getRelation() == null & getTarget() == null; - } - - public Boolean hasRelatedEntity() { - return getSource() == null & getRelation() != null & getTarget() != null; - } - - - public TypedRow getSource() { - return source; - } - - public EntityRelEntity setSource(TypedRow source) { - this.source = source; - return this; - } - - public TypedRow getRelation() { + public Relation getRelation() { return relation; } - public EntityRelEntity setRelation(TypedRow relation) { + public void setRelation(Relation relation) { this.relation = relation; - return this; } - public TypedRow getTarget() { + public RelatedEntity getTarget() { return target; } - public EntityRelEntity setTarget(TypedRow target) { + public void setTarget(RelatedEntity target) { this.target = target; - return this; } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index 80b15a4d61..815863c678 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -16,26 +16,23 @@ public class JoinedEntity implements Serializable { return type; } - public JoinedEntity setType(String type) { + public void setType(String type) { this.type = type; - return this; } public OafEntity getEntity() { return entity; } - public JoinedEntity setEntity(OafEntity entity) { + public void setEntity(OafEntity entity) { this.entity = entity; - return this; } public Links getLinks() { return links; } - public JoinedEntity setLinks(Links links) { + public void setLinks(Links links) { this.links = links; - return this; } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index 75e9045e86..2e5b4186cb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -49,207 +49,183 @@ public class RelatedEntity implements Serializable { return id; } - public RelatedEntity setId(String id) { + public void setId(String id) { this.id = id; - return this; - } - - public StructuredProperty getTitle() { - return title; - } - - public RelatedEntity setTitle(StructuredProperty title) { - this.title = title; - return this; - } - - public 
String getDateofacceptance() { - return dateofacceptance; - } - - public RelatedEntity setDateofacceptance(String dateofacceptance) { - this.dateofacceptance = dateofacceptance; - return this; - } - - public String getPublisher() { - return publisher; - } - - public RelatedEntity setPublisher(String publisher) { - this.publisher = publisher; - return this; - } - - public List getPid() { - return pid; - } - - public RelatedEntity setPid(List pid) { - this.pid = pid; - return this; - } - - public String getCodeRepositoryUrl() { - return codeRepositoryUrl; - } - - public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) { - this.codeRepositoryUrl = codeRepositoryUrl; - return this; - } - - public Qualifier getResulttype() { - return resulttype; - } - - public RelatedEntity setResulttype(Qualifier resulttype) { - this.resulttype = resulttype; - return this; - } - - public List getCollectedfrom() { - return collectedfrom; - } - - public RelatedEntity setCollectedfrom(List collectedfrom) { - this.collectedfrom = collectedfrom; - return this; - } - - public List getInstances() { - return instances; - } - - public RelatedEntity setInstances(List instances) { - this.instances = instances; - return this; - } - - public String getOfficialname() { - return officialname; - } - - public RelatedEntity setOfficialname(String officialname) { - this.officialname = officialname; - return this; - } - - public String getWebsiteurl() { - return websiteurl; - } - - public RelatedEntity setWebsiteurl(String websiteurl) { - this.websiteurl = websiteurl; - return this; - } - - public Qualifier getDatasourcetype() { - return datasourcetype; - } - - public RelatedEntity setDatasourcetype(Qualifier datasourcetype) { - this.datasourcetype = datasourcetype; - return this; - } - - public Qualifier getDatasourcetypeui() { - return datasourcetypeui; - } - - public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) { - this.datasourcetypeui = datasourcetypeui; - return this; - } - - public Qualifier getOpenairecompatibility() { - return openairecompatibility; - } - - public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) { - this.openairecompatibility = openairecompatibility; - return this; - } - - public String getLegalname() { - return legalname; - } - - public RelatedEntity setLegalname(String legalname) { - this.legalname = legalname; - return this; - } - - public String getLegalshortname() { - return legalshortname; - } - - public RelatedEntity setLegalshortname(String legalshortname) { - this.legalshortname = legalshortname; - return this; - } - - public Qualifier getCountry() { - return country; - } - - public RelatedEntity setCountry(Qualifier country) { - this.country = country; - return this; - } - - public String getCode() { - return code; - } - - public RelatedEntity setCode(String code) { - this.code = code; - return this; - } - - public String getAcronym() { - return acronym; - } - - public RelatedEntity setAcronym(String acronym) { - this.acronym = acronym; - return this; - } - - public Qualifier getContracttype() { - return contracttype; - } - - public RelatedEntity setContracttype(Qualifier contracttype) { - this.contracttype = contracttype; - return this; - } - - public List getFundingtree() { - return fundingtree; - } - - public RelatedEntity setFundingtree(List fundingtree) { - this.fundingtree = fundingtree; - return this; - } - - public String getProjectTitle() { - return projectTitle; - } - - public RelatedEntity setProjectTitle(String projectTitle) { - 
this.projectTitle = projectTitle; - return this; } public String getType() { return type; } - public RelatedEntity setType(String type) { + public void setType(String type) { this.type = type; - return this; } + public StructuredProperty getTitle() { + return title; + } + + public void setTitle(StructuredProperty title) { + this.title = title; + } + + public String getWebsiteurl() { + return websiteurl; + } + + public void setWebsiteurl(String websiteurl) { + this.websiteurl = websiteurl; + } + + public String getDateofacceptance() { + return dateofacceptance; + } + + public void setDateofacceptance(String dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } + + public String getPublisher() { + return publisher; + } + + public void setPublisher(String publisher) { + this.publisher = publisher; + } + + public List getPid() { + return pid; + } + + public void setPid(List pid) { + this.pid = pid; + } + + public String getCodeRepositoryUrl() { + return codeRepositoryUrl; + } + + public void setCodeRepositoryUrl(String codeRepositoryUrl) { + this.codeRepositoryUrl = codeRepositoryUrl; + } + + public Qualifier getResulttype() { + return resulttype; + } + + public void setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + } + + public List getCollectedfrom() { + return collectedfrom; + } + + public void setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + } + + public List getInstances() { + return instances; + } + + public void setInstances(List instances) { + this.instances = instances; + } + + public String getOfficialname() { + return officialname; + } + + public void setOfficialname(String officialname) { + this.officialname = officialname; + } + + public Qualifier getDatasourcetype() { + return datasourcetype; + } + + public void setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + } + + public Qualifier getDatasourcetypeui() { + return datasourcetypeui; + } + + public void setDatasourcetypeui(Qualifier datasourcetypeui) { + this.datasourcetypeui = datasourcetypeui; + } + + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } + + public void setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + } + + public String getLegalname() { + return legalname; + } + + public void setLegalname(String legalname) { + this.legalname = legalname; + } + + public String getLegalshortname() { + return legalshortname; + } + + public void setLegalshortname(String legalshortname) { + this.legalshortname = legalshortname; + } + + public Qualifier getCountry() { + return country; + } + + public void setCountry(Qualifier country) { + this.country = country; + } + + public String getProjectTitle() { + return projectTitle; + } + + public void setProjectTitle(String projectTitle) { + this.projectTitle = projectTitle; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getAcronym() { + return acronym; + } + + public void setAcronym(String acronym) { + this.acronym = acronym; + } + + public Qualifier getContracttype() { + return contracttype; + } + + public void setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + } + + public List getFundingtree() { + return fundingtree; + } + + public void setFundingtree(List fundingtree) { + this.fundingtree = fundingtree; + } } \ No newline at end of file diff --git 
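
The model setters in this patch change from the fluent style (return this) to plain void JavaBean setters. The commit does not say why, but a plausible reason is compatibility with the Encoders.bean(...) calls introduced in GraphJoiner_v2, which introspect classes through JavaBean conventions. A hypothetical Foo class, not part of the codebase, shows the shape the rewritten models now follow:

    // Hypothetical example of the bean conventions assumed by Encoders.bean:
    // a public no-arg constructor, getX() accessors and void setX(X) mutators.
    import java.io.Serializable;

    public class Foo implements Serializable {
        private String id;

        public Foo() {}                   // required for bean-style (de)serialization

        public String getId() {
            return id;
        }

        public void setId(String id) {    // void setter, as in the rewritten model classes
            this.id = id;
        }
    }
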
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java index 8169e57e09..fef9915e80 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.provision.model; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Maps; +import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.Serializable; import java.util.Map; @@ -33,58 +34,54 @@ public class SortableRelationKey implements Comparable, Ser weights.put("dedup", 8); } - public static SortableRelationKey from(final EntityRelEntity e) { - return new SortableRelationKey() - .setSourceId(e.getRelation().getSourceId()) - .setTargetId(e.getRelation().getTargetId()) - .setRelType(e.getRelation().getRelType()) - .setSubRelType(e.getRelation().getSubRelType()) - .setRelClass(e.getRelation().getRelClass()); + public static SortableRelationKey from(final Relation r) { + final SortableRelationKey s = new SortableRelationKey(); + s.setSourceId(r.getSource()); + s.setTargetId(r.getTarget()); + s.setRelType(r.getRelType()); + s.setSubRelType(r.getSubRelType()); + s.setRelClass(r.getRelClass()); + return s; } public String getSourceId() { return sourceId; } - public SortableRelationKey setSourceId(String sourceId) { + public void setSourceId(String sourceId) { this.sourceId = sourceId; - return this; } public String getTargetId() { return targetId; } - public SortableRelationKey setTargetId(String targetId) { + public void setTargetId(String targetId) { this.targetId = targetId; - return this; } public String getRelType() { return relType; } - public SortableRelationKey setRelType(String relType) { + public void setRelType(String relType) { this.relType = relType; - return this; } public String getSubRelType() { return subRelType; } - public SortableRelationKey setSubRelType(String subRelType) { + public void setSubRelType(String subRelType) { this.subRelType = subRelType; - return this; } public String getRelClass() { return relClass; } - public SortableRelationKey setRelClass(String relClass) { + public void setRelClass(String relClass) { this.relClass = relClass; - return this; } @Override diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java index ded976eeae..db639f1132 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java @@ -8,21 +8,24 @@ public class Tuple2 { private RelatedEntity relatedEntity; + public Tuple2(Relation relation, RelatedEntity relatedEntity) { + this.relation = relation; + this.relatedEntity = relatedEntity; + } + public Relation getRelation() { return relation; } - public Tuple2 setRelation(Relation relation) { + public void setRelation(Relation relation) { this.relation = relation; - return this; } public RelatedEntity getRelatedEntity() { return relatedEntity; } - public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) { + public void setRelatedEntity(RelatedEntity relatedEntity) { this.relatedEntity = relatedEntity; - return 
this; } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java index e275fd9daf..54f34802f0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java @@ -1,92 +1,61 @@ package eu.dnetlib.dhp.oa.provision.model; +import com.google.common.base.Objects; + import java.io.Serializable; public class TypedRow implements Serializable { - private String sourceId; - - private String targetId; + private String id; private Boolean deleted; private String type; - private String relType; - private String subRelType; - private String relClass; - private String oaf; - public String getSourceId() { - return sourceId; + public String getId() { + return id; } - public TypedRow setSourceId(String sourceId) { - this.sourceId = sourceId; - return this; - } - - public String getTargetId() { - return targetId; - } - - public TypedRow setTargetId(String targetId) { - this.targetId = targetId; - return this; + public void setId(String id) { + this.id = id; } public Boolean getDeleted() { return deleted; } - public TypedRow setDeleted(Boolean deleted) { + public void setDeleted(Boolean deleted) { this.deleted = deleted; - return this; } public String getType() { return type; } - public TypedRow setType(String type) { + public void setType(String type) { this.type = type; - return this; - } - - public String getRelType() { - return relType; - } - - public TypedRow setRelType(String relType) { - this.relType = relType; - return this; - } - - public String getSubRelType() { - return subRelType; - } - - public TypedRow setSubRelType(String subRelType) { - this.subRelType = subRelType; - return this; - } - - public String getRelClass() { - return relClass; - } - - public TypedRow setRelClass(String relClass) { - this.relClass = relClass; - return this; } public String getOaf() { return oaf; } - public TypedRow setOaf(String oaf) { + public void setOaf(String oaf) { this.oaf = oaf; - return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TypedRow typedRow2 = (TypedRow) o; + return Objects.equal(id, typedRow2.id); + } + + @Override + public int hashCode() { + return Objects.hashCode(id); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index a48c812fc8..27b42e69d5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -3,14 +3,11 @@ package eu.dnetlib.dhp.oa.provision.utils; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Predicate; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; -import 
eu.dnetlib.dhp.oa.provision.model.TypedRow; +import eu.dnetlib.dhp.oa.provision.model.*; import eu.dnetlib.dhp.schema.oaf.*; import net.minidev.json.JSONArray; import org.apache.commons.lang3.StringUtils; @@ -66,14 +63,14 @@ public class GraphMappingUtils { return MainEntityType.result.name().equals(getMainType(type)); } - public static Predicate instanceFilter = s -> instanceFieldFilter.contains(s); + public static RelatedEntity asRelatedEntity(TypedRow e) { - public static EntityRelEntity asRelatedEntity(EntityRelEntity e) { + final DocumentContext j = JsonPath.parse(e.getOaf()); + final RelatedEntity re = new RelatedEntity(); + re.setId(j.read("$.id")); + re.setType(e.getType()); - final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); - final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType()); - - switch (EntityType.valueOf(e.getSource().getType())) { + switch (EntityType.valueOf(e.getType())) { case publication: case dataset: case otherresearchproduct: @@ -147,14 +144,11 @@ public class GraphMappingUtils { break; } - return new EntityRelEntity().setSource( - new TypedRow() - .setSourceId(e.getSource().getSourceId()) - .setDeleted(e.getSource().getDeleted()) - .setType(e.getSource().getType()) - .setOaf(serialize(re))); + + return re; } + private static KeyValue asKV(LinkedHashMap j) { final KeyValue kv = new KeyValue(); kv.setKey((String) j.get("key")); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index ffbe54904b..f2b3aa2e73 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -7,9 +7,7 @@ import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLTag; -import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; -import eu.dnetlib.dhp.oa.provision.model.Tuple2; +import eu.dnetlib.dhp.oa.provision.model.*; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.lang3.StringUtils; diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index a28174cce2..e981c450ea 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -65,7 +65,7 @@ yarn cluster build_adjacency_lists - eu.dnetlib.dhp.oa.provision.SparkXmlRecordBuilderJob + eu.dnetlib.dhp.oa.provision.SparkXmlRecordBuilderJob_v2 dhp-graph-provision-${projectVersion}.jar --executor-cores ${sparkExecutorCoresForJoining} @@ -75,6 +75,7 @@ --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.network.timeout=10000000 -mt yarn -is ${isLookupUrl} From 1402eb1fe7e7a0599003b5afbe8ab7e519a0ae32 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: 
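
A runnable toy version of the join strategy described in the GraphJoiner_v2 Javadoc, (((T.id join R.target) union S) groupby S.id), using plain string pairs instead of the OAF model classes; this is illustrative only and not part of the patch:

    // Toy illustration: entities S keyed by id, relations R as (source, target);
    // resolve the target side of each relation, then group the neighbours per source id.
    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.api.java.function.MapGroupsFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;
    import java.util.Arrays;

    public class AdjacencyListSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("adjacency-sketch").getOrCreate();

            // S: entities keyed by id
            Dataset<Tuple2<String, String>> entities = spark.createDataset(
                    Arrays.asList(new Tuple2<>("p1", "publication-1"), new Tuple2<>("d1", "dataset-1")),
                    Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

            // R: relations as (source, target)
            Dataset<Tuple2<String, String>> rels = spark.createDataset(
                    Arrays.asList(new Tuple2<>("p1", "d1")),
                    Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

            // T.id join R.target: resolve the target side of every relation
            Dataset<Tuple2<String, String>> bySource = rels
                    .joinWith(entities, rels.col("_2").equalTo(entities.col("_1")), "inner")
                    .map((MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, String>>, Tuple2<String, String>>)
                                    t -> new Tuple2<>(t._1()._1(), t._2()._2()),
                            Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

            // (... union S) groupby S.id: hang the resolved neighbours under each source entity
            entities
                    .joinWith(bySource, entities.col("_1").equalTo(bySource.col("_1")), "left")
                    .groupByKey((MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, String>>, String>)
                            t -> t._1()._1(), Encoders.STRING())
                    .mapGroups((MapGroupsFunction<String, Tuple2<Tuple2<String, String>, Tuple2<String, String>>, String>)
                            (id, values) -> {
                                StringBuilder sb = new StringBuilder(id).append(" ->");
                                values.forEachRemaining(v -> {
                                    if (v._2() != null) {
                                        sb.append(" ").append(v._2()._2());
                                    }
                                });
                                return sb.toString();
                            }, Encoders.STRING())
                    .show(false);
            // prints: p1 -> dataset-1   and   d1 ->

            spark.stop();
        }
    }
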
Wed, 1 Apr 2020 15:38:50 +0200 Subject: [PATCH 02/13] cleanup --- .../dhp/oa/provision/GraphJoiner_v2.java | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java index d67493f432..236ef93ec8 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java @@ -58,26 +58,6 @@ public class GraphJoiner_v2 implements Serializable { StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()) )); - private static final StructType TYPED_ROW_SCHEMA = StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("sourceId", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("targetId", DataTypes.StringType, true, Metadata.empty()), - StructField$.MODULE$.apply("deleted", DataTypes.BooleanType, false, Metadata.empty()), - StructField$.MODULE$.apply("type", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("relType", DataTypes.StringType, true, Metadata.empty()), - StructField$.MODULE$.apply("subRelType", DataTypes.StringType, true, Metadata.empty()), - StructField$.MODULE$.apply("relClass", DataTypes.StringType, true, Metadata.empty()), - StructField$.MODULE$.apply("oaf", DataTypes.BinaryType, false, Metadata.empty()) - )); - - private static final StructType ENTITY_REL_ENTITY_SCHEMA = StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("source", TYPED_ROW_SCHEMA, false, Metadata.empty()), - StructField$.MODULE$.apply("relation", TYPED_ROW_SCHEMA, true, Metadata.empty()), - StructField$.MODULE$.apply("target", TYPED_ROW_SCHEMA, false, Metadata.empty()) - )); - - private SparkSession spark; private ContextMapper contextMapper; From 9c7092416a9a535792389ac72206868ccc4eabc5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 1 Apr 2020 19:07:30 +0200 Subject: [PATCH 03/13] dataset based provision WIP --- .../dhp/oa/provision/GraphJoiner_v2.java | 25 ++++------------- .../SparkXmlRecordBuilderJob_v2.java | 10 +++---- .../input_params_build_adjacency_lists.json | 3 +- .../oa/provision/oozie_app/config-default.xml | 6 +--- .../dhp/oa/provision/oozie_app/workflow.xml | 28 ++++++++++++------- 5 files changed, 32 insertions(+), 40 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java index 236ef93ec8..d9f79a9674 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java @@ -45,19 +45,12 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntit */ public class GraphJoiner_v2 implements Serializable { - public static final int LIMIT = 1000000; private Map accumulators = Maps.newHashMap(); public static final int MAX_RELS = 100; public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; - private static final StructType KV_SCHEMA = StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), - 
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()) - )); - private SparkSession spark; private ContextMapper contextMapper; @@ -105,7 +98,6 @@ public class GraphJoiner_v2 implements Serializable { value.getId(), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) - .limit(LIMIT) .cache(); System.out.println("Entities schema:"); @@ -115,7 +107,6 @@ public class GraphJoiner_v2 implements Serializable { Dataset rels = readPathRelation(jsc, getInputPath()) .groupByKey((MapFunction) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class)) .flatMapGroups((FlatMapGroupsFunction) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(Relation.class)) - .limit(LIMIT) .cache(); System.out.println("Relation schema:"); @@ -169,7 +160,6 @@ public class GraphJoiner_v2 implements Serializable { final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); grouped .map((MapFunction) value -> recordFactory.build(value), Encoders.STRING()) - .limit(LIMIT) .write() .text(getOutPath() + "/xml"); /* @@ -245,13 +235,11 @@ public class GraphJoiner_v2 implements Serializable { * @return the JavaPairRDD indexed by entity identifier */ private Dataset readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { - RDD rdd = sc.textFile(inputPath + "/" + type) - .map((Function) s -> RowFactory.create("", s)) + RDD rdd = sc.textFile(inputPath + "/" + type) .rdd(); - return getSpark().createDataFrame(rdd, KV_SCHEMA) - .map((MapFunction) row -> { - final String s = row.getAs("value"); + return getSpark().createDataset(rdd, Encoders.STRING()) + .map((MapFunction) s -> { final DocumentContext json = JsonPath.parse(s); final TypedRow t = new TypedRow(); t.setId(json.read("$.id")); @@ -270,12 +258,11 @@ public class GraphJoiner_v2 implements Serializable { * @return the JavaRDD containing all the relationships */ private Dataset readPathRelation(final JavaSparkContext sc, final String inputPath) { - final RDD rdd = sc.textFile(inputPath + "/relation") - .map((Function) s -> RowFactory.create("", s)) + final RDD rdd = sc.textFile(inputPath + "/relation") .rdd(); - return getSpark().createDataFrame(rdd, KV_SCHEMA) - .map((MapFunction) value -> new ObjectMapper().readValue(value.getAs("value"), Relation.class), Encoders.bean(Relation.class)); + return getSpark().createDataset(rdd, Encoders.STRING()) + .map((MapFunction) s -> new ObjectMapper().readValue(s, Relation.class), Encoders.bean(Relation.class)); } private ObjectMapper getObjectMapper() { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java index 3b119cebb4..e4124e52fb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java @@ -17,23 +17,23 @@ public class SparkXmlRecordBuilderJob_v2 { SparkXmlRecordBuilderJob_v2.class.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); parser.parseArgument(args); - final String master = parser.get("master"); - try(SparkSession spark = getSession(master)) { + try(SparkSession spark = getSession(parser)) { final String inputPath = parser.get("sourcePath"); final 
String outputPath = parser.get("outputPath"); final String isLookupUrl = parser.get("isLookupUrl"); final String otherDsTypeId = parser.get("otherDsTypeId"); + new GraphJoiner_v2(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) .adjacencyLists(); } } - private static SparkSession getSession(String master) { + private static SparkSession getSession(ArgumentApplicationParser parser) { final SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.set("spark.sql.shuffle.partitions", "500"); + conf.set("spark.sql.shuffle.partitions", parser.get("sparkSqlShufflePartitions")); conf.registerKryoClasses(new Class[]{ Author.class, Context.class, @@ -74,7 +74,7 @@ public class SparkXmlRecordBuilderJob_v2 { .builder() .config(conf) .appName(SparkXmlRecordBuilderJob_v2.class.getSimpleName()) - .master(master) + .master(parser.get("master")) .getOrCreate(); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json index a5d20a55f4..bbac579feb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json @@ -3,5 +3,6 @@ {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}, - {"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true} + {"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}, + {"paramName":"sp", "paramLongName":"sparkSqlShufflePartitions", "paramDescription": "Configures the number of partitions to use when shuffling data for joins or aggregations", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml index 624d3ea763..c0364c2cf2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml @@ -19,13 +19,9 @@ hive_metastore_uris thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - hive_db_name - openaire - spark2YarnHistoryServerAddress - http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088 + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 spark2EventLogDir diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index e981c450ea..f4bd3f19ee 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -2,19 +2,27 @@ - hive_db_name - the target hive database name - - - sparkDriverMemory + sparkDriverMemoryForJoining memory for driver process - sparkExecutorMemory + sparkExecutorMemoryForJoining memory for individual executor - sparkExecutorCores + sparkExecutorCoresForJoining + number of cores used by single executor + + + sparkDriverMemoryForIndexing + memory for driver process + + + sparkExecutorMemoryForIndexing + memory for individual executor + + + sparkExecutorCoresForIndexing number of cores used by single executor @@ -75,13 +83,13 @@ --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.network.timeout=10000000 -mt yarn -is ${isLookupUrl} -t ${otherDsTypeId} - --sourcePath${sourcePath} - --outputPath${outputPath} + -s${sourcePath} + -o${outputPath} + -sp${sparkSqlShufflePartitions} From daa26acc9d2863fda801ae17087ce8f68082978f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 2 Apr 2020 16:15:50 +0200 Subject: [PATCH 04/13] dataset based provision WIP, fixed spark2EventLogDir --- .../dhp/oa/provision/GraphJoiner_v2.java | 48 +++++++++++-------- .../oa/provision/oozie_app/config-default.xml | 2 +- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java index d9f79a9674..5eac12e5da 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java @@ -86,24 +86,36 @@ public class GraphJoiner_v2 implements Serializable { Dataset publication = readPathEntity(jsc, getInputPath(), "publication"); // create the union between all the entities - Dataset> entities = - datasource - .union(organization) - .union(project) - .union(dataset) - .union(otherresearchproduct) - .union(software) - .union(publication) - .map((MapFunction>) value -> new Tuple2<>( - value.getId(), - value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) - .cache(); + datasource + .union(organization) + .union(project) + .union(dataset) + .union(otherresearchproduct) + .union(software) + .union(publication) + .repartition(20000) + .write() + .parquet(getOutPath() + "/entities"); + Dataset> entities = getSpark() + .read() + .load(getOutPath() + "/entities") + .map((MapFunction>) r -> { + TypedRow t = new TypedRow(); + t.setId(r.getAs("id")); + t.setDeleted(r.getAs("deleted")); + t.setType(r.getAs("type")); + t.setOaf(r.getAs("oaf")); + + return new Tuple2<>(t.getId(), t); + }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))); + + System.out.println("Entities, number of partitions: " + entities.rdd().getNumPartitions()); System.out.println("Entities schema:"); entities.printSchema(); - // reads the relationships +/* + // reads the relationships Dataset rels = readPathRelation(jsc, getInputPath()) .groupByKey((MapFunction) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class)) .flatMapGroups((FlatMapGroupsFunction) (key, 
values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(Relation.class)) @@ -126,7 +138,7 @@ public class GraphJoiner_v2 implements Serializable { e.setRelation(t._1()._2()); e.setTarget(asRelatedEntity(t._2()._2())); return e; - }, Encoders.bean(EntityRelEntity.class)) + }, Encoders.kryo(EntityRelEntity.class)) .map((MapFunction>) e -> new Tuple2<>(e.getRelation().getSource(), e), Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); @@ -160,14 +172,12 @@ public class GraphJoiner_v2 implements Serializable { final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); grouped .map((MapFunction) value -> recordFactory.build(value), Encoders.STRING()) - .write() - .text(getOutPath() + "/xml"); - /* .javaRDD() .mapToPair((PairFunction, String, String>) t -> new Tuple2<>(t._1(), t._2())) .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - */ + +*/ return this; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml index c0364c2cf2..b1a494ac46 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml @@ -25,6 +25,6 @@ spark2EventLogDir - /user/spark/applicationHistory + /user/spark/spark2ApplicationHistory \ No newline at end of file From 24b2c9012e2702086fc97dd53f9ddc7d7cfaea8d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 2 Apr 2020 18:44:09 +0200 Subject: [PATCH 05/13] dataset based provision WIP --- .../dhp/oa/provision/GraphJoiner_v2.java | 69 +++++++++++++++---- .../oa/provision/oozie_app/config-default.xml | 12 ++++ .../dhp/oa/provision/oozie_app/workflow.xml | 34 +++++---- 3 files changed, 89 insertions(+), 26 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java index 5eac12e5da..3ee72c3185 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java @@ -23,6 +23,8 @@ import java.io.IOException; import java.io.Serializable; import java.util.*; +import static org.apache.spark.sql.functions.*; + import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntity; /** @@ -93,9 +95,10 @@ public class GraphJoiner_v2 implements Serializable { .union(otherresearchproduct) .union(software) .union(publication) - .repartition(20000) - .write() - .parquet(getOutPath() + "/entities"); + .repartition(7000) + .write() + .partitionBy("id") + .parquet(getOutPath() + "/entities"); Dataset> entities = getSpark() .read() @@ -108,29 +111,51 @@ public class GraphJoiner_v2 implements Serializable { t.setOaf(r.getAs("oaf")); return new Tuple2<>(t.getId(), t); - }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))); + }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) + .cache(); System.out.println("Entities, number of partitions: " + entities.rdd().getNumPartitions()); System.out.println("Entities schema:"); entities.printSchema(); 
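The hunk above keys every entity by its identifier and encodes the pair with Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class)): the string key stays a regular "_1" column that later joins and groupings can reference, while the payload travels as an opaque Kryo blob. A minimal, self-contained sketch of that keyed-Dataset pattern follows; only TypedRow comes from this patch, the sample JSON and the local master are made up for illustration.

    import com.jayway.jsonpath.JsonPath;
    import eu.dnetlib.dhp.oa.provision.model.TypedRow;
    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    import java.util.Arrays;

    public class KeyedKryoSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .master("local[*]")                       // illustrative, not the cluster setup
                    .appName(KeyedKryoSketch.class.getSimpleName())
                    .getOrCreate();

            Dataset<Tuple2<String, TypedRow>> byId = spark
                    .createDataset(Arrays.asList("{\"id\":\"50|sample_entity\"}"), Encoders.STRING())
                    .map((MapFunction<String, Tuple2<String, TypedRow>>) json -> {
                        TypedRow t = new TypedRow();          // payload class introduced by this patch
                        t.setId(JsonPath.read(json, "$.id"));
                        t.setOaf(json);
                        return new Tuple2<>(t.getId(), t);
                    }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class)));

            byId.printSchema();       // "_1" is a plain string column, "_2" is a kryo-serialized binary
            byId.select("_1").show(); // the key column remains usable for joins and grouping

            spark.stop();
        }
    }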
+ System.out.println("Entities count:" + entities.count()); -/* // reads the relationships - Dataset rels = readPathRelation(jsc, getInputPath()) + readPathRelation(jsc, getInputPath()) .groupByKey((MapFunction) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class)) - .flatMapGroups((FlatMapGroupsFunction) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(Relation.class)) + .flatMapGroups((FlatMapGroupsFunction) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.kryo(Relation.class)) + .repartition(3000) + .write() + .partitionBy("source", "target") + .parquet(getOutPath() + "/relations"); + + Dataset rels = getSpark() + .read() + .load(getOutPath() + "/relations") + .map((MapFunction) r -> { + Relation rel = new Relation(); + rel.setSource(r.getAs("source")); + rel.setTarget(r.getAs("target")); + rel.setRelType(r.getAs("relType")); + rel.setSubRelType(r.getAs("subRelType")); + rel.setRelClass(r.getAs("relClass")); + rel.setDataInfo(r.getAs("dataInfo")); + rel.setCollectedFrom(r.getList(r.fieldIndex("collectedFrom"))); + return rel; + }, Encoders.kryo(Relation.class)) .cache(); System.out.println("Relation schema:"); - rels.printSchema(); + System.out.println("Relation, number of partitions: " + rels.rdd().getNumPartitions()); + System.out.println("Relation schema:"); + entities.printSchema(); + System.out.println("Relation count:" + rels.count()); + /* Dataset> relsByTarget = rels .map((MapFunction>) r -> new Tuple2<>(r.getTarget(), r), Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); - System.out.println("Relation by target schema:"); - relsByTarget.printSchema(); - Dataset> bySource = relsByTarget + relsByTarget .joinWith(entities, relsByTarget.col("_1").equalTo(entities.col("_1")), "inner") .filter((FilterFunction, Tuple2>>) value -> value._2()._2().getDeleted() == false) .map((MapFunction, Tuple2>, EntityRelEntity>) t -> { @@ -139,12 +164,28 @@ public class GraphJoiner_v2 implements Serializable { e.setTarget(asRelatedEntity(t._2()._2())); return e; }, Encoders.kryo(EntityRelEntity.class)) - .map((MapFunction>) e -> new Tuple2<>(e.getRelation().getSource(), e), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); + .repartition(20000) + .write() + .parquet(getOutPath() + "/bySource"); - System.out.println("bySource schema"); + Dataset> bySource = getSpark() + .read() + .load(getOutPath() + "/bySource") + .map(new MapFunction() { + @Override + public EntityRelEntity call(Row value) throws Exception { + return null; + } + }, Encoders.kryo(EntityRelEntity.class)) + .map((MapFunction>) e -> new Tuple2<>(e.getRelation().getSource(), e), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))) + + System.out.println("bySource schema"); bySource.printSchema(); + + + Dataset joined = entities .joinWith(bySource, entities.col("_1").equalTo(bySource.col("_1")), "left") .map((MapFunction, Tuple2>, EntityRelEntity>) value -> { diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml index b1a494ac46..c070d83384 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml @@ -27,4 +27,16 @@ spark2EventLogDir /user/spark/spark2ApplicationHistory 
+ + spark2ExtraListeners + "com.cloudera.spark.lineage.NavigatorAppListener" + + + spark2SqlQueryExecutionListeners + "com.cloudera.spark.lineage.NavigatorQueryListener" + + + oozieActionShareLibForSpark2 + spark2 + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index f4bd3f19ee..194cd43c81 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -25,6 +25,20 @@ sparkExecutorCoresForIndexing number of cores used by single executor + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + spark2YarnHistoryServerAddress spark 2.* yarn history server address @@ -40,12 +54,8 @@ ${nameNode} - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} @@ -76,11 +86,11 @@ eu.dnetlib.dhp.oa.provision.SparkXmlRecordBuilderJob_v2 dhp-graph-provision-${projectVersion}.jar - --executor-cores ${sparkExecutorCoresForJoining} - --executor-memory ${sparkExecutorMemoryForJoining} + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} --driver-memory=${sparkDriverMemoryForJoining} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} @@ -107,8 +117,8 @@ --executor-memory ${sparkExecutorMemoryForIndexing} --driver-memory=${sparkDriverMemoryForIndexing} --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} From 3d1b637cab7628051d15fb60855cd7edcfc3aed8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sat, 4 Apr 2020 14:03:43 +0200 Subject: [PATCH 06/13] dataset based provision WIP --- dhp-common/pom.xml | 20 + .../common/FunctionalInterfaceSupport.java | 56 +++ .../eu/dnetlib/dhp/common/HdfsSupport.java | 57 +++ .../dhp/common/SparkSessionSupport.java | 57 +++ .../dnetlib/dhp/common/ThrowingSupport.java | 76 ++++ .../dnetlib/dhp/common/HdfsSupportTest.java | 78 ++++ .../dnetlib/dhp/common/ModelSupportTest.java | 36 ++ .../dhp/common/SparkSessionSupportTest.java | 54 +++ .../dhp/schema/common/ModelSupport.java | 51 +++ .../eu/dnetlib/dhp/schema/oaf/Relation.java | 3 +- 
.../oa/provision/AdjacencyListBuilderJob.java | 167 +++++++++ .../CreateRelatedEntitiesJob_phase1.java | 157 ++++++++ .../CreateRelatedEntitiesJob_phase2.java | 168 +++++++++ .../dhp/oa/provision/GraphJoiner_v2.java | 346 ------------------ .../dhp/oa/provision/PrepareRelationsJob.java | 132 +++++++ .../dhp/oa/provision/SparkXmlIndexingJob.java | 73 ++-- .../SparkXmlRecordBuilderJob_v2.java | 81 ---- .../dhp/oa/provision/XmlConverterJob.java | 149 ++++++++ .../oa/provision/model/EntityRelEntity.java | 21 +- .../dhp/oa/provision/model/JoinedEntity.java | 7 +- .../dnetlib/dhp/oa/provision/model/Links.java | 4 +- .../oa/provision/model/SortableRelation.java | 34 ++ .../dhp/oa/provision/model/Tuple2.java | 19 +- .../oa/provision/utils/GraphMappingUtils.java | 232 ++++++------ .../input_params_build_adjacency_lists.json | 18 +- .../input_params_prepare_relations.json | 20 + ...input_params_related_entities_pahase1.json | 32 ++ ...input_params_related_entities_pahase2.json | 26 ++ .../provision/input_params_update_index.json | 2 +- .../provision/input_params_xml_converter.json | 26 ++ .../dhp/oa/provision/oozie_app/workflow.xml | 157 +++++++- 31 files changed, 1739 insertions(+), 620 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/common/ModelSupportTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 
1268afa3a1..d224ebc9f8 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -13,6 +13,26 @@ jar + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + + org.apache.hadoop + hadoop-common + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + commons-cli commons-cli diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java new file mode 100644 index 0000000000..d78520f55b --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java @@ -0,0 +1,56 @@ +package eu.dnetlib.dhp.common; + +import java.io.Serializable; +import java.util.function.Supplier; + +/** + * Provides serializable and throwing extensions to standard functional interfaces. + */ +public class FunctionalInterfaceSupport { + + private FunctionalInterfaceSupport() { + } + + /** + * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying + * functions externally. + * + * @param + */ + @FunctionalInterface + public interface SerializableSupplier extends Supplier, Serializable { + } + + /** + * Extension of consumer accepting functions throwing an exception. + * + * @param + * @param + */ + @FunctionalInterface + public interface ThrowingConsumer { + void accept(T t) throws E; + } + + /** + * Extension of supplier accepting functions throwing an exception. + * + * @param + * @param + */ + @FunctionalInterface + public interface ThrowingSupplier { + T get() throws E; + } + + /** + * Extension of runnable accepting functions throwing an exception. + * + * @param + */ + @FunctionalInterface + public interface ThrowingRunnable { + void run() throws E; + } + +} \ No newline at end of file diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java new file mode 100644 index 0000000000..05beaa51e1 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java @@ -0,0 +1,57 @@ +package eu.dnetlib.dhp.common; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; + +/** + * HDFS utility methods. + */ +public class HdfsSupport { + private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class); + + private HdfsSupport() { + } + + /** + * Removes a path (file or dir) from HDFS. + * + * @param path Path to be removed + * @param configuration Configuration of hadoop env + */ + public static void remove(String path, Configuration configuration) { + logger.info("Removing path: {}", path); + rethrowAsRuntimeException(() -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + if (fileSystem.exists(f)) { + fileSystem.delete(f, true); + } + }); + } + + /** + * Lists hadoop files located below path or alternatively lists subdirs under path. 
+ * + * @param path Path to be listed for hadoop files + * @param configuration Configuration of hadoop env + * @return List with string locations of hadoop files + */ + public static List listFiles(String path, Configuration configuration) { + logger.info("Listing files in path: {}", path); + return rethrowAsRuntimeException(() -> Arrays + .stream(FileSystem.get(configuration).listStatus(new Path(path))) + .filter(FileStatus::isDirectory) + .map(x -> x.getPath().toString()) + .collect(Collectors.toList())); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java new file mode 100644 index 0000000000..f42ee1c581 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java @@ -0,0 +1,57 @@ +package eu.dnetlib.dhp.common; + +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; + +import java.util.Objects; +import java.util.function.Function; + +/** + * SparkSession utility methods. + */ +public class SparkSessionSupport { + + private SparkSessionSupport() { + } + + /** + * Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession + * when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession(SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { + runWithSparkSession(c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); + } + + /** + * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops SparkSession + * when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param sparkSessionBuilder Builder of SparkSession + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession(Function sparkSessionBuilder, + SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { + SparkSession spark = null; + try { + spark = sparkSessionBuilder.apply(conf); + fn.accept(spark); + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + if (Objects.nonNull(spark) && isSparkSessionManaged) { + spark.stop(); + } + } + } +} \ No newline at end of file diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java new file mode 100644 index 0000000000..b32803c378 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java @@ -0,0 +1,76 @@ +package eu.dnetlib.dhp.common; + +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingRunnable; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingSupplier; + +/** + * Exception handling utility methods. + */ +public class ThrowingSupport { + + private ThrowingSupport() { + } + + /** + * Executes given runnable and rethrows any exceptions as RuntimeException. 
+ * + * @param fn Runnable to be executed + * @param Type of exception thrown + */ + public static void rethrowAsRuntimeException(ThrowingRunnable fn) { + try { + fn.run(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Executes given runnable and rethrows any exceptions as RuntimeException with custom message. + * + * @param fn Runnable to be executed + * @param msg Message to be set for rethrown exception + * @param Type of exception thrown + */ + public static void rethrowAsRuntimeException(ThrowingRunnable fn, String msg) { + try { + fn.run(); + } catch (Exception e) { + throw new RuntimeException(msg, e); + } + } + + /** + * Executes given supplier and rethrows any exceptions as RuntimeException. + * + * @param fn Supplier to be executed + * @param Type of returned value + * @param Type of exception thrown + * @return Result of supplier execution + */ + public static T rethrowAsRuntimeException(ThrowingSupplier fn) { + try { + return fn.get(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Executes given supplier and rethrows any exceptions as RuntimeException with custom message. + * + * @param fn Supplier to be executed + * @param msg Message to be set for rethrown exception + * @param Type of returned value + * @param Type of exception thrown + * @return Result of supplier execution + */ + public static T rethrowAsRuntimeException(ThrowingSupplier fn, String msg) { + try { + return fn.get(); + } catch (Exception e) { + throw new RuntimeException(msg, e); + } + } + +} \ No newline at end of file diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java new file mode 100644 index 0000000000..f1e790ee7c --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java @@ -0,0 +1,78 @@ +package eu.dnetlib.dhp.common; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +public class HdfsSupportTest { + + @Nested + class Remove { + + @Test + public void shouldThrowARuntimeExceptionOnError() { + // when + assertThrows(RuntimeException.class, () -> + HdfsSupport.remove(null, new Configuration())); + } + + @Test + public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { + // when + HdfsSupport.remove(tempDir.toString(), new Configuration()); + + // then + assertFalse(Files.exists(tempDir)); + } + + @Test + public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { + // given + Path file = Files.createTempFile(tempDir, "p", "s"); + + // when + HdfsSupport.remove(file.toString(), new Configuration()); + + // then + assertFalse(Files.exists(file)); + } + } + + @Nested + class ListFiles { + + @Test + public void shouldThrowARuntimeExceptionOnError() { + // when + assertThrows(RuntimeException.class, () -> + HdfsSupport.listFiles(null, new Configuration())); + } + + @Test + public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { + Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); + Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); + + // when + List paths = HdfsSupport.listFiles(tempDir.toString(), new 
Configuration()); + + // then + assertEquals(2, paths.size()); + List expecteds = Arrays.stream(new String[]{subDir1.toString(), subDir2.toString()}) + .sorted().collect(Collectors.toList()); + List actuals = paths.stream().sorted().collect(Collectors.toList()); + assertTrue(actuals.get(0).contains(expecteds.get(0))); + assertTrue(actuals.get(1).contains(expecteds.get(1))); + } + } +} \ No newline at end of file diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/ModelSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/ModelSupportTest.java new file mode 100644 index 0000000000..bfed019e9b --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/ModelSupportTest.java @@ -0,0 +1,36 @@ +package eu.dnetlib.dhp.common; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ModelSupportTest { + + @Nested + class IsSubClass { + + @Test + public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class); + + // then + assertFalse(result); + } + + @Test + public void shouldReturnTrueWhenSubClassExtendsSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class); + + // then + assertTrue(result); + } + } +} \ No newline at end of file diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java new file mode 100644 index 0000000000..bc2dce3cff --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.common; + +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import java.util.function.Function; + +import static org.mockito.Mockito.*; + +public class SparkSessionSupportTest { + + @Nested + class RunWithSparkSession { + + @Test + public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() throws Exception { + // given + SparkSession spark = mock(SparkSession.class); + SparkConf conf = mock(SparkConf.class); + Function sparkSessionBuilder = mock(Function.class); + when(sparkSessionBuilder.apply(conf)).thenReturn(spark); + ThrowingConsumer fn = mock(ThrowingConsumer.class); + + // when + SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, false, fn); + + // then + verify(sparkSessionBuilder).apply(conf); + verify(fn).accept(spark); + verify(spark, never()).stop(); + } + + @Test + public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() throws Exception { + // given + SparkSession spark = mock(SparkSession.class); + SparkConf conf = mock(SparkConf.class); + Function sparkSessionBuilder = mock(Function.class); + when(sparkSessionBuilder.apply(conf)).thenReturn(spark); + ThrowingConsumer fn = mock(ThrowingConsumer.class); + + // when + SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, true, fn); + + // then + verify(sparkSessionBuilder).apply(conf); + 
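Taken together, the common helpers added above (HdfsSupport, SparkSessionSupport, ThrowingSupport) are meant to be composed in the provision job drivers introduced later in this patch. A minimal driver sketch follows; everything except the two helper calls is illustrative.

    import eu.dnetlib.dhp.common.HdfsSupport;
    import org.apache.spark.SparkConf;

    import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

    public class ExampleProvisionJob {                               // illustrative class name
        public static void main(String[] args) {
            String inputPath = args[0];                              // assumption: paths passed as plain args
            String outputPath = args[1];

            SparkConf conf = new SparkConf();
            conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

            runWithSparkSession(conf, Boolean.TRUE, spark -> {
                // clean the target dir of any previous run, then execute the job body
                HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
                spark.read().textFile(inputPath).write().text(outputPath);   // placeholder transformation
            });
        }
    }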
verify(fn).accept(spark); + verify(spark, times(1)).stop(); + } + } +} \ No newline at end of file diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java new file mode 100644 index 0000000000..3c774aa385 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -0,0 +1,51 @@ +package eu.dnetlib.dhp.schema.common; + +import eu.dnetlib.dhp.schema.oaf.Oaf; + +/** + * Inheritance utility methods. + */ +public class ModelSupport { + + private ModelSupport() { + } + + /** + * Checks subclass-superclass relationship. + * + * @param subClazzObject Subclass object instance + * @param superClazzObject Superclass object instance + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass(X subClazzObject, Y superClazzObject) { + return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); + } + + /** + * Checks subclass-superclass relationship. + * + * @param subClazzObject Subclass object instance + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass(X subClazzObject, Class superClazz) { + return isSubClass(subClazzObject.getClass(), superClazz); + } + + /** + * Checks subclass-superclass relationship. + * + * @param subClazz Subclass class + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass(Class subClazz, Class superClazz) { + return superClazz.isAssignableFrom(subClazz); + } +} \ No newline at end of file diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 6738b86938..e2471cd898 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -92,8 +92,7 @@ public class Relation extends Oaf { subRelType.equals(relation.subRelType) && relClass.equals(relation.relClass) && source.equals(relation.source) && - target.equals(relation.target) && - Objects.equals(collectedFrom, relation.collectedFrom); + target.equals(relation.target); } @Override diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java new file mode 100644 index 0000000000..dcb3ac171d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -0,0 +1,167 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Encoders; 
+import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.IOException; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again + * by E, finally grouped by E.id; + * + * The workflow is organized in different parts aimed to to reduce the complexity of the operation + * 1) PrepareRelationsJob: + * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity + * can be linked at most to 100 other objects + * + * 2) JoinRelationEntityByTargetJob: + * prepare tuples [source entity - relation - target entity] (S - R - T): + * for each entity type E_i + * join (R.target = E_i.id), + * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] + * join (E_i.id = [R - T_i].source), where E_i becomes the source entity S + * + * 3) AdjacencyListBuilderJob: + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * + * 4) XmlConverterJob: + * convert the JoinedEntities as XML records + */ +public class AdjacencyListBuilderJob { + + private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString( + AdjacencyListBuilderJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(getKryoClasses()); + + runWithSparkSession(conf, isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + createAdjacencyLists(spark, inputPath, outputPath); + }); + + } + + private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) { + + RDD joined = spark.read() + .load(inputPath) + .as(Encoders.kryo(EntityRelEntity.class)) + .javaRDD() + .map(e -> getJoinedEntity(e)) + .mapToPair(e -> new Tuple2<>(e.getEntity().getId(), e)) + .reduceByKey((j1, j2) -> getJoinedEntity(j1, j2)) + .map(Tuple2::_2) + .rdd(); + + spark.createDataset(joined, Encoders.bean(JoinedEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + + } + + private static 
JoinedEntity getJoinedEntity(JoinedEntity j1, JoinedEntity j2) { + JoinedEntity je = new JoinedEntity(); + je.setEntity(je.getEntity()); + je.setType(j1.getType()); + + Links links = new Links(); + links.addAll(j1.getLinks()); + links.addAll(j2.getLinks()); + + return je; + } + + private static JoinedEntity getJoinedEntity(EntityRelEntity e) { + JoinedEntity j = new JoinedEntity(); + j.setEntity(toOafEntity(e.getEntity())); + j.setType(EntityType.valueOf(e.getEntity().getType())); + Links links = new Links(); + links.add(new eu.dnetlib.dhp.oa.provision.model.Tuple2(e.getRelation(), e.getTarget())); + j.setLinks(links); + return j; + } + + private static OafEntity toOafEntity(TypedRow typedRow) { + return parseOaf(typedRow.getOaf(), typedRow.getType()); + } + + private static OafEntity parseOaf(final String json, final String type) { + try { + switch (GraphMappingUtils.EntityType.valueOf(type)) { + case publication: + return OBJECT_MAPPER.readValue(json, Publication.class); + case dataset: + return OBJECT_MAPPER.readValue(json, Dataset.class); + case otherresearchproduct: + return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); + case software: + return OBJECT_MAPPER.readValue(json, Software.class); + case datasource: + return OBJECT_MAPPER.readValue(json, Datasource.class); + case organization: + return OBJECT_MAPPER.readValue(json, Organization.class); + case project: + return OBJECT_MAPPER.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java new file mode 100644 index 0000000000..0b153f8269 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -0,0 +1,157 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
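In AdjacencyListBuilderJob above, the reduceByKey step merges two partially aggregated JoinedEntity objects computed for the same id. A sketch of the merge semantics that step appears to aim for — keep one copy of the entity and concatenate the two link lists — is given below; this is an assumption about the intent, since the committed getJoinedEntity(j1, j2) never attaches the combined links to the result.

    import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
    import eu.dnetlib.dhp.oa.provision.model.Links;

    public class JoinedEntityMergeSketch {

        /** Assumed merge of two partial adjacency lists built for the same entity id. */
        public static JoinedEntity merge(JoinedEntity j1, JoinedEntity j2) {
            JoinedEntity merged = new JoinedEntity();
            merged.setEntity(j1.getEntity());   // both sides carry the same source entity
            merged.setType(j1.getType());

            Links links = new Links();
            links.addAll(j1.getLinks());        // links accumulated on one side
            links.addAll(j2.getLinks());        // plus the links accumulated on the other
            merged.setLinks(links);             // attach the combined list to the result

            return merged;
        }
    }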
+ * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again + * by E, finally grouped by E.id; + * + * The workflow is organized in different parts aimed to to reduce the complexity of the operation + * 1) PrepareRelationsJob: + * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity + * can be linked at most to 100 other objects + * + * 2) CreateRelatedEntitiesJob_phase1: + * prepare tuples [relation - target entity] (R - T): + * for each entity type E_i + * join (R.target = E_i.id), + * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] + * save the tuples [R - T_i] in append mode + * + * 3) CreateRelatedEntitiesJob_phase2: + * prepare tuples [source entity - relation - target entity] (S - R - T): + * create the union of the each entity type, hash by id (S) + * for each [R - T_i] produced in phase1 + * join S.id = [R - T_i].source to produce (S_i - R - T_i) + * save in append mode + * + * 4) AdjacencyListBuilderJob: + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * + * 5) XmlConverterJob: + * convert the JoinedEntities as XML records + */ +public class CreateRelatedEntitiesJob_phase1 { + + private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils.toString( + PrepareRelationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); + + String inputEntityPath = parser.get("inputEntityPath"); + log.info("inputEntityPath: {}", inputEntityPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); + + Class entityClazz = (Class) Class.forName(graphTableClassName); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(getKryoClasses()); + + runWithSparkSession(conf, isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); + }); + } + + private static void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class entityClazz, String outputPath) { + + Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) + .map((MapFunction>) r -> new Tuple2<>(r.getTarget(), r), + Encoders.tuple(Encoders.STRING(), 
Encoders.kryo(SortableRelation.class))); + + Dataset> entities = readPathEntity(spark, inputEntityPath, entityClazz) + .map((MapFunction>) e -> new Tuple2<>(e.getId(), e), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(entityClazz))) + .cache(); + + relsByTarget + .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") + .filter((FilterFunction, Tuple2>>) + value -> value._2()._2().getDataInfo().getDeletedbyinference() == false) + .map((MapFunction, Tuple2>, EntityRelEntity>) + t -> new EntityRelEntity(t._1()._2(), GraphMappingUtils.asRelatedEntity(t._2()._2(), entityClazz)), + Encoders.bean(EntityRelEntity.class)) + .write() + .mode(SaveMode.Append) + .parquet(outputPath); + } + + private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { + + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)); + } + + /** + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file, + * + * @param spark + * @param relationPath + * @return the Dataset containing all the relationships + */ + private static Dataset readPathRelation(SparkSession spark, final String relationPath) { + + log.info("Reading relations from: {}", relationPath); + return spark.read() + .load(relationPath) + .as(Encoders.bean(SortableRelation.class)); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java new file mode 100644 index 0000000000..6c7f1efd74 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -0,0 +1,168 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.TypedRow; +import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). 
+ * + * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again + * by E, finally grouped by E.id; + * + * The workflow is organized in different parts aimed to to reduce the complexity of the operation + * 1) PrepareRelationsJob: + * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity + * can be linked at most to 100 other objects + * + * 2) CreateRelatedEntitiesJob_phase1: + * prepare tuples [relation - target entity] (R - T): + * for each entity type E_i + * join (R.target = E_i.id), + * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] + * save the tuples [R - T_i] in append mode + * + * 3) CreateRelatedEntitiesJob_phase2: + * prepare tuples [source entity - relation - target entity] (S - R - T): + * create the union of the each entity type, hash by id (S) + * for each [R - T_i] produced in phase1 + * join S.id = [R - T_i].source to produce (S_i - R - T_i) + * save in append mode + * + * 4) AdjacencyListBuilderJob: + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * + * 5) XmlConverterJob: + * convert the JoinedEntities as XML records + */ +public class CreateRelatedEntitiesJob_phase2 { + + private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils.toString( + PrepareRelationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath"); + log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath); + + String inputGraphPath = parser.get("inputGraphPath"); + log.info("inputGraphPath: {}", inputGraphPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(getKryoClasses()); + + runWithSparkSession(conf, isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + joinAllEntities(spark, inputRelatedEntitiesPath, inputGraphPath, outputPath); + }); + } + + private static void joinAllEntities(SparkSession spark, String inputRelatedEntitiesPath, String inputGraphPath, String outputPath) { + + Dataset> relsBySource = readRelatedEntities(spark, inputRelatedEntitiesPath); + Dataset> entities = readAllEntities(spark, inputGraphPath); + + entities + .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") + .map((MapFunction, Tuple2>, EntityRelEntity>) value -> { + EntityRelEntity re = new EntityRelEntity(); + re.setEntity(value._1()._2()); + Optional related = Optional.ofNullable(value._2()).map(Tuple2::_2); + if (related.isPresent()) { + 
re.setRelation(related.get().getRelation()); + re.setTarget(related.get().getTarget()); + } + return re; + }, Encoders.bean(EntityRelEntity.class)) + .write() + .mode(SaveMode.Append) + .parquet(outputPath); + } + + private static Dataset> readAllEntities(SparkSession spark, String inputGraphPath) { + return GraphMappingUtils.entityTypes.entrySet() + .stream() + .map((Function, Dataset>) + e -> readPathEntity(spark, inputGraphPath + "/" + e.getKey().name(), e.getValue()) + .map((MapFunction) entity -> { + TypedRow t = new TypedRow(); + t.setType(e.getKey().name()); + t.setDeleted(entity.getDataInfo().getDeletedbyinference()); + t.setId(entity.getId()); + t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); + return t; + }, Encoders.bean(TypedRow.class))) + .reduce(spark.emptyDataset(Encoders.bean(TypedRow.class)), Dataset::union) + .map((MapFunction>) + value -> new Tuple2<>(value.getId(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))); + } + + private static Dataset> readRelatedEntities(SparkSession spark, String inputRelatedEntitiesPath) { + return spark.read() + .load(inputRelatedEntitiesPath) + .as(Encoders.kryo(EntityRelEntity.class)) + .map((MapFunction>) + value -> new Tuple2<>(value.getRelation().getSource(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); + } + + + private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { + + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java deleted file mode 100644 index 3ee72c3185..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/GraphJoiner_v2.java +++ /dev/null @@ -1,346 +0,0 @@ -package eu.dnetlib.dhp.oa.provision; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Iterators; -import com.google.common.collect.Maps; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.oa.provision.utils.*; -import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.*; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.types.*; -import org.apache.spark.util.LongAccumulator; -import scala.Tuple2; - -import java.io.IOException; -import java.io.Serializable; -import java.util.*; - -import static org.apache.spark.sql.functions.*; - -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntity; - -/** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. 
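// A minimal sketch of the keyed left-outer join idiom used in CreateRelatedEntitiesJob_phase2.
// Hypothetical payloads (plain Strings) stand in for TypedRow/EntityRelEntity; only the join
// shape and the Optional guard for missing right-hand rows are illustrated here.
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Optional;

public class LeftOuterJoinSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("left-outer-sketch").master("local[*]").getOrCreate();

        Dataset<Tuple2<String, String>> entities = spark.createDataset(
                Arrays.asList(new Tuple2<>("id-1", "entity payload 1"), new Tuple2<>("id-2", "entity payload 2")),
                Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

        Dataset<Tuple2<String, String>> relations = spark.createDataset(
                Arrays.asList(new Tuple2<>("id-1", "relation payload")),
                Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

        // left_outer keeps "id-2" even though it has no relation: the right side of the outer tuple is null
        entities
            .joinWith(relations, entities.col("_1").equalTo(relations.col("_1")), "left_outer")
            .map((MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, String>>, String>) value -> {
                String entity = value._1()._2();
                // Optional guards against the missing right-hand side, exactly as in the job above
                String relation = Optional.ofNullable(value._2()).map(Tuple2::_2).orElse("no relation");
                return entity + " / " + relation;
            }, Encoders.STRING())
            .show(false);

        spark.stop();
    }
}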
- * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). - * - * The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again - * by E, finally grouped by E.id; - * - * Different manipulations of the E and R sets are introduced to reduce the complexity of the operation - * 1) treat the object payload as string, extracting only the necessary information beforehand using json path, - * it seems that deserializing it with jackson's object mapper has higher memory footprint. - * - * 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false) - * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S - * and E_target = T. Objects in T are heavily pruned by all the unnecessary information - * - * 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ ] - */ -public class GraphJoiner_v2 implements Serializable { - - private Map accumulators = Maps.newHashMap(); - - public static final int MAX_RELS = 100; - - public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; - - private SparkSession spark; - - private ContextMapper contextMapper; - - private String inputPath; - - private String outPath; - - private String otherDsTypeId; - - public GraphJoiner_v2(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) { - this.spark = spark; - this.contextMapper = contextMapper; - this.otherDsTypeId = otherDsTypeId; - this.inputPath = inputPath; - this.outPath = outPath; - - final SparkContext sc = spark.sparkContext(); - prepareAccumulators(sc); - } - - public GraphJoiner_v2 adjacencyLists() throws IOException { - - final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(getSpark().sparkContext()); - - // read each entity - Dataset datasource = readPathEntity(jsc, getInputPath(), "datasource"); - Dataset organization = readPathEntity(jsc, getInputPath(), "organization"); - Dataset project = readPathEntity(jsc, getInputPath(), "project"); - Dataset dataset = readPathEntity(jsc, getInputPath(), "dataset"); - Dataset otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct"); - Dataset software = readPathEntity(jsc, getInputPath(), "software"); - Dataset publication = readPathEntity(jsc, getInputPath(), "publication"); - - // create the union between all the entities - datasource - .union(organization) - .union(project) - .union(dataset) - .union(otherresearchproduct) - .union(software) - .union(publication) - .repartition(7000) - .write() - .partitionBy("id") - .parquet(getOutPath() + "/entities"); - - Dataset> entities = getSpark() - .read() - .load(getOutPath() + "/entities") - .map((MapFunction>) r -> { - TypedRow t = new TypedRow(); - t.setId(r.getAs("id")); - t.setDeleted(r.getAs("deleted")); - t.setType(r.getAs("type")); - t.setOaf(r.getAs("oaf")); - - return new Tuple2<>(t.getId(), t); - }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) - .cache(); - - System.out.println("Entities, number of partitions: " + entities.rdd().getNumPartitions()); - System.out.println("Entities schema:"); - entities.printSchema(); - System.out.println("Entities count:" + entities.count()); - - // reads the relationships - readPathRelation(jsc, 
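// Sketch of the "keep the payload as a string" trick described in the GraphJoiner javadoc above:
// only $.id and $.dataInfo.deletedbyinference are read with JsonPath, while the full record is
// never deserialized. The sample record below is hypothetical and heavily trimmed.
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;

public class JsonPathExtractionSketch {

    public static void main(String[] args) {
        String record = "{\"id\":\"50|doi_________::abc\",\"dataInfo\":{\"deletedbyinference\":false},\"title\":[{\"value\":\"...\"}]}";

        DocumentContext json = JsonPath.parse(record);

        String id = json.read("$.id");
        Boolean deleted = json.read("$.dataInfo.deletedbyinference");

        // the original string would be kept as-is in TypedRow.oaf; only the two routing fields are materialized
        System.out.println(id + " deleted=" + deleted);
    }
}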
getInputPath()) - .groupByKey((MapFunction) t -> SortableRelationKey.from(t), Encoders.kryo(SortableRelationKey.class)) - .flatMapGroups((FlatMapGroupsFunction) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.kryo(Relation.class)) - .repartition(3000) - .write() - .partitionBy("source", "target") - .parquet(getOutPath() + "/relations"); - - Dataset rels = getSpark() - .read() - .load(getOutPath() + "/relations") - .map((MapFunction) r -> { - Relation rel = new Relation(); - rel.setSource(r.getAs("source")); - rel.setTarget(r.getAs("target")); - rel.setRelType(r.getAs("relType")); - rel.setSubRelType(r.getAs("subRelType")); - rel.setRelClass(r.getAs("relClass")); - rel.setDataInfo(r.getAs("dataInfo")); - rel.setCollectedFrom(r.getList(r.fieldIndex("collectedFrom"))); - return rel; - }, Encoders.kryo(Relation.class)) - .cache(); - - System.out.println("Relation schema:"); - System.out.println("Relation, number of partitions: " + rels.rdd().getNumPartitions()); - System.out.println("Relation schema:"); - entities.printSchema(); - System.out.println("Relation count:" + rels.count()); - - /* - Dataset> relsByTarget = rels - .map((MapFunction>) r -> new Tuple2<>(r.getTarget(), r), Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); - - - relsByTarget - .joinWith(entities, relsByTarget.col("_1").equalTo(entities.col("_1")), "inner") - .filter((FilterFunction, Tuple2>>) value -> value._2()._2().getDeleted() == false) - .map((MapFunction, Tuple2>, EntityRelEntity>) t -> { - EntityRelEntity e = new EntityRelEntity(); - e.setRelation(t._1()._2()); - e.setTarget(asRelatedEntity(t._2()._2())); - return e; - }, Encoders.kryo(EntityRelEntity.class)) - .repartition(20000) - .write() - .parquet(getOutPath() + "/bySource"); - - Dataset> bySource = getSpark() - .read() - .load(getOutPath() + "/bySource") - .map(new MapFunction() { - @Override - public EntityRelEntity call(Row value) throws Exception { - return null; - } - }, Encoders.kryo(EntityRelEntity.class)) - .map((MapFunction>) e -> new Tuple2<>(e.getRelation().getSource(), e), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))) - - System.out.println("bySource schema"); - bySource.printSchema(); - - - - - Dataset joined = entities - .joinWith(bySource, entities.col("_1").equalTo(bySource.col("_1")), "left") - .map((MapFunction, Tuple2>, EntityRelEntity>) value -> { - EntityRelEntity re = new EntityRelEntity(); - re.setEntity(value._1()._2()); - Optional related = Optional.ofNullable(value._2()).map(Tuple2::_2); - if (related.isPresent()) { - re.setRelation(related.get().getRelation()); - re.setTarget(related.get().getTarget()); - } - return re; - }, Encoders.kryo(EntityRelEntity.class)); - - System.out.println("joined schema"); - joined.printSchema(); - //joined.write().json(getOutPath() + "/joined"); - - final Dataset grouped = joined - .groupByKey((MapFunction) e -> e.getEntity(), Encoders.kryo(TypedRow.class)) - .mapGroups((MapGroupsFunction) (key, values) -> toJoinedEntity(key, values), Encoders.kryo(JoinedEntity.class)); - - System.out.println("grouped schema"); - grouped.printSchema(); - - final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); - grouped - .map((MapFunction) value -> recordFactory.build(value), Encoders.STRING()) - .javaRDD() - .mapToPair((PairFunction, String, String>) t -> new Tuple2<>(t._1(), t._2())) - .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, 
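// Minimal sketch of the MAX_RELS cap applied above: relations are grouped by a key and each
// group is truncated with Guava's Iterators.limit. Grouping directly by Relation.getSource() is
// an assumption made for this example; the code above groups by SortableRelationKey instead.
import com.google.common.collect.Iterators;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

public class RelationCapSketch {

    private static final int MAX_RELS = 100;

    public static Dataset<Relation> capBySource(Dataset<Relation> rels) {
        return rels
            .groupByKey((MapFunction<Relation, String>) Relation::getSource, Encoders.STRING())
            // keep at most MAX_RELS relations for each source id, drop the rest
            .flatMapGroups(
                (FlatMapGroupsFunction<String, Relation, Relation>) (source, values) -> Iterators.limit(values, MAX_RELS),
                Encoders.kryo(Relation.class));
    }
}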
GzipCodec.class); - - -*/ - - return this; - } - - public SparkSession getSpark() { - return spark; - } - - public String getInputPath() { - return inputPath; - } - - public String getOutPath() { - return outPath; - } - - // HELPERS - - private JoinedEntity toJoinedEntity(TypedRow key, Iterator values) { - final ObjectMapper mapper = getObjectMapper(); - final JoinedEntity j = new JoinedEntity(); - j.setType(key.getType()); - j.setEntity(parseOaf(key.getOaf(), key.getType(), mapper)); - final Links links = new Links(); - values.forEachRemaining(rel -> links.add( - new eu.dnetlib.dhp.oa.provision.model.Tuple2( - rel.getRelation(), - rel.getTarget() - ))); - j.setLinks(links); - return j; - } - - private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) { - try { - switch (GraphMappingUtils.EntityType.valueOf(type)) { - case publication: - return mapper.readValue(json, Publication.class); - case dataset: - return mapper.readValue(json, eu.dnetlib.dhp.schema.oaf.Dataset.class); - case otherresearchproduct: - return mapper.readValue(json, OtherResearchProduct.class); - case software: - return mapper.readValue(json, Software.class); - case datasource: - return mapper.readValue(json, Datasource.class); - case organization: - return mapper.readValue(json, Organization.class); - case project: - return mapper.readValue(json, Project.class); - default: - throw new IllegalArgumentException("invalid type: " + type); - } - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - } - - /** - * Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a new line delimited json file, - * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow - * @param sc - * @param inputPath - * @param type - * @return the JavaPairRDD indexed by entity identifier - */ - private Dataset readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { - RDD rdd = sc.textFile(inputPath + "/" + type) - .rdd(); - - return getSpark().createDataset(rdd, Encoders.STRING()) - .map((MapFunction) s -> { - final DocumentContext json = JsonPath.parse(s); - final TypedRow t = new TypedRow(); - t.setId(json.read("$.id")); - t.setDeleted(json.read("$.dataInfo.deletedbyinference")); - t.setType(type); - t.setOaf(s); - return t; - }, Encoders.bean(TypedRow.class)); - } - - /** - * Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file , - * extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow - * @param sc - * @param inputPath - * @return the JavaRDD containing all the relationships - */ - private Dataset readPathRelation(final JavaSparkContext sc, final String inputPath) { - final RDD rdd = sc.textFile(inputPath + "/relation") - .rdd(); - - return getSpark().createDataset(rdd, Encoders.STRING()) - .map((MapFunction) s -> new ObjectMapper().readValue(s, Relation.class), Encoders.bean(Relation.class)); - } - - private ObjectMapper getObjectMapper() { - return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - } - - private void prepareAccumulators(SparkContext sc) { - accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); - accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); - 
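// Sketch of the type dispatch used when the stored JSON payload has to become a concrete
// OafEntity again (see parseOaf above): the TypedRow.type string selects the target class and
// Jackson does the rest. Unknown properties are ignored to stay tolerant to schema evolution.
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;

import java.io.IOException;

public class OafPayloadParserSketch {

    private static final ObjectMapper MAPPER = new ObjectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    public static OafEntity parse(String json, String type) {
        try {
            switch (type) {
                case "publication":          return MAPPER.readValue(json, Publication.class);
                case "dataset":              return MAPPER.readValue(json, eu.dnetlib.dhp.schema.oaf.Dataset.class);
                case "otherresearchproduct": return MAPPER.readValue(json, OtherResearchProduct.class);
                case "software":             return MAPPER.readValue(json, Software.class);
                case "datasource":           return MAPPER.readValue(json, Datasource.class);
                case "organization":         return MAPPER.readValue(json, Organization.class);
                case "project":              return MAPPER.readValue(json, Project.class);
                default: throw new IllegalArgumentException("invalid type: " + type);
            }
        } catch (IOException e) {
            throw new IllegalArgumentException("unable to parse payload of type " + type, e);
        }
    }
}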
accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); - accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); - accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); - accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); - - accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); - accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo")); - accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); - accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); - - accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); - accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); - accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); - accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); - accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); - accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java new file mode 100644 index 0000000000..19599b52c4 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -0,0 +1,132 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked 
objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again + * by E, finally grouped by E.id; + * + * The workflow is organized in different parts aimed to to reduce the complexity of the operation + * 1) PrepareRelationsJob: + * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity + * can be linked at most to 100 other objects + * + * 2) JoinRelationEntityByTargetJob: + * prepare tuples [source entity - relation - target entity] (S - R - T): + * for each entity type E_i + * join (R.target = E_i.id), + * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] + * join (E_i.id = [R - T_i].source), where E_i becomes the source entity S + * + * 3) AdjacencyListBuilderJob: + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * + * 4) XmlConverterJob: + * convert the JoinedEntities as XML records + */ +public class PrepareRelationsJob { + + private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static final int MAX_RELS = 100; + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils.toString( + PrepareRelationsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); + }); + } + + private static void prepareRelationsFromPaths(SparkSession spark, String inputRelationsPath, String outputPath) { + RDD rels = readPathRelation(spark, inputRelationsPath) + .filter((FilterFunction) r -> r.getDataInfo().getDeletedbyinference() == false) + .javaRDD() + .mapToPair((PairFunction>) rel -> new Tuple2<>( + rel.getSource(), + Lists.newArrayList(rel))) + .reduceByKey((v1, v2) -> { + v1.addAll(v2); + v1.sort(SortableRelation::compareTo); + if (v1.size() > MAX_RELS) { + return v1.subList(0, MAX_RELS); + } + return new ArrayList<>(v1.subList(0, MAX_RELS)); + }) + .flatMap(r -> r._2().iterator()) + .rdd(); + + spark.createDataset(rels, Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } + + /** + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file, + * + * @param spark + * @param inputPath + * @return the Dataset containing all the relationships + */ + private static Dataset 
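// Sketch of the per-source cap computed by prepareRelationsFromPaths above: relations are
// grouped by source, sorted by SortableRelation's ordering, and trimmed to MAX_RELS. Note that
// the bounded merge only takes a sublist when the accumulated list actually exceeds the cap.
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

public class RelationPruningSketch {

    private static final int MAX_RELS = 100;

    public static JavaRDD<SortableRelation> pruneBySource(JavaRDD<SortableRelation> rels) {
        return rels
            .mapToPair((PairFunction<SortableRelation, String, List<SortableRelation>>) rel ->
                new Tuple2<>(rel.getSource(), Lists.newArrayList(rel)))
            .reduceByKey((v1, v2) -> {
                v1.addAll(v2);
                v1.sort(SortableRelation::compareTo);
                // keep only the MAX_RELS best-ranked relations for this source
                return v1.size() > MAX_RELS ? new ArrayList<>(v1.subList(0, MAX_RELS)) : v1;
            })
            .flatMap(t -> t._2().iterator());
    }
}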
readPathRelation(SparkSession spark, final String inputPath) { + return spark.read() + .textFile(inputPath) + .map((MapFunction) s -> OBJECT_MAPPER.readValue(s, SortableRelation.class), + Encoders.bean(SortableRelation.class)); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java index 975ac75485..eae8cf1a14 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.provision; import com.lucidworks.spark.util.SolrSupport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; @@ -18,6 +19,8 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; @@ -28,14 +31,20 @@ import java.io.StringReader; import java.io.StringWriter; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; public class SparkXmlIndexingJob { - private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class); + private static final Logger log = LoggerFactory.getLogger(SparkXmlIndexingJob.class); private static final Integer DEFAULT_BATCH_SIZE = 1000; private static final String LAYOUT = "index"; + private static final String INTERPRETATION = "openaire"; + private static final String SEPARATOR = "-"; + public static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'"; public static void main(String[] args) throws Exception { @@ -45,48 +54,56 @@ public class SparkXmlIndexingJob { "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); parser.parseArgument(args); - final String inputPath = parser.get("sourcePath"); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + final String format = parser.get("format"); + log.info("format: {}", format); + final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? 
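// Sketch of the per-record XSLT step performed by toIndexRecord above. The job relies on the
// project's SaxonTransformerFactory; the JDK TransformerFactory below is a stand-in so the
// sketch stays self-contained, and the identity stylesheet and sample record are placeholders.
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.StringReader;
import java.io.StringWriter;

public class IndexRecordTransformSketch {

    public static void main(String[] args) throws Exception {
        String xslt =
            "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">" +
            "  <xsl:template match=\"@*|node()\">" +
            "    <xsl:copy><xsl:apply-templates select=\"@*|node()\"/></xsl:copy>" +
            "  </xsl:template>" +
            "</xsl:stylesheet>";

        String record = "<record><result><title>sample</title></result></record>";

        Transformer tr = TransformerFactory.newInstance()
                .newTransformer(new StreamSource(new StringReader(xslt)));

        StreamResult res = new StreamResult(new StringWriter());
        tr.transform(new StreamSource(new StringReader(record)), res);

        // the transformed record is what gets parsed into a SolrInputDocument downstream
        System.out.println(res.getWriter().toString());
    }
}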
Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE; + log.info("batchSize: {}", batchSize); final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); final String fields = getLayoutSource(isLookup, format); + log.info("fields: {}", fields); + final String xslt = getLayoutTransformer(isLookup); final String dsId = getDsId(format, isLookup); + log.info("dsId: {}", dsId); + final String zkHost = getZkHost(isLookup); + log.info("zkHost: {}", zkHost); + final String version = getRecordDatestamp(); final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); + log.info("indexRecordTransformer {}", indexRecordXslt); - log.info("indexRecordTransformer: " + indexRecordXslt); + final SparkConf conf = new SparkConf(); - final String master = parser.get("master"); - final SparkConf conf = new SparkConf() - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + runWithSparkSession(conf, isSparkSessionManaged, + spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - try(SparkSession spark = getSession(conf, master)) { + RDD docs = sc.sequenceFile(inputPath, Text.class, Text.class) + .map(t -> t._2().toString()) + .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) + .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) + .rdd(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - RDD docs = sc.sequenceFile(inputPath, Text.class, Text.class) - .map(t -> t._2().toString()) - .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) - .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) - .rdd(); - - SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs); - } - } - - private static SparkSession getSession(SparkConf conf, String master) { - return SparkSession - .builder() - .config(conf) - .appName(SparkXmlIndexingJob.class.getSimpleName()) - .master(master) - .getOrCreate(); + final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; + SolrSupport.indexDocs(zkHost, collection, batchSize, docs); + }); } private static String toIndexRecord(Transformer tr, final String record) { @@ -95,7 +112,7 @@ public class SparkXmlIndexingJob { tr.transform(new StreamSource(new StringReader(record)), res); return res.getWriter().toString(); } catch (Throwable e) { - System.out.println("XPathException on record:\n" + record); + log.error("XPathException on record: \n {}", record, e); throw new IllegalArgumentException(e); } } @@ -127,7 +144,7 @@ public class SparkXmlIndexingJob { * @return the parsed date */ public static String getRecordDatestamp() { - return new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss'Z'").format(new Date()); + return new SimpleDateFormat(DATE_FORMAT).format(new Date()); } /** diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java deleted file mode 100644 index e4124e52fb..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlRecordBuilderJob_v2.java +++ /dev/null @@ -1,81 +0,0 @@ -package eu.dnetlib.dhp.oa.provision; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import 
eu.dnetlib.dhp.schema.oaf.*; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SparkSession; - -public class SparkXmlRecordBuilderJob_v2 { - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkXmlRecordBuilderJob_v2.class.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); - parser.parseArgument(args); - - try(SparkSession spark = getSession(parser)) { - - final String inputPath = parser.get("sourcePath"); - final String outputPath = parser.get("outputPath"); - final String isLookupUrl = parser.get("isLookupUrl"); - final String otherDsTypeId = parser.get("otherDsTypeId"); - - - new GraphJoiner_v2(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) - .adjacencyLists(); - } - } - - private static SparkSession getSession(ArgumentApplicationParser parser) { - final SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.set("spark.sql.shuffle.partitions", parser.get("sparkSqlShufflePartitions")); - conf.registerKryoClasses(new Class[]{ - Author.class, - Context.class, - Country.class, - DataInfo.class, - eu.dnetlib.dhp.schema.oaf.Dataset.class, - Datasource.class, - ExternalReference.class, - ExtraInfo.class, - Field.class, - GeoLocation.class, - Instance.class, - Journal.class, - KeyValue.class, - Oaf.class, - OafEntity.class, - OAIProvenance.class, - Organization.class, - OriginDescription.class, - OtherResearchProduct.class, - Project.class, - Publication.class, - Qualifier.class, - Relation.class, - Result.class, - Software.class, - StructuredProperty.class, - - TypedRow.class, - EntityRelEntity.class, - JoinedEntity.class, - SortableRelationKey.class, - Tuple2.class, - Links.class, - RelatedEntity.class - }); - return SparkSession - .builder() - .config(conf) - .appName(SparkXmlRecordBuilderJob_v2.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java new file mode 100644 index 0000000000..74a36c580b --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -0,0 +1,149 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.google.common.collect.Maps; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; +import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import 
org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.Map; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again + * by E, finally grouped by E.id; + * + * The workflow is organized in different parts aimed to to reduce the complexity of the operation + * 1) PrepareRelationsJob: + * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity + * can be linked at most to 100 other objects + * + * 2) JoinRelationEntityByTargetJob: + * prepare tuples [source entity - relation - target entity] (S - R - T): + * for each entity type E_i + * join (R.target = E_i.id), + * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] + * join (E_i.id = [R - T_i].source), where E_i becomes the source entity S + * + * 3) AdjacencyListBuilderJob: + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * + * 4) XmlConverterJob: + * convert the JoinedEntities as XML records + */ +public class XmlConverterJob { + + private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); + + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString( + XmlConverterJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + String otherDsTypeId = parser.get("otherDsTypeId"); + log.info("otherDsTypeId: {}", otherDsTypeId); + + SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + convertToXml(spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId); + }); + + } + + private static void convertToXml(SparkSession spark, String inputPath, String outputPath, ContextMapper contextMapper, String otherDsTypeId) { + + final XmlRecordFactory recordFactory = new XmlRecordFactory(prepareAccumulators(spark.sparkContext()), contextMapper, false, schemaLocation, otherDsTypeId); + + spark.read() + .load(inputPath) + .as(Encoders.bean(JoinedEntity.class)) + .map((MapFunction>) je -> new Tuple2<>( + je.getEntity().getId(), + recordFactory.build(je) + ), 
Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .javaRDD() + .mapToPair((PairFunction, String, String>) t -> t) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + private static Map prepareAccumulators(SparkContext sc) { + Map accumulators = Maps.newHashMap(); + accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + + accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + + accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); + + return accumulators; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java index ddeec140b7..35dfa41d38 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java @@ -1,15 +1,26 @@ package eu.dnetlib.dhp.oa.provision.model; -import eu.dnetlib.dhp.schema.oaf.Relation; - import java.io.Serializable; public class EntityRelEntity implements Serializable { private TypedRow entity; - 
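// Sketch of the final write performed by convertToXml above: (id, xml) pairs stored as a
// gzip-compressed SequenceFile of Text/Text. Mapping explicitly to Text is an assumption made
// here to keep the declared key/value classes and the RDD element types aligned.
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import scala.Tuple2;

public class XmlSequenceFileWriterSketch {

    public static void write(Dataset<Tuple2<String, String>> idAndXml, String outputPath) {
        JavaPairRDD<Text, Text> pairs = idAndXml
            .javaRDD()
            .mapToPair((PairFunction<Tuple2<String, String>, Text, Text>) t ->
                new Tuple2<>(new Text(t._1()), new Text(t._2())));

        pairs.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
    }
}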
private Relation relation; + private SortableRelation relation; private RelatedEntity target; + public EntityRelEntity() { + } + + public EntityRelEntity(SortableRelation relation, RelatedEntity target) { + this(null, relation, target); + } + + public EntityRelEntity(TypedRow entity, SortableRelation relation, RelatedEntity target) { + this.entity = entity; + this.relation = relation; + this.target = target; + } + public TypedRow getEntity() { return entity; } @@ -18,11 +29,11 @@ public class EntityRelEntity implements Serializable { this.entity = entity; } - public Relation getRelation() { + public SortableRelation getRelation() { return relation; } - public void setRelation(Relation relation) { + public void setRelation(SortableRelation relation) { this.relation = relation; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index 815863c678..4dd4348040 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -1,22 +1,23 @@ package eu.dnetlib.dhp.oa.provision.model; +import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; import eu.dnetlib.dhp.schema.oaf.OafEntity; import java.io.Serializable; public class JoinedEntity implements Serializable { - private String type; + private GraphMappingUtils.EntityType type; private OafEntity entity; private Links links; - public String getType() { + public GraphMappingUtils.EntityType getType() { return type; } - public void setType(String type) { + public void setType(GraphMappingUtils.EntityType type) { this.type = type; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java index 0cb4617ec0..4ea1948766 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java @@ -1,6 +1,6 @@ package eu.dnetlib.dhp.oa.provision.model; -import java.util.ArrayList; +import java.util.HashSet; -public class Links extends ArrayList { +public class Links extends HashSet { } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java new file mode 100644 index 0000000000..430779c727 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -0,0 +1,34 @@ +package eu.dnetlib.dhp.oa.provision.model; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; +import eu.dnetlib.dhp.schema.oaf.Relation; + +import java.util.Map; + +public class SortableRelation extends Relation implements Comparable { + + private final static Map weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("publicationDataset", 2); + weights.put("relationship", 3); + weights.put("similarity", 4); + weights.put("affiliation", 5); + + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } + + @Override + public int 
compareTo(Relation o) { + return ComparisonChain.start() + .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) + .compare(getSource(), o.getSource()) + .compare(getTarget(), o.getTarget()) + .result(); + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java index db639f1132..f1e2c652c5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java @@ -2,7 +2,10 @@ package eu.dnetlib.dhp.oa.provision.model; import eu.dnetlib.dhp.schema.oaf.Relation; -public class Tuple2 { +import java.io.Serializable; +import java.util.Objects; + +public class Tuple2 implements Serializable { private Relation relation; @@ -28,4 +31,18 @@ public class Tuple2 { public void setRelatedEntity(RelatedEntity relatedEntity) { this.relatedEntity = relatedEntity; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Tuple2 t2 = (Tuple2) o; + return getRelation().equals(t2.getRelation()); + } + + @Override + public int hashCode() { + return Objects.hash(getRelation().hashCode()); + } + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index 27b42e69d5..8418db8e69 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -1,30 +1,47 @@ package eu.dnetlib.dhp.oa.provision.utils; -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Maps; import com.google.common.collect.Sets; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import eu.dnetlib.dhp.schema.oaf.*; -import net.minidev.json.JSONArray; -import org.apache.commons.lang3.StringUtils; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; -import static org.apache.commons.lang3.StringUtils.*; +import static org.apache.commons.lang3.StringUtils.substringAfter; public class GraphMappingUtils { public static final String SEPARATOR = "_"; + public final static Map entityTypes = Maps.newHashMap(); + + static { + entityTypes.put(EntityType.datasource, Datasource.class); + entityTypes.put(EntityType.organization, Organization.class); + entityTypes.put(EntityType.project, Project.class); + entityTypes.put(EntityType.dataset, Dataset.class); + entityTypes.put(EntityType.otherresearchproduct, OtherResearchProduct.class); + entityTypes.put(EntityType.software, Software.class); + entityTypes.put(EntityType.publication, Publication.class); + } + public enum EntityType { - publication, dataset, otherresearchproduct, software, datasource, organization, project + publication, dataset, otherresearchproduct, software, 
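// Small usage sketch for the SortableRelation ordering defined above: relations with the
// "outcome" subRelType sort before "participation", so they survive the MAX_RELS cut first.
// The ids below are made up, and the sketch assumes every subRelType used is present in the
// weights map (otherwise the comparator would hit a null weight).
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortableRelationOrderingSketch {

    public static void main(String[] args) {
        SortableRelation participation = new SortableRelation();
        participation.setSubRelType("participation");
        participation.setSource("40|project_____::0001");
        participation.setTarget("20|org________::0001");

        SortableRelation outcome = new SortableRelation();
        outcome.setSubRelType("outcome");
        outcome.setSource("50|result______::0001");
        outcome.setTarget("40|project_____::0001");

        List<SortableRelation> rels = new ArrayList<>();
        rels.add(participation);
        rels.add(outcome);

        Collections.sort(rels);

        // prints "outcome" first: lower weight wins
        rels.forEach(r -> System.out.println(r.getSubRelType()));
    }
}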
datasource, organization, project; + + public static EntityType fromClass(Class clazz) { + switch (clazz.getName()) { + case "eu.dnetlib.dhp.schema.oaf.Publication" : return publication; + case "eu.dnetlib.dhp.schema.oaf.Dataset" : return dataset; + case "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct" : return otherresearchproduct; + case "eu.dnetlib.dhp.schema.oaf.Software" : return software; + case "eu.dnetlib.dhp.schema.oaf.Datasource" : return datasource; + case "eu.dnetlib.dhp.schema.oaf.Organization" : return organization; + case "eu.dnetlib.dhp.schema.oaf.Project" : return project; + default: throw new IllegalArgumentException("Unknown OafEntity class: " + clazz.getName()); + } + } } public enum MainEntityType { @@ -33,8 +50,6 @@ public class GraphMappingUtils { public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); - public static Set instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation"); - private static final String schemeTemplate = "dnet:%s_%s_relations"; private static Map entityMapping = Maps.newHashMap(); @@ -49,6 +64,38 @@ public class GraphMappingUtils { entityMapping.put(EntityType.project, MainEntityType.project); } + public static Class[] getKryoClasses() { + return new Class[]{ + Author.class, + Context.class, + Country.class, + DataInfo.class, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + Datasource.class, + ExternalReference.class, + ExtraInfo.class, + Field.class, + GeoLocation.class, + Instance.class, + Journal.class, + KeyValue.class, + Oaf.class, + OafEntity.class, + OAIProvenance.class, + Organization.class, + OriginDescription.class, + OtherResearchProduct.class, + Project.class, + Publication.class, + Qualifier.class, + Relation.class, + SortableRelation.class, //SUPPORT + Result.class, + Software.class, + StructuredProperty.class + }; + } + public static String getScheme(final String sourceType, final String targetType) { return String.format(schemeTemplate, entityMapping.get(EntityType.valueOf(sourceType)).name(), @@ -63,152 +110,81 @@ public class GraphMappingUtils { return MainEntityType.result.name().equals(getMainType(type)); } - public static RelatedEntity asRelatedEntity(TypedRow e) { + public static RelatedEntity asRelatedEntity(E entity, Class clazz) { - final DocumentContext j = JsonPath.parse(e.getOaf()); final RelatedEntity re = new RelatedEntity(); - re.setId(j.read("$.id")); - re.setType(e.getType()); + re.setId(entity.getId()); + re.setType(clazz.getName()); - switch (EntityType.valueOf(e.getType())) { + re.setPid(entity.getPid()); + re.setCollectedfrom(entity.getCollectedfrom()); + + switch (GraphMappingUtils.EntityType.fromClass(clazz)) { case publication: case dataset: case otherresearchproduct: case software: - mapTitle(j, re); - re.setDateofacceptance(j.read("$.dateofacceptance.value")); - re.setPublisher(j.read("$.publisher.value")); - JSONArray pids = j.read("$.pid"); - re.setPid(pids.stream() - .map(p -> asStructuredProperty((LinkedHashMap) p)) - .collect(Collectors.toList())); + Result r = (Result) entity; - re.setResulttype(asQualifier(j.read("$.resulttype"))); + if (r.getTitle() == null && !r.getTitle().isEmpty()) { + re.setTitle(r.getTitle().stream().findFirst().get()); + } - JSONArray collfrom = j.read("$.collectedfrom"); - re.setCollectedfrom(collfrom.stream() - .map(c -> asKV((LinkedHashMap) c)) - .collect(Collectors.toList())); - - // will throw exception when the instance is not found - JSONArray instances = 
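// Usage sketch for getKryoClasses() above: the jobs switch the session to Kryo and register the
// whole provision model up front, so the kryo/tuple encoders used in the joins stay compact.
// The app name and local master are placeholders for this example.
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;

public class KryoSessionSketch {

    public static SparkSession newSession() {
        SparkConf conf = new SparkConf();
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.registerKryoClasses(getKryoClasses());

        return SparkSession.builder()
                .appName("provision-kryo-sketch")
                .master("local[*]")
                .config(conf)
                .getOrCreate();
    }
}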
j.read("$.instance"); - re.setInstances(instances.stream() - .map(i -> { - final LinkedHashMap p = (LinkedHashMap) i; - final Field license = new Field(); - license.setValue((String) ((LinkedHashMap) p.get("license")).get("value")); - final Instance instance = new Instance(); - instance.setLicense(license); - instance.setAccessright(asQualifier((LinkedHashMap) p.get("accessright"))); - instance.setInstancetype(asQualifier((LinkedHashMap) p.get("instancetype"))); - instance.setHostedby(asKV((LinkedHashMap) p.get("hostedby"))); - //TODO mapping of distributionlocation - instance.setCollectedfrom(asKV((LinkedHashMap) p.get("collectedfrom"))); - - Field dateofacceptance = new Field(); - dateofacceptance.setValue((String) ((LinkedHashMap) p.get("dateofacceptance")).get("value")); - instance.setDateofacceptance(dateofacceptance); - return instance; - }).collect(Collectors.toList())); + re.setDateofacceptance(getValue(r.getDateofacceptance())); + re.setPublisher(getValue(r.getPublisher())); + re.setResulttype(re.getResulttype()); + re.setInstances(re.getInstances()); //TODO still to be mapped //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); break; case datasource: - re.setOfficialname(j.read("$.officialname.value")); - re.setWebsiteurl(j.read("$.websiteurl.value")); - re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); - re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); + Datasource d = (Datasource) entity; + + re.setOfficialname(getValue(d.getOfficialname())); + re.setWebsiteurl(getValue(d.getWebsiteurl())); + re.setDatasourcetype(d.getDatasourcetype()); + re.setOpenairecompatibility(d.getOpenairecompatibility()); break; case organization: - re.setLegalname(j.read("$.legalname.value")); - re.setLegalshortname(j.read("$.legalshortname.value")); - re.setCountry(asQualifier(j.read("$.country"))); - re.setWebsiteurl(j.read("$.websiteurl.value")); + Organization o = (Organization) entity; + + re.setLegalname(getValue(o.getLegalname())); + re.setLegalshortname(getValue(o.getLegalshortname())); + re.setCountry(o.getCountry()); + re.setWebsiteurl(getValue(o.getWebsiteurl())); break; case project: - re.setProjectTitle(j.read("$.title.value")); - re.setCode(j.read("$.code.value")); - re.setAcronym(j.read("$.acronym.value")); - re.setContracttype(asQualifier(j.read("$.contracttype"))); + Project p = (Project) entity; - JSONArray f = j.read("$.fundingtree"); + re.setProjectTitle(getValue(p.getTitle())); + re.setCode(getValue(p.getCode())); + re.setAcronym(getValue(p.getAcronym())); + re.setContracttype(p.getContracttype()); + + List> f = p.getFundingtree(); if (!f.isEmpty()) { re.setFundingtree(f.stream() - .map(s -> ((LinkedHashMap) s).get("value")) + .map(s -> s.getValue()) .collect(Collectors.toList())); } - break; } - return re; } - - private static KeyValue asKV(LinkedHashMap j) { - final KeyValue kv = new KeyValue(); - kv.setKey((String) j.get("key")); - kv.setValue((String) j.get("value")); - return kv; + private static String getValue(Field field) { + return getFieldValueWithDefault(field, ""); } - private static void mapTitle(DocumentContext j, RelatedEntity re) { - final JSONArray a = j.read("$.title"); - if (!a.isEmpty()) { - final StructuredProperty sp = asStructuredProperty((LinkedHashMap) a.get(0)); - if (StringUtils.isNotBlank(sp.getValue())) { - re.setTitle(sp); - } - } - } - - private static StructuredProperty asStructuredProperty(LinkedHashMap j) { - final StructuredProperty sp = new StructuredProperty(); - final String value = (String) 
j.get("value"); - if (StringUtils.isNotBlank(value)) { - sp.setValue((String) j.get("value")); - sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); - } - return sp; - } - - public static Qualifier asQualifier(LinkedHashMap j) { - final Qualifier q = new Qualifier(); - - final String classid = j.get("classid"); - if (StringUtils.isNotBlank(classid)) { - q.setClassid(classid); - } - - final String classname = j.get("classname"); - if (StringUtils.isNotBlank(classname)) { - q.setClassname(classname); - } - - final String schemeid = j.get("schemeid"); - if (StringUtils.isNotBlank(schemeid)) { - q.setSchemeid(schemeid); - } - - final String schemename = j.get("schemename"); - if (StringUtils.isNotBlank(schemename)) { - q.setSchemename(schemename); - } - return q; - } - - public static String serialize(final Object o) { - try { - return new ObjectMapper() - .setSerializationInclusion(JsonInclude.Include.NON_NULL) - .writeValueAsString(o); - } catch (JsonProcessingException e) { - throw new IllegalArgumentException("unable to serialize: " + o.toString(), e); - } + private static T getFieldValueWithDefault(Field f, T defaultValue) { + return Optional.ofNullable(f) + .filter(Objects::nonNull) + .map(x -> x.getValue()) + .orElse(defaultValue); } public static String removePrefix(final String s) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json index bbac579feb..e57df9b09e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json @@ -1,8 +1,14 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true}, - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}, - {"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}, - {"paramName":"sp", "paramLongName":"sparkSqlShufflePartitions", "paramDescription": "Configures the number of partitions to use when shuffling data for joins or aggregations", "paramRequired": true} + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path of the sequence file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json new file mode 100644 index 0000000000..043129c9f5 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json @@ -0,0 
+1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "irp", + "paramLongName": "inputRelationsPath", + "paramDescription": "path to input relations prepare", + "paramRequired": true + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "root output location for prepared relations", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json new file mode 100644 index 0000000000..0090716d69 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "irp", + "paramLongName": "inputRelationsPath", + "paramDescription": "path to input relations from the graph", + "paramRequired": true + }, + { + "paramName": "iep", + "paramLongName": "inputEntityPath", + "paramDescription": "path to input entity from the graph", + "paramRequired": true + }, + { + "paramName": "clazz", + "paramLongName": "graphTableClassName", + "paramDescription": "class name associated to the input entity path", + "paramRequired": true + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "root output location for prepared relations", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json new file mode 100644 index 0000000000..cb7949d492 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "irp", + "paramLongName": "inputRelatedEntitiesPath", + "paramDescription": "path to input relations from the graph", + "paramRequired": true + }, + { + "paramName": "iep", + "paramLongName": "inputGraphPath", + "paramDescription": "root graph path", + "paramRequired": true + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "root output location for prepared relations", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json index 0d45e9e29f..146cc9943b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json @@ -1,7 +1,7 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": 
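// Sketch of how these parameter descriptors are consumed: each job loads its JSON descriptor
// from the classpath, hands it to ArgumentApplicationParser together with the command line, and
// reads values back by paramLongName. The descriptor chosen and the paths passed as arguments
// below are placeholders for the example.
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;

import java.nio.charset.StandardCharsets;

public class JobArgumentsSketch {

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils.toString(
                JobArgumentsSketch.class.getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json"),
                StandardCharsets.UTF_8);

        ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(new String[]{
                "--inputRelationsPath", "/tmp/graph/relation",
                "--outputPath", "/tmp/working_dir/relation"
        });

        System.out.println(parser.get("inputRelationsPath"));
        System.out.println(parser.get("outputPath"));
    }
}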
true}, {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true}, + {"paramName":"i", "paramLongName":"inputPath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true}, {"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true}, {"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false} ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json new file mode 100644 index 0000000000..32720514e1 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path of the sequence file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ilu", + "paramLongName": "isLookupUrl", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName": "odt", + "paramLongName": "otherDsTypeId", + "paramDescription": "list of datasource types to populate field datasourcetypeui", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 194cd43c81..5168215099 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -1,6 +1,11 @@ + + inputGraphRootPath + root location of input materialized graph + + sparkDriverMemoryForJoining memory for driver process @@ -64,7 +69,7 @@ - ${wf:conf('reuseRecords') eq false} + ${wf:conf('reuseRecords') eq false} ${wf:conf('reuseRecords') eq true} @@ -74,16 +79,12 @@ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + - - - - yarn cluster - build_adjacency_lists - eu.dnetlib.dhp.oa.provision.SparkXmlRecordBuilderJob_v2 + PrepareRelations + eu.dnetlib.dhp.oa.provision.PrepareRelationsJob dhp-graph-provision-${projectVersion}.jar --executor-cores=${sparkExecutorCoresForJoining} @@ -94,12 +95,135 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - -mt yarn - -is ${isLookupUrl} - -t ${otherDsTypeId} - -s${sourcePath} - -o${outputPath} - -sp${sparkSqlShufflePartitions} + --inputRelationsPath${inputGraphRootPath}/relation + --outputPath${workingDir}/relation + + + + + + + + yarn + cluster + Join[relation.target = publication.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + 
--driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/join_partial + + + + + + + + yarn + cluster + Join[relation.target = dataset.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/join_partial + + + + + + + + yarn + cluster + Join[relation.target = dataset.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelatedEntitiesPath${workingDir}/join_partial + --inputEntityPath${inputGraphRootPath} + --outputPath${workingDir}/join_entities + + + + + + + + yarn + cluster + build_adjacency_lists + eu.dnetlib.dhp.oa.provision.AdjacencyListBuilderJob + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputPath ${${workingDir}/join_entities + --outputPath${workingDir}/joined + + + + + + + + yarn + cluster + build_adjacency_lists + eu.dnetlib.dhp.oa.provision.XmlConverterJob + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputPath${${workingDir}/joined + 
--outputPath${workingDir}/xml + --isLookupUrl${isLookupUrl} + --otherDsTypeId${otherDsTypeId} @@ -122,9 +246,8 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - -mt yarn - -is ${isLookupUrl} - --sourcePath${outputPath}/xml + --isLookupUrl ${isLookupUrl} + --inputPath${workingDir}/xml --format${format} --batchSize${batchSize} From eb2f5f31983113a96d701f24b4e806b08bc67ee0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sat, 4 Apr 2020 17:41:31 +0200 Subject: [PATCH 07/13] dataset based provision WIP --- .../CreateRelatedEntitiesJob_phase2.java | 2 +- .../dhp/oa/provision/oozie_app/workflow.xml | 158 +++++++++++++++++- 2 files changed, 151 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 6c7f1efd74..85a9113f21 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -119,7 +119,7 @@ public class CreateRelatedEntitiesJob_phase2 { return re; }, Encoders.bean(EntityRelEntity.class)) .write() - .mode(SaveMode.Append) + .mode(SaveMode.Overwrite) .parquet(outputPath); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 5168215099..33b9291c47 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -98,10 +98,20 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation - + + + + + + + + + + + yarn @@ -124,7 +134,7 @@ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${workingDir}/join_partial - + @@ -150,15 +160,147 @@ --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${workingDir}/join_partial - + + + + yarn + cluster + Join[relation.target = otherresearchproduct.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/join_partial + + + + + + + + yarn + cluster + Join[relation.target = software.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf 
spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/join_partial + + + + + + + + yarn + cluster + Join[relation.target = datasource.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/datasource + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource + --outputPath${workingDir}/join_partial + + + + + + + + yarn + cluster + Join[relation.target = organization.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/organization + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization + --outputPath${workingDir}/join_partial + + + + + + + + yarn + cluster + Join[relation.target = project.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputRelationsPath${workingDir}/relations + --inputEntityPath${inputGraphRootPath}/project + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project + --outputPath${workingDir}/join_partial + + + + + + + yarn cluster - Join[relation.target = dataset.id] + Join[entities.id = relatedEntity.source] eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 dhp-graph-provision-${projectVersion}.jar @@ -171,8 +313,8 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --inputRelatedEntitiesPath${workingDir}/join_partial --inputEntityPath${inputGraphRootPath} + --inputRelatedEntitiesPath${workingDir}/join_partial --outputPath${workingDir}/join_entities @@ 
-196,7 +338,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --inputPath ${${workingDir}/join_entities + --inputPath ${workingDir}/join_entities --outputPath${workingDir}/joined @@ -220,7 +362,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --inputPath${${workingDir}/joined + --inputPath${workingDir}/joined --outputPath${workingDir}/xml --isLookupUrl${isLookupUrl} --otherDsTypeId${otherDsTypeId} @@ -246,8 +388,8 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --isLookupUrl ${isLookupUrl} --inputPath${workingDir}/xml + --isLookupUrl ${isLookupUrl} --format${format} --batchSize${batchSize} From c8f4b95464197958dab5aad0693b36c7360b1623 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 6 Apr 2020 08:59:58 +0200 Subject: [PATCH 08/13] dataset based provision WIP --- .../oa/provision/AdjacencyListBuilderJob.java | 91 +++++------------- .../CreateRelatedEntitiesJob_phase1.java | 8 +- .../CreateRelatedEntitiesJob_phase2.java | 94 +++++++++++++------ .../dhp/oa/provision/PrepareRelationsJob.java | 46 ++++----- .../dhp/oa/provision/model/JoinedEntity.java | 18 +--- .../dnetlib/dhp/oa/provision/model/Links.java | 6 +- .../oa/provision/model/SortableRelation.java | 3 +- .../dhp/oa/provision/model/Tuple2.java | 3 + .../oa/provision/utils/GraphMappingUtils.java | 6 +- .../oa/provision/utils/XmlRecordFactory.java | 66 +++++++++---- .../input_params_prepare_relations.json | 6 ++ ...input_params_related_entities_pahase2.json | 8 +- .../dhp/oa/provision/oozie_app/workflow.xml | 49 ++++++---- 13 files changed, 224 insertions(+), 180 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java index dcb3ac171d..291a44858e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -9,19 +9,22 @@ import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.api.java.function.*; import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.expressions.Encode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.Tuple2; import java.io.IOException; +import java.util.Iterator; import java.util.Optional; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; @@ -57,6 +60,7 @@ public class AdjacencyListBuilderJob { private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static final int MAX_LINKS 
= 100; public static void main(String[] args) throws Exception { @@ -92,72 +96,27 @@ public class AdjacencyListBuilderJob { private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) { - RDD joined = spark.read() + log.info("Reading joined entities from: {}", inputPath); + spark.read() .load(inputPath) - .as(Encoders.kryo(EntityRelEntity.class)) - .javaRDD() - .map(e -> getJoinedEntity(e)) - .mapToPair(e -> new Tuple2<>(e.getEntity().getId(), e)) - .reduceByKey((j1, j2) -> getJoinedEntity(j1, j2)) - .map(Tuple2::_2) - .rdd(); - - spark.createDataset(joined, Encoders.bean(JoinedEntity.class)) + .as(Encoders.bean(EntityRelEntity.class)) + .groupByKey((MapFunction) value -> value.getEntity().getId(), Encoders.STRING()) + .mapGroups((MapGroupsFunction) (key, values) -> { + JoinedEntity j = new JoinedEntity(); + Links links = new Links(); + while (values.hasNext() && links.size() < MAX_LINKS) { + EntityRelEntity curr = values.next(); + if (j.getEntity() == null) { + j.setEntity(curr.getEntity()); + } + links.add(new Tuple2(curr.getRelation(), curr.getTarget())); + } + j.setLinks(links); + return j; + }, Encoders.bean(JoinedEntity.class)) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); - - } - - private static JoinedEntity getJoinedEntity(JoinedEntity j1, JoinedEntity j2) { - JoinedEntity je = new JoinedEntity(); - je.setEntity(je.getEntity()); - je.setType(j1.getType()); - - Links links = new Links(); - links.addAll(j1.getLinks()); - links.addAll(j2.getLinks()); - - return je; - } - - private static JoinedEntity getJoinedEntity(EntityRelEntity e) { - JoinedEntity j = new JoinedEntity(); - j.setEntity(toOafEntity(e.getEntity())); - j.setType(EntityType.valueOf(e.getEntity().getType())); - Links links = new Links(); - links.add(new eu.dnetlib.dhp.oa.provision.model.Tuple2(e.getRelation(), e.getTarget())); - j.setLinks(links); - return j; - } - - private static OafEntity toOafEntity(TypedRow typedRow) { - return parseOaf(typedRow.getOaf(), typedRow.getType()); - } - - private static OafEntity parseOaf(final String json, final String type) { - try { - switch (GraphMappingUtils.EntityType.valueOf(type)) { - case publication: - return OBJECT_MAPPER.readValue(json, Publication.class); - case dataset: - return OBJECT_MAPPER.readValue(json, Dataset.class); - case otherresearchproduct: - return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); - case software: - return OBJECT_MAPPER.readValue(json, Software.class); - case datasource: - return OBJECT_MAPPER.readValue(json, Datasource.class); - case organization: - return OBJECT_MAPPER.readValue(json, Organization.class); - case project: - return OBJECT_MAPPER.readValue(json, Project.class); - default: - throw new IllegalArgumentException("invalid type: " + type); - } - } catch (IOException e) { - throw new IllegalArgumentException(e); - } } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 0b153f8269..0e3a5e4727 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -42,17 +42,15 @@ import static 
eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; * for each entity type E_i * join (R.target = E_i.id), * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] - * save the tuples [R - T_i] in append mode * * 3) CreateRelatedEntitiesJob_phase2: * prepare tuples [source entity - relation - target entity] (S - R - T): * create the union of the each entity type, hash by id (S) * for each [R - T_i] produced in phase1 * join S.id = [R - T_i].source to produce (S_i - R - T_i) - * save in append mode * * 4) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity * * 5) XmlConverterJob: * convert the JoinedEntities as XML records @@ -121,8 +119,8 @@ public class CreateRelatedEntitiesJob_phase1 { t -> new EntityRelEntity(t._1()._2(), GraphMappingUtils.asRelatedEntity(t._2()._2(), entityClazz)), Encoders.bean(EntityRelEntity.class)) .write() - .mode(SaveMode.Append) - .parquet(outputPath); + .mode(SaveMode.Overwrite) + .parquet(outputPath + "/" + EntityType.fromClass(entityClazz)); } private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 85a9113f21..645883f6fb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.oa.provision; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -8,7 +9,10 @@ import eu.dnetlib.dhp.oa.provision.model.TypedRow; import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -17,7 +21,10 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; +import scala.collection.JavaConverters; +import scala.collection.Seq; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.function.Function; @@ -68,7 +75,7 @@ public class CreateRelatedEntitiesJob_phase2 { String jsonConfiguration = IOUtils.toString( PrepareRelationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); @@ -81,14 +88,14 @@ public class CreateRelatedEntitiesJob_phase2 { String inputRelatedEntitiesPath = 
parser.get("inputRelatedEntitiesPath"); log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath); - String inputGraphPath = parser.get("inputGraphPath"); - log.info("inputGraphPath: {}", inputGraphPath); + String inputGraphRootPath = parser.get("inputGraphRootPath"); + log.info("inputGraphRootPath: {}", inputGraphRootPath); String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + int numPartitions = Integer.parseInt(parser.get("numPartitions")); + log.info("numPartitions: {}", numPartitions); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); @@ -97,14 +104,14 @@ public class CreateRelatedEntitiesJob_phase2 { runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - joinAllEntities(spark, inputRelatedEntitiesPath, inputGraphPath, outputPath); + joinAllEntities(spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions); }); } - private static void joinAllEntities(SparkSession spark, String inputRelatedEntitiesPath, String inputGraphPath, String outputPath) { + private static void joinAllEntities(SparkSession spark, String inputRelatedEntitiesPath, String inputGraphRootPath, String outputPath, int numPartitions) { + Dataset> entities = readAllEntities(spark, inputGraphRootPath, numPartitions); Dataset> relsBySource = readRelatedEntities(spark, inputRelatedEntitiesPath); - Dataset> entities = readAllEntities(spark, inputGraphPath); entities .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") @@ -118,51 +125,76 @@ public class CreateRelatedEntitiesJob_phase2 { } return re; }, Encoders.bean(EntityRelEntity.class)) + .repartition(numPartitions) + .filter((FilterFunction) value -> value.getEntity() != null && StringUtils.isNotBlank(value.getEntity().getId())) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); } - private static Dataset> readAllEntities(SparkSession spark, String inputGraphPath) { - return GraphMappingUtils.entityTypes.entrySet() - .stream() - .map((Function, Dataset>) - e -> readPathEntity(spark, inputGraphPath + "/" + e.getKey().name(), e.getValue()) - .map((MapFunction) entity -> { - TypedRow t = new TypedRow(); - t.setType(e.getKey().name()); - t.setDeleted(entity.getDataInfo().getDeletedbyinference()); - t.setId(entity.getId()); - t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); - return t; - }, Encoders.bean(TypedRow.class))) - .reduce(spark.emptyDataset(Encoders.bean(TypedRow.class)), Dataset::union) + private static Dataset> readAllEntities(SparkSession spark, String inputGraphPath, int numPartitions) { + Dataset publication = readPathEntity(spark, inputGraphPath + "/publication", Publication.class); + Dataset dataset = readPathEntity(spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + Dataset other = readPathEntity(spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class); + Dataset software = readPathEntity(spark, inputGraphPath + "/software", Software.class); + Dataset datasource = readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); + Dataset organization = readPathEntity(spark, inputGraphPath + "/organization", Organization.class); + Dataset project = readPathEntity(spark, inputGraphPath + "/project", Project.class); + + return publication + .union(dataset) + .union(other) + 
.union(software) + .union(datasource) + .union(organization) + .union(project) .map((MapFunction>) - value -> new Tuple2<>(value.getId(), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))); + value -> new Tuple2<>(value.getId(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) + .repartition(numPartitions); } private static Dataset> readRelatedEntities(SparkSession spark, String inputRelatedEntitiesPath) { + + log.info("Reading related entities from: {}", inputRelatedEntitiesPath); + + final List paths = HdfsSupport.listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); + + log.info("Found paths: {}", String.join(",", paths)); + return spark.read() - .load(inputRelatedEntitiesPath) - .as(Encoders.kryo(EntityRelEntity.class)) + .load(toSeq(paths)) + .as(Encoders.bean(EntityRelEntity.class)) .map((MapFunction>) - value -> new Tuple2<>(value.getRelation().getSource(), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); + value -> new Tuple2<>(value.getRelation().getSource(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); } - - private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { log.info("Reading Graph table from: {}", inputEntityPath); return spark .read() .textFile(inputEntityPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)); + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)) + .map((MapFunction) value -> getTypedRow(StringUtils.substringAfterLast(inputEntityPath, "/"), value), Encoders.bean(TypedRow.class)); + } + + private static TypedRow getTypedRow(String type, OafEntity entity) throws JsonProcessingException { + TypedRow t = new TypedRow(); + t.setType(type); + t.setDeleted(entity.getDataInfo().getDeletedbyinference()); + t.setId(entity.getId()); + t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); + return t; } private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } + private static Seq toSeq(List list) { + return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); + } + } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 19599b52c4..4c324a4c4e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,25 +1,30 @@ package eu.dnetlib.dhp.oa.provision; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; import 
org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapGroupsFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; +import scala.math.Ordering; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import java.util.Optional; @@ -78,34 +83,24 @@ public class PrepareRelationsJob { String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + int numPartitions = Integer.parseInt(parser.get("relPartitions")); + log.info("relPartitions: {}", numPartitions); + SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); + prepareRelationsFromPaths(spark, inputRelationsPath, outputPath, numPartitions); }); } - private static void prepareRelationsFromPaths(SparkSession spark, String inputRelationsPath, String outputPath) { - RDD rels = readPathRelation(spark, inputRelationsPath) - .filter((FilterFunction) r -> r.getDataInfo().getDeletedbyinference() == false) - .javaRDD() - .mapToPair((PairFunction>) rel -> new Tuple2<>( - rel.getSource(), - Lists.newArrayList(rel))) - .reduceByKey((v1, v2) -> { - v1.addAll(v2); - v1.sort(SortableRelation::compareTo); - if (v1.size() > MAX_RELS) { - return v1.subList(0, MAX_RELS); - } - return new ArrayList<>(v1.subList(0, MAX_RELS)); - }) - .flatMap(r -> r._2().iterator()) - .rdd(); - - spark.createDataset(rels, Encoders.bean(SortableRelation.class)) + private static void prepareRelationsFromPaths(SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { + readPathRelation(spark, inputRelationsPath) + .filter((FilterFunction) value -> value.getDataInfo().getDeletedbyinference() == false) + .groupByKey((MapFunction) value -> value.getSource(), Encoders.STRING()) + .flatMapGroups((FlatMapGroupsFunction) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(SortableRelation.class)) + .repartition(numPartitions) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); @@ -121,8 +116,7 @@ public class PrepareRelationsJob { private static Dataset readPathRelation(SparkSession spark, final String inputPath) { return spark.read() .textFile(inputPath) - .map((MapFunction) s -> OBJECT_MAPPER.readValue(s, SortableRelation.class), - Encoders.bean(SortableRelation.class)); + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), Encoders.bean(SortableRelation.class)); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index 4dd4348040..8d1c79798d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -1,31 +1,21 @@ package eu.dnetlib.dhp.oa.provision.model; -import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; 
-import eu.dnetlib.dhp.schema.oaf.OafEntity; - import java.io.Serializable; public class JoinedEntity implements Serializable { - private GraphMappingUtils.EntityType type; - - private OafEntity entity; + private TypedRow entity; private Links links; - public GraphMappingUtils.EntityType getType() { - return type; + public JoinedEntity() { } - public void setType(GraphMappingUtils.EntityType type) { - this.type = type; - } - - public OafEntity getEntity() { + public TypedRow getEntity() { return entity; } - public void setEntity(OafEntity entity) { + public void setEntity(TypedRow entity) { this.entity = entity; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java index 4ea1948766..f23d961901 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java @@ -1,6 +1,10 @@ package eu.dnetlib.dhp.oa.provision.model; +import java.io.Serializable; import java.util.HashSet; -public class Links extends HashSet { +public class Links extends HashSet implements Serializable { + + public Links() { + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java index 430779c727..b294a66334 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -4,9 +4,10 @@ import com.google.common.collect.ComparisonChain; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.Relation; +import java.io.Serializable; import java.util.Map; -public class SortableRelation extends Relation implements Comparable { +public class SortableRelation extends Relation implements Comparable, Serializable { private final static Map weights = Maps.newHashMap(); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java index f1e2c652c5..942acaea1d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java @@ -11,6 +11,9 @@ public class Tuple2 implements Serializable { private RelatedEntity relatedEntity; + public Tuple2() { + } + public Tuple2(Relation relation, RelatedEntity relatedEntity) { this.relation = relation; this.relatedEntity = relatedEntity; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index 8418db8e69..b65c88201b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -102,11 +102,11 @@ public class GraphMappingUtils { entityMapping.get(EntityType.valueOf(targetType)).name()); } - public static String getMainType(final String type) { - return 
entityMapping.get(EntityType.valueOf(type)).name(); + public static String getMainType(final EntityType type) { + return entityMapping.get(type).name(); } - public static boolean isResult(String type) { + public static boolean isResult(EntityType type) { return MainEntityType.result.name().equals(getMainType(type)); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index f2b3aa2e73..9c339d41cd 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.oa.provision.utils; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; @@ -48,6 +49,8 @@ public class XmlRecordFactory implements Serializable { private boolean indent = false; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public XmlRecordFactory( final ContextMapper contextMapper, final boolean indent, final String schemaLocation, final String otherDatasourceTypesUForUI) { @@ -72,22 +75,24 @@ public class XmlRecordFactory implements Serializable { final Set contexts = Sets.newHashSet(); - final OafEntity entity = je.getEntity(); + final OafEntity entity = toOafEntity(je.getEntity()); TemplateFactory templateFactory = new TemplateFactory(); try { - final List metadata = metadata(je.getType(), entity, contexts); + final EntityType type = GraphMappingUtils.EntityType.valueOf(je.getEntity().getType()); + final List metadata = metadata(type, entity, contexts); // rels has to be processed before the contexts because they enrich the contextMap with the funding info. 
final List relations = listRelations(je, templateFactory, contexts); - metadata.addAll(buildContexts(getMainType(je.getType()), contexts)); + final String mainType = getMainType(type); + metadata.addAll(buildContexts(mainType, contexts)); metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo())); final String body = templateFactory.buildBody( - getMainType(je.getType()), + mainType, metadata, relations, - listChildren(je, templateFactory), listExtraInfo(je)); + listChildren(entity, je.getEntity().getType(), templateFactory), listExtraInfo(entity)); return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); } catch (final Throwable e) { @@ -95,6 +100,35 @@ public class XmlRecordFactory implements Serializable { } } + private static OafEntity toOafEntity(TypedRow typedRow) { + return parseOaf(typedRow.getOaf(), typedRow.getType()); + } + + private static OafEntity parseOaf(final String json, final String type) { + try { + switch (GraphMappingUtils.EntityType.valueOf(type)) { + case publication: + return OBJECT_MAPPER.readValue(json, Publication.class); + case dataset: + return OBJECT_MAPPER.readValue(json, Dataset.class); + case otherresearchproduct: + return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); + case software: + return OBJECT_MAPPER.readValue(json, Software.class); + case datasource: + return OBJECT_MAPPER.readValue(json, Datasource.class); + case organization: + return OBJECT_MAPPER.readValue(json, Organization.class); + case project: + return OBJECT_MAPPER.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + private String printXML(String xml, boolean indent) { try { final Document doc = new SAXReader().read(new StringReader(xml)); @@ -110,7 +144,7 @@ public class XmlRecordFactory implements Serializable { } } - private List metadata(final String type, final OafEntity entity, final Set contexts) { + private List metadata(final EntityType type, final OafEntity entity, final Set contexts) { final List metadata = Lists.newArrayList(); @@ -262,7 +296,7 @@ public class XmlRecordFactory implements Serializable { metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", getBestAccessright(r))); } - switch (EntityType.valueOf(type)) { + switch (type) { case publication: final Publication pub = (Publication) entity; @@ -746,14 +780,14 @@ public class XmlRecordFactory implements Serializable { return rels; } - private List listChildren(final JoinedEntity je, TemplateFactory templateFactory) { + private List listChildren(final OafEntity entity, String type, TemplateFactory templateFactory) { final List children = Lists.newArrayList(); - - if (MainEntityType.result.toString().equals(getMainType(je.getType()))) { - final List instances = ((Result) je.getEntity()).getInstance(); + EntityType entityType = EntityType.valueOf(type); + if (MainEntityType.result.toString().equals(getMainType(entityType))) { + final List instances = ((Result) entity).getInstance(); if (instances != null) { - for (final Instance instance : ((Result) je.getEntity()).getInstance()) { + for (final Instance instance : ((Result) entity).getInstance()) { final List fields = Lists.newArrayList(); @@ -788,9 +822,9 @@ public class XmlRecordFactory implements Serializable { children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl())); } } - final List ext = ((Result) 
je.getEntity()).getExternalReference(); + final List ext = ((Result) entity).getExternalReference(); if (ext != null) { - for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) { + for (final ExternalReference er : ((Result) entity).getExternalReference()) { final List fields = Lists.newArrayList(); @@ -824,8 +858,8 @@ public class XmlRecordFactory implements Serializable { return children; } - private List listExtraInfo(JoinedEntity je) { - final List extraInfo = je.getEntity().getExtraInfo(); + private List listExtraInfo(OafEntity entity) { + final List extraInfo = entity.getExtraInfo(); return extraInfo != null ? extraInfo .stream() .map(e -> XmlSerializationUtils.mapExtraInfo(e)) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json index 043129c9f5..bfb248d012 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json @@ -16,5 +16,11 @@ "paramLongName": "outputPath", "paramDescription": "root output location for prepared relations", "paramRequired": true + }, + { + "paramName": "rp", + "paramLongName": "relPartitions", + "paramDescription": "number or partitions for the relations Dataset", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json index cb7949d492..2727f153bd 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json @@ -13,7 +13,7 @@ }, { "paramName": "iep", - "paramLongName": "inputGraphPath", + "paramLongName": "inputGraphRootPath", "paramDescription": "root graph path", "paramRequired": true }, @@ -22,5 +22,11 @@ "paramLongName": "outputPath", "paramDescription": "root output location for prepared relations", "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions to use for the output", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 33b9291c47..4a78df5b0d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -5,6 +5,10 @@ inputGraphRootPath root location of input materialized graph + + isLookupUrl + URL for the isLookup service + sparkDriverMemoryForJoining @@ -97,6 +101,7 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation + --relPartitions3000 @@ -128,13 +133,14 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf 
spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${workingDir}/join_partial - + @@ -154,13 +160,14 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/dataset --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${workingDir}/join_partial - + @@ -180,13 +187,14 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${workingDir}/join_partial - + @@ -206,13 +214,14 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${workingDir}/join_partial - + @@ -232,13 +241,14 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/datasource --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource --outputPath${workingDir}/join_partial - + @@ -258,13 +268,14 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/organization --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization --outputPath${workingDir}/join_partial - + @@ -284,17 +295,19 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - --inputRelationsPath${workingDir}/relations + --inputRelationsPath${workingDir}/relation --inputEntityPath${inputGraphRootPath}/project --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project --outputPath${workingDir}/join_partial - + - + + @@ -312,10 +325,12 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} - 
--inputEntityPath${inputGraphRootPath} + --inputGraphRootPath${inputGraphRootPath} --inputRelatedEntitiesPath${workingDir}/join_partial --outputPath${workingDir}/join_entities + --numPartitions12000 @@ -337,6 +352,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} --inputPath ${workingDir}/join_entities --outputPath${workingDir}/joined @@ -361,6 +377,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --conf spark.network.timeout=${sparkNetworkTimeout} --inputPath${workingDir}/joined --outputPath${workingDir}/xml From ca345aaad33139c85e105fab671cf59a0570e646 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 6 Apr 2020 15:33:31 +0200 Subject: [PATCH 09/13] dataset based provision WIP --- .../oa/provision/AdjacencyListBuilderJob.java | 41 ++++++++--------- .../CreateRelatedEntitiesJob_phase1.java | 45 ++++++++++--------- .../CreateRelatedEntitiesJob_phase2.java | 34 ++++++-------- .../dhp/oa/provision/PrepareRelationsJob.java | 32 ++++++------- .../dhp/oa/provision/XmlConverterJob.java | 45 +++++++++++++++---- .../dhp/oa/provision/model/JoinedEntity.java | 7 +-- .../dnetlib/dhp/oa/provision/model/Links.java | 10 ----- .../oa/provision/utils/GraphMappingUtils.java | 16 +++---- .../oa/provision/utils/XmlRecordFactory.java | 2 +- .../dhp/oa/provision/oozie_app/workflow.xml | 10 ++--- 10 files changed, 124 insertions(+), 118 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java index 291a44858e..2cc52fb62b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -3,31 +3,25 @@ package eu.dnetlib.dhp.oa.provision; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.model.Tuple2; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.*; -import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.expressions.Encode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.util.Iterator; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; -import java.util.Spliterator; -import java.util.Spliterators; -import 
java.util.stream.Collectors; -import java.util.stream.StreamSupport; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses; /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. @@ -43,14 +37,19 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; * can be linked at most to 100 other objects * * 2) JoinRelationEntityByTargetJob: - * prepare tuples [source entity - relation - target entity] (S - R - T): + * (phase 1): prepare tuples [relation - target entity] (R - T): * for each entity type E_i - * join (R.target = E_i.id), - * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] - * join (E_i.id = [R - T_i].source), where E_i becomes the source entity S + * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information + * join (R.target = T_i.id) + * save the tuples (R_i, T_i) + * (phase 2): + * create the union of all the entity types E, hash by id + * read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S + * save the tuples (S, R, T) * * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity * * 4) XmlConverterJob: * convert the JoinedEntities as XML records @@ -59,7 +58,6 @@ public class AdjacencyListBuilderJob { private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final int MAX_LINKS = 100; public static void main(String[] args) throws Exception { @@ -91,7 +89,6 @@ public class AdjacencyListBuilderJob { removeOutputDir(spark, outputPath); createAdjacencyLists(spark, inputPath, outputPath); }); - } private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) { @@ -103,7 +100,7 @@ public class AdjacencyListBuilderJob { .groupByKey((MapFunction) value -> value.getEntity().getId(), Encoders.STRING()) .mapGroups((MapGroupsFunction) (key, values) -> { JoinedEntity j = new JoinedEntity(); - Links links = new Links(); + List links = new ArrayList<>(); while (values.hasNext() && links.size() < MAX_LINKS) { EntityRelEntity curr = values.next(); if (j.getEntity() == null) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 0e3a5e4727..7d3555b6c2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -4,9 +4,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; -import 
eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.OafEntity; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -37,22 +37,22 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity * can be linked at most to 100 other objects * - * 2) CreateRelatedEntitiesJob_phase1: - * prepare tuples [relation - target entity] (R - T): + * 2) JoinRelationEntityByTargetJob: + * (phase 1): prepare tuples [relation - target entity] (R - T): * for each entity type E_i - * join (R.target = E_i.id), - * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] + * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information + * join (R.target = T_i.id) + * save the tuples (R_i, T_i) + * (phase 2): + * create the union of all the entity types E, hash by id + * read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S + * save the tuples (S, R, T) * - * 3) CreateRelatedEntitiesJob_phase2: - * prepare tuples [source entity - relation - target entity] (S - R - T): - * create the union of the each entity type, hash by id (S) - * for each [R - T_i] produced in phase1 - * join S.id = [R - T_i].source to produce (S_i - R - T_i) - * - * 4) AdjacencyListBuilderJob: + * 3) AdjacencyListBuilderJob: * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity * - * 5) XmlConverterJob: + * 4) XmlConverterJob: * convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase1 { @@ -103,20 +103,21 @@ public class CreateRelatedEntitiesJob_phase1 { private static void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class entityClazz, String outputPath) { Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) + .filter((FilterFunction) value -> value.getDataInfo().getDeletedbyinference() == false) .map((MapFunction>) r -> new Tuple2<>(r.getTarget(), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))); + Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) + .cache(); - Dataset> entities = readPathEntity(spark, inputEntityPath, entityClazz) - .map((MapFunction>) e -> new Tuple2<>(e.getId(), e), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(entityClazz))) + Dataset> entities = readPathEntity(spark, inputEntityPath, entityClazz) + .map((MapFunction) value -> asRelatedEntity(value, entityClazz), Encoders.bean(RelatedEntity.class)) + .map((MapFunction>) e -> new Tuple2<>(e.getId(), e), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) .cache(); relsByTarget .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") - .filter((FilterFunction, Tuple2>>) - value -> value._2()._2().getDataInfo().getDeletedbyinference() == false) - .map((MapFunction, Tuple2>, EntityRelEntity>) - t -> new EntityRelEntity(t._1()._2(), GraphMappingUtils.asRelatedEntity(t._2()._2(), entityClazz)), + .map((MapFunction, Tuple2>, EntityRelEntity>) + t -> new EntityRelEntity(t._1()._2(), t._2()._2()), Encoders.bean(EntityRelEntity.class)) .write() .mode(SaveMode.Overwrite) diff --git 
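The joinRelationEntity method above reduces to a standard typed-Dataset join: key both sides by the join column as a Tuple2<String, payload>, joinWith them, and project the matched pair. Keeping the key as a plain string column (_1) while the payload stays a kryo-encoded binary column (_2) is what allows the join condition to be expressed without the engine needing to understand the payload type. A minimal, runnable sketch of the same shape, with placeholder Rel and Node beans instead of SortableRelation and RelatedEntity:

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    import java.io.Serializable;
    import java.util.Arrays;

    public class Phase1JoinSketch {

        // placeholder for SortableRelation: just source and target ids
        public static class Rel implements Serializable {
            private String source;
            private String target;
            public Rel() {}
            public Rel(String source, String target) { this.source = source; this.target = target; }
            public String getSource() { return source; }
            public void setSource(String source) { this.source = source; }
            public String getTarget() { return target; }
            public void setTarget(String target) { this.target = target; }
        }

        // placeholder for RelatedEntity: just the entity id
        public static class Node implements Serializable {
            private String id;
            public Node() {}
            public Node(String id) { this.id = id; }
            public String getId() { return id; }
            public void setId(String id) { this.id = id; }
        }

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("phase1-join-sketch").getOrCreate();

            Dataset<Rel> rels = spark.createDataset(
                    Arrays.asList(new Rel("p1", "d1"), new Rel("p2", "d1")), Encoders.bean(Rel.class));
            Dataset<Node> nodes = spark.createDataset(
                    Arrays.asList(new Node("d1")), Encoders.bean(Node.class));

            // key the relations by target and the entities by id, as relsByTarget / entities above
            Dataset<Tuple2<String, Rel>> relsByTarget = rels.map(
                    (MapFunction<Rel, Tuple2<String, Rel>>) r -> new Tuple2<>(r.getTarget(), r),
                    Encoders.tuple(Encoders.STRING(), Encoders.kryo(Rel.class)));
            Dataset<Tuple2<String, Node>> nodesById = nodes.map(
                    (MapFunction<Node, Tuple2<String, Node>>) n -> new Tuple2<>(n.getId(), n),
                    Encoders.tuple(Encoders.STRING(), Encoders.kryo(Node.class)));

            // inner join on the string key column; each row pairs a relation with its target entity
            relsByTarget
                    .joinWith(nodesById, relsByTarget.col("_1").equalTo(nodesById.col("_1")), "inner")
                    .map((MapFunction<Tuple2<Tuple2<String, Rel>, Tuple2<String, Node>>, String>)
                                    t -> t._1()._2().getSource() + " -> " + t._2()._2().getId(),
                            Encoders.STRING())
                    .show(false);

            spark.stop();
        }
    }
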
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 645883f6fb..2b5c627b62 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -6,11 +6,9 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; import eu.dnetlib.dhp.oa.provision.model.TypedRow; -import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; @@ -25,12 +23,10 @@ import scala.collection.JavaConverters; import scala.collection.Seq; import java.util.List; -import java.util.Map; import java.util.Optional; -import java.util.function.Function; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses; /** * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. @@ -45,24 +41,22 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity * can be linked at most to 100 other objects * - * 2) CreateRelatedEntitiesJob_phase1: - * prepare tuples [relation - target entity] (R - T): + * 2) JoinRelationEntityByTargetJob: + * (phase 1): prepare tuples [relation - target entity] (R - T): * for each entity type E_i - * join (R.target = E_i.id), - * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] - * save the tuples [R - T_i] in append mode + * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information + * join (R.target = T_i.id) + * save the tuples (R_i, T_i) + * (phase 2): + * create the union of all the entity types E, hash by id + * read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S + * save the tuples (S, R, T) * - * 3) CreateRelatedEntitiesJob_phase2: - * prepare tuples [source entity - relation - target entity] (S - R - T): - * create the union of the each entity type, hash by id (S) - * for each [R - T_i] produced in phase1 - * join S.id = [R - T_i].source to produce (S_i - R - T_i) - * save in append mode + * 3) AdjacencyListBuilderJob: + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity * - * 4) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity - * - * 5) XmlConverterJob: + * 4) XmlConverterJob: * convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase2 { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 4c324a4c4e..5a70e258f6 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,31 +1,22 @@ package eu.dnetlib.dhp.oa.provision; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FlatMapGroupsFunction; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.Tuple2; -import scala.math.Ordering; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; import java.util.Optional; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; @@ -44,14 +35,19 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; * can be linked at most to 100 other objects * * 2) JoinRelationEntityByTargetJob: - * prepare tuples [source entity - relation - target entity] (S - R - T): + * (phase 1): prepare tuples [relation - target entity] (R - T): * for each entity type E_i - * join (R.target = E_i.id), - * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] - * join (E_i.id = [R - T_i].source), where E_i becomes the source entity S + * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information + * join (R.target = T_i.id) + * save the tuples (R_i, T_i) + * (phase 2): + * create the union of all the entity types E, hash by id + * read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S + * save the tuples (S, R, T) * * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity * * 4) XmlConverterJob: * convert the JoinedEntities as XML records diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index 74a36c580b..910cd85436 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.oa.provision; +import com.fasterxml.jackson.databind.ObjectMapper; import 
com.google.common.collect.Maps; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -27,8 +28,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; +import java.util.ArrayList; +import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; @@ -37,23 +41,25 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, * and all the possible relationships (similarity links produced by the Dedup process are excluded). * - * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again - * by E, finally grouped by E.id; - * * The workflow is organized in different parts aimed to to reduce the complexity of the operation * 1) PrepareRelationsJob: * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity * can be linked at most to 100 other objects * * 2) JoinRelationEntityByTargetJob: - * prepare tuples [source entity - relation - target entity] (S - R - T): + * (phase 1): prepare tuples [relation - target entity] (R - T): * for each entity type E_i - * join (R.target = E_i.id), - * map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i] - * join (E_i.id = [R - T_i].source), where E_i becomes the source entity S + * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information + * join (R.target = T_i.id) + * save the tuples (R_i, T_i) + * (phase 2): + * create the union of all the entity types E, hash by id + * read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S + * save the tuples (S, R, T) * * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity + * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity * * 4) XmlConverterJob: * convert the JoinedEntities as XML records @@ -62,6 +68,8 @@ public class XmlConverterJob { private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; public static void main(String[] args) throws Exception { @@ -107,12 +115,31 @@ public class XmlConverterJob { spark.read() .load(inputPath) .as(Encoders.bean(JoinedEntity.class)) + /* .map((MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING()) + .write() + .option("codec", "org.apache.hadoop.io.compress.GzipCodec") + .text("/tmp/json"); + + spark.read() + .textFile("/tmp/json") + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, JoinedEntity.class), Encoders.bean(JoinedEntity.class)) + .map((MapFunction) j -> { + if (j.getLinks() != null) { + j.setLinks(j.getLinks() + .stream() + .filter(t -> t.getRelation() != null & t.getRelatedEntity() != null) + .collect(Collectors.toCollection(ArrayList::new))); + } + return j; + }, Encoders.bean(JoinedEntity.class)) + + */ .map((MapFunction>) je -> new Tuple2<>( je.getEntity().getId(), recordFactory.build(je) ), 
Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .javaRDD() - .mapToPair((PairFunction, String, String>) t -> t) + .mapToPair((PairFunction, Text, Text>) t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index 8d1c79798d..b6e97a503f 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -1,12 +1,13 @@ package eu.dnetlib.dhp.oa.provision.model; import java.io.Serializable; +import java.util.List; public class JoinedEntity implements Serializable { private TypedRow entity; - private Links links; + private List links; public JoinedEntity() { } @@ -19,11 +20,11 @@ public class JoinedEntity implements Serializable { this.entity = entity; } - public Links getLinks() { + public List getLinks() { return links; } - public void setLinks(Links links) { + public void setLinks(List links) { this.links = links; } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java deleted file mode 100644 index f23d961901..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Links.java +++ /dev/null @@ -1,10 +0,0 @@ -package eu.dnetlib.dhp.oa.provision.model; - -import java.io.Serializable; -import java.util.HashSet; - -public class Links extends HashSet implements Serializable { - - public Links() { - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index b65c88201b..398a272e23 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -114,7 +114,7 @@ public class GraphMappingUtils { final RelatedEntity re = new RelatedEntity(); re.setId(entity.getId()); - re.setType(clazz.getName()); + re.setType(EntityType.fromClass(clazz).name()); re.setPid(entity.getPid()); re.setCollectedfrom(entity.getCollectedfrom()); @@ -125,16 +125,16 @@ public class GraphMappingUtils { case otherresearchproduct: case software: - Result r = (Result) entity; + Result result = (Result) entity; - if (r.getTitle() == null && !r.getTitle().isEmpty()) { - re.setTitle(r.getTitle().stream().findFirst().get()); + if (result.getTitle() == null && !result.getTitle().isEmpty()) { + re.setTitle(result.getTitle().stream().findFirst().get()); } - re.setDateofacceptance(getValue(r.getDateofacceptance())); - re.setPublisher(getValue(r.getPublisher())); - re.setResulttype(re.getResulttype()); - re.setInstances(re.getInstances()); + re.setDateofacceptance(getValue(result.getDateofacceptance())); + re.setPublisher(getValue(result.getPublisher())); + re.setResulttype(result.getResulttype()); + re.setInstances(result.getInstance()); //TODO still to be mapped //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); diff --git 
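The asRelatedEntity mapping above is the "prune before you join" step: only the handful of fields the record serializer needs are copied from the full OAF entity onto the slim RelatedEntity, so the subsequent joins shuffle far less data. A stand-alone sketch of the idea, with hypothetical FullRecord and SlimRecord beans in place of the OAF model (note that the guard copies the title only when it is present and non-empty):

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;

    import java.io.Serializable;
    import java.util.Arrays;

    public class PruneBeforeJoinSketch {

        // hypothetical heavyweight entity, standing in for the full OAF Result
        public static class FullRecord implements Serializable {
            private String id;
            private String title;
            private String body; // large payload the join does not need
            public FullRecord() {}
            public FullRecord(String id, String title, String body) { this.id = id; this.title = title; this.body = body; }
            public String getId() { return id; }
            public void setId(String id) { this.id = id; }
            public String getTitle() { return title; }
            public void setTitle(String title) { this.title = title; }
            public String getBody() { return body; }
            public void setBody(String body) { this.body = body; }
        }

        // hypothetical slim projection, standing in for RelatedEntity
        public static class SlimRecord implements Serializable {
            private String id;
            private String title;
            public SlimRecord() {}
            public String getId() { return id; }
            public void setId(String id) { this.id = id; }
            public String getTitle() { return title; }
            public void setTitle(String title) { this.title = title; }
        }

        // mirrors asRelatedEntity: copy only the fields needed on the related side of the join
        public static SlimRecord asSlim(FullRecord f) {
            SlimRecord s = new SlimRecord();
            s.setId(f.getId());
            if (f.getTitle() != null && !f.getTitle().isEmpty()) {
                s.setTitle(f.getTitle());
            }
            return s;
        }

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("prune-before-join").getOrCreate();

            Dataset<FullRecord> full = spark.createDataset(
                    Arrays.asList(new FullRecord("50|doi_1", "A title", "a very long abstract")),
                    Encoders.bean(FullRecord.class));

            Dataset<SlimRecord> slim = full.map(
                    (MapFunction<FullRecord, SlimRecord>) PruneBeforeJoinSketch::asSlim,
                    Encoders.bean(SlimRecord.class));

            slim.show(false);
            spark.stop();
        }
    }
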
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 9c339d41cd..2ea78fe845 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -694,7 +694,7 @@ public class XmlRecordFactory implements Serializable { if (isNotBlank(re.getCodeRepositoryUrl())) { metadata.add(XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); } - if (re.getResulttype() != null & !re.getResulttype().isBlank()) { + if (re.getResulttype() != null & re.getResulttype().isBlank()) { metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); } if (re.getCollectedfrom() != null) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 4a78df5b0d..5bc89396bc 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -75,7 +75,7 @@ ${wf:conf('reuseRecords') eq false} ${wf:conf('reuseRecords') eq true} - + @@ -132,7 +132,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=7680 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -324,7 +324,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=7680 --conf spark.network.timeout=${sparkNetworkTimeout} --inputGraphRootPath${inputGraphRootPath} @@ -351,7 +351,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=7680 --conf spark.network.timeout=${sparkNetworkTimeout} --inputPath ${workingDir}/join_entities @@ -365,7 +365,7 @@ yarn cluster - build_adjacency_lists + convert_to_xml eu.dnetlib.dhp.oa.provision.XmlConverterJob dhp-graph-provision-${projectVersion}.jar From e3559619975078b0aab81eba36c79500a18c5287 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 6 Apr 2020 17:34:25 +0200 Subject: [PATCH 10/13] dataset based provision WIP --- .../dhp/oa/provision/PrepareRelationsJob.java | 37 ++++++- .../dhp/oa/provision/XmlConverterJob.java | 10 -- .../oa/provision/model/EntityRelEntity.java | 17 ++++ .../dhp/oa/provision/model/RelatedEntity.java | 36 +++++++ .../provision/model/SortableRelationKey.java | 96 ------------------- .../provision/utils/RelationPartitioner.java | 4 +- 6 files changed, 88 insertions(+), 112 deletions(-) delete mode 100644 
dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 5a70e258f6..337a2ebbb4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,21 +1,26 @@ package eu.dnetlib.dhp.oa.provision; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.FlatMapGroupsFunction; -import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.*; +import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import scala.Tuple2; import java.util.Optional; @@ -87,7 +92,7 @@ public class PrepareRelationsJob { runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareRelationsFromPaths(spark, inputRelationsPath, outputPath, numPartitions); + prepareRelationsRDDFromPaths(spark, inputRelationsPath, outputPath, numPartitions); }); } @@ -115,6 +120,30 @@ public class PrepareRelationsJob { .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), Encoders.bean(SortableRelation.class)); } + private static void prepareRelationsRDDFromPaths(SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath) + .repartition(numPartitions); + + RDD d = rels + .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) //only consider those that are not virtually deleted + .mapToPair((PairFunction) rel -> new Tuple2<>(rel, rel)) + .groupByKey(new RelationPartitioner(rels.getNumPartitions())) + .map(p -> Iterables.limit(p._2(), MAX_RELS)) + .flatMap(p -> p.iterator()) + .rdd(); + + spark.createDataset(d, Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } + + private static JavaRDD readPathRelationRDD(SparkSession spark, final String inputPath) { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + return sc.textFile(inputPath) + .map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); + } + private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java 
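The prepareRelationsRDDFromPaths method above enforces the "at most 100 relationships per entity" rule by bringing all relations of a source together and truncating the group. A compact, self-contained version of that pruning step, assuming Guava on the classpath (as the patch itself does), using a plain groupBy in place of the custom RelationPartitioner and a placeholder Rel bean instead of SortableRelation:

    import com.google.common.collect.Iterables;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.FlatMapFunction;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    import java.io.Serializable;
    import java.util.Arrays;
    import java.util.List;

    public class PruneRelationsSketch {

        private static final int MAX_RELS = 100;

        // placeholder for SortableRelation
        public static class Rel implements Serializable {
            private String source;
            private String target;
            private boolean deleted;
            public Rel() {}
            public Rel(String source, String target, boolean deleted) {
                this.source = source; this.target = target; this.deleted = deleted;
            }
            public String getSource() { return source; }
            public String getTarget() { return target; }
            public boolean isDeleted() { return deleted; }
        }

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("prune-relations").getOrCreate();
            JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

            JavaRDD<Rel> rels = sc.parallelize(Arrays.asList(
                    new Rel("A", "B", false),
                    new Rel("A", "C", false),
                    new Rel("A", "D", true))); // virtually deleted, must be dropped

            List<Rel> pruned = rels
                    .filter(r -> !r.isDeleted())   // same role as the deletedbyinference filter
                    .groupBy(Rel::getSource)       // bring every relation of a source together
                    .flatMap((FlatMapFunction<Tuple2<String, Iterable<Rel>>, Rel>)
                            g -> Iterables.limit(g._2(), MAX_RELS).iterator()) // cap the per-source fan-out
                    .collect();

            System.out.println(pruned.size() + " relations kept");
            spark.stop();
        }
    }

The real job routes the pairs with RelationPartitioner instead of a generic groupBy, so that everything sharing a source id lands in the same partition before the per-source cap is applied.
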
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index 910cd85436..059cb31f25 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -115,14 +115,6 @@ public class XmlConverterJob { spark.read() .load(inputPath) .as(Encoders.bean(JoinedEntity.class)) - /* .map((MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING()) - .write() - .option("codec", "org.apache.hadoop.io.compress.GzipCodec") - .text("/tmp/json"); - - spark.read() - .textFile("/tmp/json") - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, JoinedEntity.class), Encoders.bean(JoinedEntity.class)) .map((MapFunction) j -> { if (j.getLinks() != null) { j.setLinks(j.getLinks() @@ -132,8 +124,6 @@ public class XmlConverterJob { } return j; }, Encoders.bean(JoinedEntity.class)) - - */ .map((MapFunction>) je -> new Tuple2<>( je.getEntity().getId(), recordFactory.build(je) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java index 35dfa41d38..e1ca8e3164 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java @@ -1,5 +1,7 @@ package eu.dnetlib.dhp.oa.provision.model; +import com.google.common.base.Objects; + import java.io.Serializable; public class EntityRelEntity implements Serializable { @@ -44,4 +46,19 @@ public class EntityRelEntity implements Serializable { public void setTarget(RelatedEntity target) { this.target = target; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + EntityRelEntity that = (EntityRelEntity) o; + return Objects.equal(entity, that.entity) && + Objects.equal(relation, that.relation) && + Objects.equal(target, that.target); + } + + @Override + public int hashCode() { + return Objects.hashCode(entity, relation, target); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index 2e5b4186cb..011d9276d3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.oa.provision.model; +import com.google.common.base.Objects; import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; @@ -228,4 +229,39 @@ public class RelatedEntity implements Serializable { public void setFundingtree(List fundingtree) { this.fundingtree = fundingtree; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + RelatedEntity that = (RelatedEntity) o; + return Objects.equal(id, that.id) && + Objects.equal(type, that.type) && + Objects.equal(title, that.title) && + Objects.equal(websiteurl, that.websiteurl) && + Objects.equal(dateofacceptance, 
that.dateofacceptance) && + Objects.equal(publisher, that.publisher) && + Objects.equal(pid, that.pid) && + Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) && + Objects.equal(resulttype, that.resulttype) && + Objects.equal(collectedfrom, that.collectedfrom) && + Objects.equal(instances, that.instances) && + Objects.equal(officialname, that.officialname) && + Objects.equal(datasourcetype, that.datasourcetype) && + Objects.equal(datasourcetypeui, that.datasourcetypeui) && + Objects.equal(openairecompatibility, that.openairecompatibility) && + Objects.equal(legalname, that.legalname) && + Objects.equal(legalshortname, that.legalshortname) && + Objects.equal(country, that.country) && + Objects.equal(projectTitle, that.projectTitle) && + Objects.equal(code, that.code) && + Objects.equal(acronym, that.acronym) && + Objects.equal(contracttype, that.contracttype) && + Objects.equal(fundingtree, that.fundingtree); + } + + @Override + public int hashCode() { + return Objects.hashCode(id, type, title, websiteurl, dateofacceptance, publisher, pid, codeRepositoryUrl, resulttype, collectedfrom, instances, officialname, datasourcetype, datasourcetypeui, openairecompatibility, legalname, legalshortname, country, projectTitle, code, acronym, contracttype, fundingtree); + } } \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java deleted file mode 100644 index fef9915e80..0000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ /dev/null @@ -1,96 +0,0 @@ -package eu.dnetlib.dhp.oa.provision.model; - -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.Relation; - -import java.io.Serializable; -import java.util.Map; - -/** - * Allows to sort relationships according to the priority defined in weights map. 
- */ -public class SortableRelationKey implements Comparable, Serializable { - - private String sourceId; - private String targetId; - - private String relType; - private String subRelType; - private String relClass; - - private final static Map weights = Maps.newHashMap(); - - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("publicationDataset", 2); - weights.put("relationship", 3); - weights.put("similarity", 4); - weights.put("affiliation", 5); - - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } - - public static SortableRelationKey from(final Relation r) { - final SortableRelationKey s = new SortableRelationKey(); - s.setSourceId(r.getSource()); - s.setTargetId(r.getTarget()); - s.setRelType(r.getRelType()); - s.setSubRelType(r.getSubRelType()); - s.setRelClass(r.getRelClass()); - return s; - } - - public String getSourceId() { - return sourceId; - } - - public void setSourceId(String sourceId) { - this.sourceId = sourceId; - } - - public String getTargetId() { - return targetId; - } - - public void setTargetId(String targetId) { - this.targetId = targetId; - } - - public String getRelType() { - return relType; - } - - public void setRelType(String relType) { - this.relType = relType; - } - - public String getSubRelType() { - return subRelType; - } - - public void setSubRelType(String subRelType) { - this.subRelType = subRelType; - } - - public String getRelClass() { - return relClass; - } - - public void setRelClass(String relClass) { - this.relClass = relClass; - } - - @Override - public int compareTo(SortableRelationKey o) { - return ComparisonChain.start() - .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) - .compare(getSourceId(), o.getSourceId()) - .compare(getTargetId(), o.getTargetId()) - .result(); - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index 9714830d35..c8e7a24292 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -1,6 +1,6 @@ package eu.dnetlib.dhp.oa.provision.utils; -import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import org.apache.spark.Partitioner; import org.apache.spark.util.Utils; @@ -23,7 +23,7 @@ public class RelationPartitioner extends Partitioner { @Override public int getPartition(Object key) { - return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions()); + return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions()); } } From 77f59b1b1084cc79ee0bd9e64222fa30eed05f7a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 6 Apr 2020 19:37:27 +0200 Subject: [PATCH 11/13] dataset based provision WIP --- ...mlIndexingJob.java => XmlIndexingJob.java} | 10 ++----- .../provision/input_params_update_index.json | 29 +++++++++++++++---- .../dhp/oa/provision/oozie_app/workflow.xml | 6 ++-- 3 files changed, 30 insertions(+), 15 deletions(-) rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/{SparkXmlIndexingJob.java => XmlIndexingJob.java} (95%) diff --git 
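The RelationPartitioner adjusted above encapsulates a single idea: derive the partition from the relation source id, so records sharing a source are co-located before they are grouped and truncated. A stripped-down partitioner of the same shape, assuming a plain String source id is used as the key (SourceIdPartitioner is an illustrative name, not a class from the patch):

    import org.apache.spark.Partitioner;

    public class SourceIdPartitioner extends Partitioner {

        private final int numPartitions;

        public SourceIdPartitioner(int numPartitions) {
            this.numPartitions = numPartitions;
        }

        @Override
        public int numPartitions() {
            return numPartitions;
        }

        @Override
        public int getPartition(Object key) {
            // non-negative modulo of the source id hash, equivalent to the Utils.nonNegativeMod call above
            return Math.floorMod(key.toString().hashCode(), numPartitions);
        }

        // partitioners should be comparable so Spark can skip redundant shuffles
        @Override
        public boolean equals(Object other) {
            return other instanceof SourceIdPartitioner
                    && ((SourceIdPartitioner) other).numPartitions == numPartitions;
        }

        @Override
        public int hashCode() {
            return numPartitions;
        }
    }

It would be plugged in the same way the patch does, e.g. pairs.groupByKey(new SourceIdPartitioner(pairs.getNumPartitions())).
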
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java similarity index 95% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index eae8cf1a14..84538c924a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SparkXmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.oa.provision; import com.lucidworks.spark.util.SolrSupport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; @@ -11,14 +10,11 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; import org.apache.solr.common.SolrInputDocument; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,9 +31,9 @@ import java.util.Optional; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -public class SparkXmlIndexingJob { +public class XmlIndexingJob { - private static final Logger log = LoggerFactory.getLogger(SparkXmlIndexingJob.class); + private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); private static final Integer DEFAULT_BATCH_SIZE = 1000; @@ -50,7 +46,7 @@ public class SparkXmlIndexingJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString( - SparkXmlIndexingJob.class.getResourceAsStream( + XmlIndexingJob.class.getResourceAsStream( "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json index 146cc9943b..3396020e07 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json @@ -1,7 +1,26 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - {"paramName":"i", "paramLongName":"inputPath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true}, - {"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true}, - {"paramName":"b", "paramLongName":"batchSize", 
"paramDescription": "size of the batch of documents sent to solr", "paramRequired": false} + { + "paramName": "is", + "paramLongName": "isLookupUrl", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "the path of the sequence file to read the XML records", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "format", + "paramDescription": "MDFormat name found in the IS profile", + "paramRequired": true + }, + { + "paramName": "b", + "paramLongName": "batchSize", + "paramDescription": "size of the batch of documents sent to solr", + "paramRequired": false + } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 5bc89396bc..e6587ef5e0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -393,12 +393,12 @@ yarn cluster to_solr_index - eu.dnetlib.dhp.oa.provision.SparkXmlIndexingJob + eu.dnetlib.dhp.oa.provision.XmlIndexingJob dhp-graph-provision-${projectVersion}.jar - --executor-cores ${sparkExecutorCoresForIndexing} - --executor-memory ${sparkExecutorMemoryForIndexing} + --executor-memory=${sparkExecutorMemoryForIndexing} --driver-memory=${sparkDriverMemoryForIndexing} + --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} From fbdd18a96b5c56db2148ce5eb69eb5d060d83cfe Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 7 Apr 2020 08:54:39 +0200 Subject: [PATCH 12/13] using dataset based relation preparation procedure --- .../java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 337a2ebbb4..caddfaf8d0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -92,7 +92,7 @@ public class PrepareRelationsJob { runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareRelationsRDDFromPaths(spark, inputRelationsPath, outputPath, numPartitions); + prepareRelationsFromPaths(spark, inputRelationsPath, outputPath, numPartitions); }); } From 1a1a026a18adebb1367f97fad7ce7077f2c93870 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 7 Apr 2020 08:55:33 +0200 Subject: [PATCH 13/13] we do expect to find field bestaccessright already defined. 
No need to add it again --- .../eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 2ea78fe845..5cf881f00d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -292,8 +292,6 @@ public class XmlRecordFactory implements Serializable { if (r.getResourcetype() != null) { metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); } - - metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", getBestAccessright(r))); } switch (type) {
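The XML records that XmlIndexingJob consumes are the (Text id, Text xml) pairs written by XmlConverterJob as a gzip-compressed SequenceFile. A minimal stand-alone reader for that output (the path below is a placeholder) could look like this:

    import org.apache.hadoop.io.Text;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    public class ReadXmlRecords {

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("read-xml-records").getOrCreate();
            JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

            // same key/value types used by saveAsHadoopFile(..., Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class)
            JavaPairRDD<Text, Text> records =
                    sc.sequenceFile("/tmp/provision/xml", Text.class, Text.class); // placeholder path

            records
                    // Hadoop reuses the Text instances, so copy them to Strings before doing anything else
                    .mapToPair((PairFunction<Tuple2<Text, Text>, String, String>)
                            t -> new Tuple2<>(t._1().toString(), t._2().toString()))
                    .take(10)
                    .forEach(t -> System.out.println(t._1() + " -> " + t._2().length() + " characters of XML"));

            spark.stop();
        }
    }
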