forked from D-Net/dnet-hadoop
dataset based provision WIP
parent c8f4b95464
commit ca345aaad3
@@ -3,31 +3,25 @@ package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.*;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.Tuple2;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.expressions.Encode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;

/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
@@ -43,14 +37,19 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* prepare tuples [source entity - relation - target entity] (S - R - T):
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* join (R.target = E_i.id),
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
* join (E_i.id = [R - T_i].source), where E_i becomes the source entity S
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
@@ -59,7 +58,6 @@ public class AdjacencyListBuilderJob {

private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class);

private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final int MAX_LINKS = 100;

public static void main(String[] args) throws Exception {
@@ -91,7 +89,6 @@ public class AdjacencyListBuilderJob {
removeOutputDir(spark, outputPath);
createAdjacencyLists(spark, inputPath, outputPath);
});

}

private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) {
@@ -103,7 +100,7 @@ public class AdjacencyListBuilderJob {
.groupByKey((MapFunction<EntityRelEntity, String>) value -> value.getEntity().getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, EntityRelEntity, JoinedEntity>) (key, values) -> {
JoinedEntity j = new JoinedEntity();
Links links = new Links();
List<Tuple2> links = new ArrayList<>();
while (values.hasNext() && links.size() < MAX_LINKS) {
EntityRelEntity curr = values.next();
if (j.getEntity() == null) {
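Note: the grouping pattern in this hunk can be read as "key each EntityRelEntity by the id of its source entity, then fold at most MAX_LINKS (relation, related entity) pairs into a single JoinedEntity". A minimal sketch of that step follows; it reuses the model classes visible in this diff, while the pair constructor and the getRelation()/getTarget() accessors are assumptions, not the project's confirmed API.

// Sketch only, not the actual implementation in this commit.
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.Tuple2;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

import java.util.ArrayList;
import java.util.List;

class AdjacencySketch {

    static final int MAX_LINKS = 100; // cap taken from the hunk above

    static Dataset<JoinedEntity> toAdjacencyList(Dataset<EntityRelEntity> rows) {
        return rows
                // group all (S - R - T) rows sharing the same source entity id
                .groupByKey((MapFunction<EntityRelEntity, String>) v -> v.getEntity().getId(), Encoders.STRING())
                .mapGroups((MapGroupsFunction<String, EntityRelEntity, JoinedEntity>) (key, values) -> {
                    JoinedEntity j = new JoinedEntity();
                    List<Tuple2> links = new ArrayList<>();
                    while (values.hasNext() && links.size() < MAX_LINKS) {
                        EntityRelEntity curr = values.next();
                        if (j.getEntity() == null) {
                            j.setEntity(curr.getEntity()); // first row provides the source entity
                        }
                        // getRelation()/getTarget() and the two-arg Tuple2 constructor are assumed here
                        links.add(new Tuple2(curr.getRelation(), curr.getTarget()));
                    }
                    j.setLinks(links);
                    return j;
                }, Encoders.kryo(JoinedEntity.class));
    }
}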
@@ -4,9 +4,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@@ -37,22 +37,22 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) CreateRelatedEntitiesJob_phase1:
* prepare tuples [relation - target entity] (R - T):
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* join (R.target = E_i.id),
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) CreateRelatedEntitiesJob_phase2:
* prepare tuples [source entity - relation - target entity] (S - R - T):
* create the union of the each entity type, hash by id (S)
* for each [R - T_i] produced in phase1
* join S.id = [R - T_i].source to produce (S_i - R - T_i)
*
* 4) AdjacencyListBuilderJob:
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 5) XmlConverterJob:
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class CreateRelatedEntitiesJob_phase1 {
@@ -103,20 +103,21 @@ public class CreateRelatedEntitiesJob_phase1 {
private static <E extends OafEntity> void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class<E> entityClazz, String outputPath) {

Dataset<Tuple2<String, SortableRelation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
.filter((FilterFunction<SortableRelation>) value -> value.getDataInfo().getDeletedbyinference() == false)
.map((MapFunction<SortableRelation, Tuple2<String, SortableRelation>>) r -> new Tuple2<>(r.getTarget(), r),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)));
Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)))
.cache();

Dataset<Tuple2<String, E>> entities = readPathEntity(spark, inputEntityPath, entityClazz)
.map((MapFunction<E, Tuple2<String, E>>) e -> new Tuple2<>(e.getId(), e),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(entityClazz)))
Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, entityClazz)
.map((MapFunction<E, RelatedEntity>) value -> asRelatedEntity(value, entityClazz), Encoders.bean(RelatedEntity.class))
.map((MapFunction<RelatedEntity, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), e),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)))
.cache();

relsByTarget
.joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner")
.filter((FilterFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, E>>>)
value -> value._2()._2().getDataInfo().getDeletedbyinference() == false)
.map((MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, E>>, EntityRelEntity>)
t -> new EntityRelEntity(t._1()._2(), GraphMappingUtils.asRelatedEntity(t._2()._2(), entityClazz)),
.map((MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, RelatedEntity>>, EntityRelEntity>)
t -> new EntityRelEntity(t._1()._2(), t._2()._2()),
Encoders.bean(EntityRelEntity.class))
.write()
.mode(SaveMode.Overwrite)
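Note: this hunk changes phase 1 so that entities are mapped to RelatedEntity before the join, instead of joining the full OAF entity and mapping afterwards. A condensed sketch of the resulting join shape is below; readPathRelation/readPathEntity are replaced by plain Dataset parameters, so it is an illustration of the shape, not the exact method.

// Sketch of the phase-1 join: relations keyed by target id, pre-mapped RelatedEntity keyed by id.
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import scala.Tuple2;

class Phase1Sketch {

    static Dataset<EntityRelEntity> joinByTarget(
            Dataset<SortableRelation> rels,
            Dataset<RelatedEntity> relatedEntities) {

        // key relations by target id, related entities by their own id
        Dataset<Tuple2<String, SortableRelation>> relsByTarget = rels
                .map((MapFunction<SortableRelation, Tuple2<String, SortableRelation>>) r -> new Tuple2<>(r.getTarget(), r),
                        Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)));

        Dataset<Tuple2<String, RelatedEntity>> entitiesById = relatedEntities
                .map((MapFunction<RelatedEntity, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), e),
                        Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)));

        // inner join on R.target = T.id, producing the [R - T] pairs saved by this job
        return relsByTarget
                .joinWith(entitiesById, entitiesById.col("_1").equalTo(relsByTarget.col("_1")), "inner")
                .map((MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, RelatedEntity>>, EntityRelEntity>)
                                t -> new EntityRelEntity(t._1()._2(), t._2()._2()),
                        Encoders.bean(EntityRelEntity.class));
    }
}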
@@ -6,11 +6,9 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.TypedRow;
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
@@ -25,12 +23,10 @@ import scala.collection.JavaConverters;
import scala.collection.Seq;

import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;

/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
@@ -45,24 +41,22 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) CreateRelatedEntitiesJob_phase1:
* prepare tuples [relation - target entity] (R - T):
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* join (R.target = E_i.id),
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
* save the tuples [R - T_i] in append mode
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) CreateRelatedEntitiesJob_phase2:
* prepare tuples [source entity - relation - target entity] (S - R - T):
* create the union of the each entity type, hash by id (S)
* for each [R - T_i] produced in phase1
* join S.id = [R - T_i].source to produce (S_i - R - T_i)
* save in append mode
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
*
* 5) XmlConverterJob:
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class CreateRelatedEntitiesJob_phase2 {
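Note: the phase-2 body is not part of this diff, only its imports and javadoc. Based on the description above (union of all entity types keyed by id, joined with the [R - T] tuples on R.source), a rough sketch of that join could look as follows; the setEntity(TypedRow) mutator and the keying of the inputs are assumptions inferred from the model classes in this diff.

// Hedged sketch of the phase-2 join described in the javadoc; not the project's actual code.
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.TypedRow;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import scala.Tuple2;

class Phase2Sketch {

    static Dataset<EntityRelEntity> joinBySource(
            Dataset<Tuple2<String, TypedRow>> entitiesById,          // union of all entity types, keyed by id
            Dataset<Tuple2<String, EntityRelEntity>> relsBySource) { // [R - T] tuples keyed by R.source

        return entitiesById
                .joinWith(relsBySource, entitiesById.col("_1").equalTo(relsBySource.col("_1")), "inner")
                .map((MapFunction<Tuple2<Tuple2<String, TypedRow>, Tuple2<String, EntityRelEntity>>, EntityRelEntity>) t -> {
                    EntityRelEntity srt = t._2()._2();
                    srt.setEntity(t._1()._2()); // attach the source entity S to the (R - T) pair; setter assumed
                    return srt;
                }, Encoders.bean(EntityRelEntity.class));
    }
}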
@@ -1,31 +1,22 @@
package eu.dnetlib.dhp.oa.provision;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.math.Ordering;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -44,14 +35,19 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* prepare tuples [source entity - relation - target entity] (S - R - T):
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* join (R.target = E_i.id),
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
* join (E_i.id = [R - T_i].source), where E_i becomes the source entity S
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
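Note: the imports in this file (SortableRelation, FlatMapGroupsFunction, Iterators) point at the relation-preparation step described in point 1) of the javadoc: keep only relations that are not virtually deleted and cap the number of relations per source. A sketch under that reading follows; getSource() on SortableRelation and the MAX_RELS constant are assumptions, the deletedbyinference check mirrors the filter used elsewhere in this diff.

// Hedged sketch of relation preparation: filter deleted relations, cap relations per source.
import com.google.common.collect.Iterators;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

class PrepareRelationsSketch {

    static final int MAX_RELS = 100; // "each entity can be linked at most to 100 other objects"

    static Dataset<SortableRelation> prepare(Dataset<SortableRelation> rels) {
        return rels
                // only keep relations that are not virtually deleted
                .filter((FilterFunction<SortableRelation>) r -> r.getDataInfo().getDeletedbyinference() == false)
                // group by source id and keep at most MAX_RELS relations per source
                .groupByKey((MapFunction<SortableRelation, String>) SortableRelation::getSource, Encoders.STRING())
                .flatMapGroups((FlatMapGroupsFunction<String, SortableRelation, SortableRelation>)
                                (source, it) -> Iterators.limit(it, MAX_RELS),
                        Encoders.kryo(SortableRelation.class));
    }
}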
@@ -1,5 +1,6 @@
package eu.dnetlib.dhp.oa.provision;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
@@ -27,8 +28,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -37,23 +41,25 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again
* by E, finally grouped by E.id;
*
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
* 1) PrepareRelationsJob:
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* prepare tuples [source entity - relation - target entity] (S - R - T):
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* join (R.target = E_i.id),
* map E_i as RelatedEntity T_i, extracting only the necessary information beforehand to produce [R - T_i]
* join (E_i.id = [R - T_i].source), where E_i becomes the source entity S
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mappnig the result as JoinedEntity
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
@@ -62,6 +68,8 @@ public class XmlConverterJob {

private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);

private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";

public static void main(String[] args) throws Exception {
@@ -107,12 +115,31 @@ public class XmlConverterJob {
spark.read()
.load(inputPath)
.as(Encoders.bean(JoinedEntity.class))
/* .map((MapFunction<JoinedEntity, String>) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING())
.write()
.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
.text("/tmp/json");

spark.read()
.textFile("/tmp/json")
.map((MapFunction<String, JoinedEntity>) value -> OBJECT_MAPPER.readValue(value, JoinedEntity.class), Encoders.bean(JoinedEntity.class))
.map((MapFunction<JoinedEntity, JoinedEntity>) j -> {
if (j.getLinks() != null) {
j.setLinks(j.getLinks()
.stream()
.filter(t -> t.getRelation() != null & t.getRelatedEntity() != null)
.collect(Collectors.toCollection(ArrayList::new)));
}
return j;
}, Encoders.bean(JoinedEntity.class))

*/
.map((MapFunction<JoinedEntity, Tuple2<String, String>>) je -> new Tuple2<>(
je.getEntity().getId(),
recordFactory.build(je)
), Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.javaRDD()
.mapToPair((PairFunction<Tuple2<String, String>, String, String>) t -> t)
.mapToPair((PairFunction<Tuple2<String, String>, Text, Text>) t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
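Note: the active path in this hunk renders each JoinedEntity to XML via the record factory and stores (id, xml) pairs as a gzip-compressed SequenceFile of Text. A condensed sketch of that tail end is below; the import path of XmlRecordFactory is assumed, the rest follows the Spark and Hadoop APIs used in the hunk.

// Sketch of the save path: JoinedEntity -> (id, xml) -> compressed SequenceFile of Text pairs.
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; // package assumed
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import scala.Tuple2;

class XmlSaveSketch {

    static void save(Dataset<JoinedEntity> joined, XmlRecordFactory recordFactory, String outputPath) {
        joined
                // render each joined entity as an (id, xml) pair
                .map((MapFunction<JoinedEntity, Tuple2<String, String>>) je ->
                                new Tuple2<>(je.getEntity().getId(), recordFactory.build(je)),
                        Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
                .javaRDD()
                // convert to writable Text pairs and store as a gzip-compressed SequenceFile
                .mapToPair((PairFunction<Tuple2<String, String>, Text, Text>) t ->
                        new Tuple2<>(new Text(t._1()), new Text(t._2())))
                .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
    }
}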
@@ -1,12 +1,13 @@
package eu.dnetlib.dhp.oa.provision.model;

import java.io.Serializable;
import java.util.List;

public class JoinedEntity implements Serializable {

private TypedRow entity;

private Links links;
private List<Tuple2> links;

public JoinedEntity() {
}
@@ -19,11 +20,11 @@ public class JoinedEntity implements Serializable {
this.entity = entity;
}

public Links getLinks() {
public List<Tuple2> getLinks() {
return links;
}

public void setLinks(Links links) {
public void setLinks(List<Tuple2> links) {
this.links = links;
}
}
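Note: the links field now holds the project's own Tuple2 pair type instead of the removed Links set. From the usages elsewhere in this diff (t.getRelation(), t.getRelatedEntity()), a minimal serializable pair bean would look roughly like the following; the Relation field type and the two-arg constructor are assumptions, not the actual class definition.

// Hedged sketch of the (relation, related entity) pair held by JoinedEntity.links.
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.schema.oaf.Relation; // field type assumed

import java.io.Serializable;

public class Tuple2 implements Serializable {

    private Relation relation;            // R
    private RelatedEntity relatedEntity;  // T

    public Tuple2() {
    }

    public Tuple2(Relation relation, RelatedEntity relatedEntity) { // convenience constructor, assumed
        this.relation = relation;
        this.relatedEntity = relatedEntity;
    }

    public Relation getRelation() {
        return relation;
    }

    public void setRelation(Relation relation) {
        this.relation = relation;
    }

    public RelatedEntity getRelatedEntity() {
        return relatedEntity;
    }

    public void setRelatedEntity(RelatedEntity relatedEntity) {
        this.relatedEntity = relatedEntity;
    }
}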
@@ -1,10 +0,0 @@
package eu.dnetlib.dhp.oa.provision.model;

import java.io.Serializable;
import java.util.HashSet;

public class Links extends HashSet<Tuple2> implements Serializable {

public Links() {
}
}
@@ -114,7 +114,7 @@ public class GraphMappingUtils {

final RelatedEntity re = new RelatedEntity();
re.setId(entity.getId());
re.setType(clazz.getName());
re.setType(EntityType.fromClass(clazz).name());

re.setPid(entity.getPid());
re.setCollectedfrom(entity.getCollectedfrom());
@@ -125,16 +125,16 @@ public class GraphMappingUtils {
case otherresearchproduct:
case software:

Result r = (Result) entity;
Result result = (Result) entity;

if (r.getTitle() == null && !r.getTitle().isEmpty()) {
re.setTitle(r.getTitle().stream().findFirst().get());
if (result.getTitle() == null && !result.getTitle().isEmpty()) {
re.setTitle(result.getTitle().stream().findFirst().get());
}

re.setDateofacceptance(getValue(r.getDateofacceptance()));
re.setPublisher(getValue(r.getPublisher()));
re.setResulttype(re.getResulttype());
re.setInstances(re.getInstances());
re.setDateofacceptance(getValue(result.getDateofacceptance()));
re.setPublisher(getValue(result.getPublisher()));
re.setResulttype(result.getResulttype());
re.setInstances(result.getInstance());

//TODO still to be mapped
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
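Note: both variants of the title guard above check `getTitle() == null && !getTitle().isEmpty()`, which would dereference a null list. A defensive sketch of the same mapping, assuming the intent is to copy the first title only when the list is present and non-empty, is shown below; the getValue(...) helpers are omitted and the field names follow the OAF classes used in this hunk.

// Hedged sketch of the result-to-RelatedEntity mapping with a null-and-empty title guard.
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.schema.oaf.Result;

class RelatedEntityTitleSketch {

    static void mapResultFields(Result result, RelatedEntity re) {
        // guard against a null list and an empty one before reading the first element
        if (result.getTitle() != null && !result.getTitle().isEmpty()) {
            re.setTitle(result.getTitle().stream().findFirst().get());
        }
        re.setResulttype(result.getResulttype());
        re.setInstances(result.getInstance());
    }
}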
@@ -694,7 +694,7 @@ public class XmlRecordFactory implements Serializable {
if (isNotBlank(re.getCodeRepositoryUrl())) {
metadata.add(XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
}
if (re.getResulttype() != null & !re.getResulttype().isBlank()) {
if (re.getResulttype() != null & re.getResulttype().isBlank()) {
metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
}
if (re.getCollectedfrom() != null) {
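Note: both versions of the resulttype guard above use the non-short-circuiting `&`. A conventional short-circuiting form of such a guard is sketched below; it assumes Qualifier.isBlank() reports an empty qualifier, as its usage above suggests, and is not presented as the project's intended condition.

// Hedged sketch of a short-circuiting guard for an optional qualifier.
import eu.dnetlib.dhp.schema.oaf.Qualifier;

class QualifierGuardSketch {

    static boolean isPopulated(Qualifier q) {
        // && short-circuits, so isBlank() is never called on a null qualifier
        return q != null && !q.isBlank();
    }
}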
@@ -75,7 +75,7 @@
<switch>
<case to="prepare_relations">${wf:conf('reuseRecords') eq false}</case>
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
<default to="adjancency_lists"/>
<default to="prepare_relations"/>
</switch>
</decision>
@@ -132,7 +132,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
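Note: this and the two following workflow hunks raise spark.sql.shuffle.partitions from 3840 to 7680 via the Oozie spark-opts. For reference, the same property can be set programmatically on the SparkConf used to build the session; a minimal sketch, with 7680 taken from this diff:

// Sketch only: programmatic equivalent of the --conf flag above.
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

class ShufflePartitionsSketch {

    static SparkSession session(String appName) {
        SparkConf conf = new SparkConf()
                .setAppName(appName)
                .set("spark.sql.shuffle.partitions", "7680"); // number of partitions used for SQL shuffles
        return SparkSession.builder().config(conf).getOrCreate();
    }
}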
@@ -324,7 +324,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
@@ -351,7 +351,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg> <arg>${workingDir}/join_entities</arg>
@@ -365,7 +365,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>build_adjacency_lists</name>
<name>convert_to_xml</name>
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>