forked from D-Net/dnet-hadoop

fixed bug on missing relation in ANDS

parent 3581244daf
commit cd27df91a1
QueryTest (Scala, package eu.dnetlib.dhp.doiboost):

@@ -1,6 +1,6 @@
 package eu.dnetlib.dhp.doiboost
 
-import eu.dnetlib.dhp.schema.oaf.Publication
+import eu.dnetlib.dhp.schema.oaf.{Publication, Relation}
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
@@ -21,6 +21,13 @@ class QueryTest {
 
 
 
+  }
+
+
+  def has_ands(r:Relation) :Boolean = {
+
+    r.getCollectedfrom!= null && r.getCollectedfrom.asScala.count(k => k.getValue.contains("Australian")) > 0
+
   }
 
   def hasInstanceWithUrl(p:Publication):Boolean = {
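The new has_ands predicate walks the Java collectedfrom list via scala.collection.JavaConverters and flags a relation as coming from ANDS when any collectedfrom value mentions "Australian". A minimal sketch of how such a predicate could be applied to a stored relation dataset follows; the object name, the relationPath parameter, and the use of exists (plus an extra null check on the value) are illustrative assumptions, not part of the commit.

import scala.collection.JavaConverters._

import eu.dnetlib.dhp.schema.oaf.Relation
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

// Hypothetical helper: count how many relations in a kryo-encoded parquet
// dataset were collected from the Australian National Data Service.
object AndsRelationCheck {

  def hasAnds(r: Relation): Boolean =
    r.getCollectedfrom != null &&
      r.getCollectedfrom.asScala.exists(k => k.getValue != null && k.getValue.contains("Australian"))

  def countAndsRelations(spark: SparkSession, relationPath: String): Long = {
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    val rels: Dataset[Relation] = spark.read.load(relationPath).as[Relation]
    rels.filter(r => hasAnds(r)).count()
  }
}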
CleaningFunctions (Java):

@@ -109,20 +109,20 @@ public class CleaningFunctions {
 			}
 			if (Objects.nonNull(r.getPid())) {
 				r
 					.setPid(
 						r
 							.getPid()
 							.stream()
 							.filter(Objects::nonNull)
 							.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
 							.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
 							.filter(sp -> Objects.nonNull(sp.getQualifier()))
 							.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
 							.map(sp -> {
 								sp.setValue(StringUtils.trim(sp.getValue()));
 								return sp;
 							})
 							.collect(Collectors.toList()));
 			}
 			if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
 				r
New file — IdReplace case class (package eu.dnetlib.dhp.sx.graph):

@@ -0,0 +1,3 @@
+package eu.dnetlib.dhp.sx.graph
+
+case class IdReplace(newId:String, oldId:String) {}
SparkSplitOafTODLIEntities (Scala, package eu.dnetlib.dhp.sx.graph):

@@ -1,12 +1,15 @@
 package eu.dnetlib.dhp.sx.graph
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation}
+import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
 import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
 import eu.dnetlib.dhp.sx.ebi.EBIAggregator
 import org.apache.commons.io.IOUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.LoggerFactory
+import org.apache.spark.sql.functions.col
 
 
 object SparkSplitOafTODLIEntities {
 
@@ -83,14 +86,42 @@ object SparkSplitOafTODLIEntities {
  }
 
 
+  def extract_ids(o:Oaf) :(String, String) = {
+
+    o match {
+      case p: DLIPublication =>
+        val prefix = StringUtils.substringBefore(p.getId, "|")
+        val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
+        (p.getId, s"$prefix|$original")
+      case p: DLIDataset =>
+        val prefix = StringUtils.substringBefore(p.getId, "|")
+        val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
+        (p.getId, s"$prefix|$original")
+      case _ =>null
+    }
+  }
 
  def extract_relations(spark:SparkSession, workingPath:String) :Unit = {
 
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
+   import spark.implicits._
 
    val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
    val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000)
 
 
+    OAFDataset
+      .filter(o => o.isInstanceOf[Result])
+      .map(extract_ids)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+      .filter(r => r != null)
+      .where("_1 != _2")
+      .select(col("_1").alias("newId"), col("_2").alias("oldId"))
+      .distinct()
+      .map(f => IdReplace(f.getString(0), f.getString(1)))
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/id_replace")
+
+
    OAFDataset
      .filter(s => s != null && s.isInstanceOf[Relation])
      .map(s =>s.asInstanceOf[Relation])
@@ -100,7 +131,41 @@ object SparkSplitOafTODLIEntities {
      .agg(EBIAggregator.getRelationAggregator().toColumn)
      .map(p => p._2)
      .repartition(4000)
-      .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation_unfixed")
+
+
+    val relations = spark.read.load(s"$workingPath/graph/relation_unfixed").as[Relation]
+    val ids = spark.read.load(s"$workingPath/graph/id_replace").as[IdReplace]
+
+    relations
+      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
+      .joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
+      .map(i =>{
+        val r = i._1._2
+        if (i._2 != null)
+        {
+          val id = i._2.newId
+          r.setSource(id)
+        }
+        r
+      }).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/rel_f_source")
+
+    val rel_source:Dataset[Relation] = spark.read.load(s"$workingPath/graph/rel_f_source").as[Relation]
+
+    rel_source
+      .map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoder))
+      .joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
+      .map(i =>{
+        val r:Relation = i._1._2
+        if (i._2 != null)
+        {
+          val id = i._2.newId
+          r.setTarget(id)
+        }
+        r
+      }).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
+
+
  }
 
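The heart of the fix is the id_replace step above: extract_ids emits (newId, oldId) pairs for every DLIPublication or DLIDataset whose resolved graph id differs from its original object identifier, the relations are first materialized as relation_unfixed, and source and target are then rewritten through a left join against those pairs. Below is a self-contained sketch of that join on toy case classes; the names Rel and IdReplaceSketch and the sample identifiers are made up for illustration and only mirror the pattern used in extract_relations.

import org.apache.spark.sql.{Dataset, SparkSession}

object IdReplaceSketch {

  case class Rel(source: String, target: String)
  case class IdReplace(newId: String, oldId: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("IdReplaceSketch").getOrCreate()
    import spark.implicits._

    // Relations still carrying the "original object identifier" form of the id.
    val relations: Dataset[Rel] = Seq(Rel("ands|old-abc", "ands|old-xyz")).toDS()
    // Mapping from the old identifier to the resolved graph identifier.
    val ids: Dataset[IdReplace] = Seq(IdReplace("ands|resolved-abc", "ands|old-abc")).toDS()

    // Left join on the source id and rewrite it when a replacement exists;
    // the production code repeats the same step for the target id.
    val fixed: Dataset[Rel] = relations
      .joinWith(ids, relations("source") === ids("oldId"), "left")
      .map { case (rel, repl) => if (repl != null) rel.copy(source = repl.newId) else rel }

    fixed.show(truncate = false)
    spark.stop()
  }
}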
ScholexplorerParserTest (Java):

@@ -12,6 +12,7 @@ import com.fasterxml.jackson.databind.SerializationFeature;
 
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
+import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
 import eu.dnetlib.scholexplorer.relation.RelationMapper;
 
 public class ScholexplorerParserTest {
@@ -37,4 +38,26 @@ public class ScholexplorerParserTest {
                 }
             });
     }
+
+    @Test
+    public void testPublicationParser() throws Exception {
+        String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml"));
+
+        PublicationScholexplorerParser p = new PublicationScholexplorerParser();
+        List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
+
+        ObjectMapper m = new ObjectMapper();
+        m.enable(SerializationFeature.INDENT_OUTPUT);
+
+        oaves
+            .forEach(
+                oaf -> {
+                    try {
+                        System.out.println(m.writeValueAsString(oaf));
+                        System.out.println("----------------------------");
+                    } catch (JsonProcessingException e) {
+
+                    }
+                });
+    }
 }
Dataset record XML (Scholexplorer test resource):

@@ -1,51 +1,38 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
-            xmlns:oaf="http://namespace.openaire.eu/oaf"
-            xmlns:dri="http://www.driver-repository.eu/namespace/dri"
-            xmlns:dc="http://purl.org/dc/elements/1.1/">
-    <oai:header>
-        <dri:repositoryId>aaadf8b3-01a8-4cc2-9964-63cfb19df3b4_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
-        <dri:recordIdentifier>oai:pangaea.de:doi:10.1594/PANGAEA.432865</dri:recordIdentifier>
-        <dri:datasourceprefix>r3d100010134</dri:datasourceprefix>
-        <dri:objIdentifier>r3d100010134::00002f60593fd1f758fb838fafb46795</dri:objIdentifier>
-        <dri:dateOfCollection>2020-02-18T03:05:02.534Z</dri:dateOfCollection>
-        <oaf:datasourceprefix/>
-        <identifier>oai:pangaea.de:doi:10.1594/PANGAEA.432865</identifier>
-        <setSpec>citable topicOceans</setSpec>
+            xmlns="http://namespace.openaire.eu/">
+    <oai:header xmlns="">
+        <dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464::0002882a9d38c4f4612e7666ad768ccd</dri:objIdentifier>
+        <dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce::url</dri:recordIdentifier>
+        <dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2020-11-02T16:14:07.831Z</dri:dateOfCollection>
+        <dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri">ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
+        <dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464</dri:datasourceprefix>
     </oai:header>
-    <oai:metadata>
-        <resource xmlns="http://datacite.org/schema/kernel-3">
-            <identifier identifierType="doi">10.1594/pangaea.432865</identifier>
-            <titles xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <title>Daily sea level from coastal tide gauge station Woods_Hole in 1978 (Research quality database)</title>
+    <metadata xmlns="">
+        <resource xmlns="http://datacite.org/schema/kernel-3"
+                  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                  xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
+            <identifier xmlns="" identifierType="url">https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce</identifier>
+            <titles xmlns="">
+                <title>Vertebrate monitoring in the Australian Wet Tropics rainforest at CU6A1 (145.30367623, -16.57767628, 600.0m above MSL) collected by Reptile Surveys</title>
             </titles>
-            <publisher xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">PANGAEA - Data Publisher for Earth & Environmental Science</publisher>
-            <publicationYear xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">2006</publicationYear>
-            <dates xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <date dateType="Collected">1978-01-01T12:00:00/1978-12-31T12:00:00</date>
+            <publisher xmlns="">James Cook University</publisher>
+            <dates xmlns="">
+                <date dateType="Collected">2013-05-07</date>
             </dates>
-            <creators xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <creator>
-                    <creatorName>WOCE Sea Level, WSL</creatorName>
-                </creator>
-            </creators>
-            <subjects xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <subject subjectScheme="Parameter">DATE/TIME</subject>
-                <subject subjectScheme="Parameter">Sea level</subject>
-                <subject subjectScheme="Method">Tide gauge station</subject>
-                <subject subjectScheme="Campaign">SeaLevel</subject>
-                <subject subjectScheme="Project">World Ocean Circulation Experiment (WOCE)</subject>
-            </subjects>
-            <resourceType resourceTypeGeneral="Dataset"/>
-            <relatedIdentifiers>
-                <relatedIdentifier relatedIdentifierType="URL" relationType="isDocumentedBy"
-                                   inverseRelationType="documents">http://store.pangaea.de/Projects/WOCE/SeaLevel_rqds/Woods_Hole.txt</relatedIdentifier>
+            <creators xmlns=""/>
+            <resourceType xmlns="" resourceTypeGeneral="Dataset">Dataset</resourceType>
+            <relatedIdentifiers xmlns="">
+                <relatedIdentifier entityType="publication" inverseRelationType="related"
+                                   relatedIdentifierType="dnet"
+                                   relationType="IsRelatedTo">r3d100010464::57793c5aa995172db237d9da17353f8b</relatedIdentifier>
             </relatedIdentifiers>
         </resource>
-    </oai:metadata>
-    <oaf:about>
+    </metadata>
+    <oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
         <oaf:datainfo>
-            <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010134" name="Pangaea"/>
+            <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464"
+                               name="Australian National Data Service"/>
             <oaf:completionStatus>complete</oaf:completionStatus>
             <oaf:provisionMode>collected</oaf:provisionMode>
         </oaf:datainfo>
New publication record XML (ANDS):

@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
+            xmlns="http://namespace.openaire.eu/">
+    <oai:header xmlns="">
+        <dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464::57793c5aa995172db237d9da17353f8b</dri:objIdentifier>
+        <dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">10.1111/j.1365-2486.2005.00995.x::doi</dri:recordIdentifier>
+        <dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2020-11-02T16:14:07.831Z</dri:dateOfCollection>
+        <dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri">ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
+        <dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464</dri:datasourceprefix>
+    </oai:header>
+    <metadata xmlns="">
+        <oaf:pid xmlns:oaf="http://namespace.dnet.eu/oaf" type="doi">10.1111/j.1365-2486.2005.00995.x</oaf:pid>
+        <dc:identifier xmlns:dc="http://purl.org/dc/elements/1.1/">10.1111/j.1365-2486.2005.00995.x</dc:identifier>
+        <dc:title xmlns:dc="http://purl.org/dc/elements/1.1/">Potential decoupling of trends in distribution area and population size of species with climate change.</dc:title>
+        <dc:type xmlns:dc="http://purl.org/dc/elements/1.1/">publication</dc:type>
+    </metadata>
+    <oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
+        <oaf:datainfo>
+            <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464"
+                               name="Australian National Data Service"/>
+            <oaf:completionStatus>complete</oaf:completionStatus>
+            <oaf:provisionMode>collected</oaf:provisionMode>
+        </oaf:datainfo>
+    </oaf:about>
+</oai:record>
Scholix (Java, merge helpers):

@@ -97,12 +97,17 @@ public class Scholix implements Serializable {
     }
 
     private List<ScholixEntityId> mergeScholixEntityId(final List<ScholixEntityId> a, final List<ScholixEntityId> b) {
-        final List<ScholixEntityId> m = new ArrayList<>(a);
+        final List<ScholixEntityId> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
         if (b != null)
             b.forEach(s -> {
-                int tt = (int) m.stream().filter(t -> t.getName().equalsIgnoreCase(s.getName())).count();
-                if (tt == 0) {
-                    m.add(s);
+                if (s != null) {
+                    int tt = (int) m
+                        .stream()
+                        .filter(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName()))
+                        .count();
+                    if (tt == 0) {
+                        m.add(s);
+                    }
                 }
             });
         return m;
@@ -110,7 +115,7 @@ public class Scholix implements Serializable {
 
     private List<ScholixIdentifier> mergeScholixIdnetifier(final List<ScholixIdentifier> a,
         final List<ScholixIdentifier> b) {
-        final List<ScholixIdentifier> m = new ArrayList<>(a);
+        final List<ScholixIdentifier> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
         if (b != null)
             b.forEach(s -> {
                 int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count();
@@ -123,7 +128,7 @@ public class Scholix implements Serializable {
 
     private List<ScholixCollectedFrom> mergeScholixCollectedFrom(final List<ScholixCollectedFrom> a,
         final List<ScholixCollectedFrom> b) {
-        final List<ScholixCollectedFrom> m = new ArrayList<>(a);
+        final List<ScholixCollectedFrom> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
         if (b != null)
             b.forEach(s -> {
                 int tt = (int) m
@@ -139,14 +144,15 @@ public class Scholix implements Serializable {
 
     private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) {
         ScholixRelationship result = new ScholixRelationship();
-        result.setName(StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
-        result.setInverse(StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
-        result.setSchema(StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
+        result.setName(a == null || StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
+        result.setInverse(a == null || StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
+        result.setSchema(a == null || StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
         return result;
     }
 
     private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) {
+        if (a == null)
+            return b;
         final ScholixResource result = new ScholixResource();
         result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom()));
         result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator()));
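All of the Scholix changes apply the same defensive rule: tolerate a null list or entry on either side of a merge and skip entries whose name cannot be compared. A rough Scala sketch of that rule, using plain strings in place of ScholixEntityId (the object name and sample values are illustrative only):

object NullSafeMergeSketch {

  // Merge two possibly-null lists, adding entries from b only when no
  // case-insensitively equal entry is already present in a.
  def mergeByName(a: List[String], b: List[String]): List[String] = {
    val base = Option(a).getOrElse(Nil)
    Option(b).getOrElse(Nil).foldLeft(base) { (acc, s) =>
      if (s != null && !acc.exists(t => t != null && t.equalsIgnoreCase(s))) acc :+ s
      else acc
    }
  }

  def main(args: Array[String]): Unit = {
    println(mergeByName(List("ANDS", null), List("ands", "Pangaea", null)))
    // prints: List(ANDS, null, Pangaea)
  }
}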
Oozie job configuration:

@@ -7,4 +7,8 @@
         <name>oozie.action.sharelib.for.spark</name>
         <value>spark2</value>
     </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
 </configuration>
Oozie workflow (Index graph to ElasticSearch):

@@ -1,9 +1,17 @@
-<workflow-app name="Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>workingDirPath</name>
             <description>the source path</description>
         </property>
+        <property>
+            <name>index</name>
+            <description>the index name</description>
+        </property>
+        <property>
+            <name>esCluster</name>
+            <description>the Index cluster</description>
+        </property>
         <property>
             <name>sparkDriverMemory</name>
             <description>memory for driver process</description>
@@ -12,39 +20,43 @@
             <name>sparkExecutorMemory</name>
             <description>memory for individual executor</description>
         </property>
-        <property>
-            <name>index</name>
-            <description>index name</description>
-        </property>
-
-        <property>
-            <name>indexHost</name>
-            <description>index host name</description>
-        </property>
     </parameters>
 
-    <start to="indexSummary"/>
+    <start to="DropAndCreateIndex"/>
 
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
 
+    <action name="DropAndCreateIndex">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.dhp.provision.DropAndCreateESIndex</main-class>
+            <arg>-i</arg><arg>${index}</arg>
+            <arg>-c</arg><arg>${esCluster}</arg>
+        </java>
+        <ok to="indexSummary"/>
+        <error to="Kill"/>
+    </action>
+
     <action name="indexSummary">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <master>yarn-cluster</master>
             <mode>cluster</mode>
-            <name>index Summary</name>
+            <name>index summary</name>
             <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
             <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
+            <arg>--sourcePath</arg><arg>${workingDirPath}/summary_json</arg>
             <arg>--index</arg><arg>${index}_object</arg>
-            <arg>--esHost</arg><arg>${indexHost}</arg>
             <arg>--idPath</arg><arg>id</arg>
-            <arg>--type</arg><arg>summary</arg>
+            <arg>--cluster</arg><arg>${esCluster}</arg>
         </spark>
         <ok to="indexScholix"/>
         <error to="Kill"/>
@@ -63,9 +75,8 @@
             <arg>-mt</arg> <arg>yarn-cluster</arg>
             <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
             <arg>--index</arg><arg>${index}_scholix</arg>
-            <arg>--esHost</arg><arg>${indexHost}</arg>
             <arg>--idPath</arg><arg>identifier</arg>
-            <arg>--type</arg><arg>scholix</arg>
+            <arg>--cluster</arg><arg>${esCluster}</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
@@ -112,59 +112,5 @@
         <error to="Kill"/>
     </action>
 
-    <action name="DropAndCreateIndex">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.provision.DropAndCreateESIndex</main-class>
-            <arg>-i</arg><arg>${index}</arg>
-            <arg>-c</arg><arg>${esCluster}</arg>
-        </java>
-        <ok to="indexSummary"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="indexSummary">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>index summary</name>
-            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
-            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/summary_json</arg>
-            <arg>--index</arg><arg>${index}_object</arg>
-            <arg>--idPath</arg><arg>id</arg>
-            <arg>--cluster</arg><arg>${esCluster}</arg>
-        </spark>
-        <ok to="indexScholix"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="indexScholix">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>index scholix</name>
-            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
-            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
-            <arg>--index</arg><arg>${index}_scholix</arg>
-            <arg>--idPath</arg><arg>identifier</arg>
-            <arg>--cluster</arg><arg>${esCluster}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
     <end name="End"/>
 </workflow-app>