forked from D-Net/dnet-hadoop
partial refactoring of some joins
This commit is contained in:
parent 8a3bc7c183
commit af2f7705fc
@@ -13,6 +13,7 @@ import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -52,18 +53,23 @@ public class PrepareRelatedDatasetsJob {
 
        ClusterUtils.removeDir(spark, relsPath);
 
-       final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = ClusterUtils
-           .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
+       final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
+           .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
+           .filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
+           .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
 
-       final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+       final Dataset<Relation> rels = ClusterUtils
+           .readPath(spark, graphPath + "/relation", Relation.class)
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
        rels
-           .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner")
+           .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
            .map(
                t -> new RelatedDataset(
                    t._1.getSource(),
                    t._1.getRelType(),
-                   ConversionUtils.oafDatasetToBrokerDataset(t._2)),
+                   t._2),
                Encoders.bean(RelatedDataset.class))
            .write()
            .mode(SaveMode.Overwrite)
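The recurring change in these jobs: the entity side is filtered for dedup roots and converted to its broker bean before the join, so joinWith now matches rels.col("target") against the bean's "openaireId" column (previously the Oaf "id"), and the tuple map simply forwards t._2 instead of converting per joined row. Below is a minimal, self-contained sketch of that joinWith pattern, with placeholder bean classes standing in for Relation and OaBrokerRelatedDataset; only the column names "target" and "openaireId" are taken from the diff, everything else is illustrative.

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class JoinSketch {

    // Stand-in for Relation: only the fields the join condition needs.
    public static class Rel implements Serializable {
        private String source;
        private String target;
        public String getSource() { return source; }
        public void setSource(final String source) { this.source = source; }
        public String getTarget() { return target; }
        public void setTarget(final String target) { this.target = target; }
    }

    // Stand-in for the broker bean, exposing the openaireId join key.
    public static class BrokerDs implements Serializable {
        private String openaireId;
        public String getOpenaireId() { return openaireId; }
        public void setOpenaireId(final String openaireId) { this.openaireId = openaireId; }
    }

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        final Rel rel = new Rel();
        rel.setSource("50|source");
        rel.setTarget("50|ds1");

        final BrokerDs ds = new BrokerDs();
        ds.setOpenaireId("50|ds1");

        final Dataset<Rel> rels = spark.createDataset(Arrays.asList(rel), Encoders.bean(Rel.class));
        final Dataset<BrokerDs> datasets = spark.createDataset(Arrays.asList(ds), Encoders.bean(BrokerDs.class));

        // Inner join: only relations whose target matches a known entity survive;
        // each surviving row is a Tuple2<Rel, BrokerDs>, as in the .map(t -> ...) above.
        rels
            .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
            .show(false);

        spark.stop();
    }
}

Moving the conversion ahead of the join also means it runs once per entity rather than once per joined relation, which appears to be the point of the refactoring.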
@@ -62,7 +62,9 @@ public class PrepareRelatedProjectsJob {
 
        final Dataset<Relation> rels = ClusterUtils
            .readPath(spark, graphPath + "/relation", Relation.class)
-           .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT));
+           .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
        rels
            .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner")
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -31,9 +32,8 @@ public class PrepareRelatedPublicationsJob {
    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
-               .toString(
-                   PrepareRelatedPublicationsJob.class
-                       .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+               .toString(PrepareRelatedPublicationsJob.class
+                   .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
        parser.parseArgument(args);
 
        final Boolean isSparkSessionManaged = Optional
@@ -57,19 +57,22 @@ public class PrepareRelatedPublicationsJob {
 
        ClusterUtils.removeDir(spark, relsPath);
 
-       final Dataset<Publication> pubs = ClusterUtils
-           .readPath(spark, graphPath + "/publication", Publication.class);
+       final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
+           .readPath(spark, graphPath + "/publication", Publication.class)
+           .filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
+           .map(ConversionUtils::oafPublicationToBrokerPublication, Encoders.bean(OaBrokerRelatedPublication.class));
 
-       final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+       final Dataset<Relation> rels = ClusterUtils
+           .readPath(spark, graphPath + "/relation", Relation.class)
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
        rels
-           .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner")
-           .map(
-               t -> new RelatedPublication(
-                   t._1.getSource(),
-                   t._1.getRelType(),
-                   ConversionUtils.oafPublicationToBrokerPublication(t._2)),
-               Encoders.bean(RelatedPublication.class))
+           .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
+           .map(t -> new RelatedPublication(
+               t._1.getSource(),
+               t._1.getRelType(),
+               t._2), Encoders.bean(RelatedPublication.class))
            .write()
            .mode(SaveMode.Overwrite)
            .json(relsPath);
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -31,9 +32,8 @@ public class PrepareRelatedSoftwaresJob {
    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
-               .toString(
-                   PrepareRelatedSoftwaresJob.class
-                       .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+               .toString(PrepareRelatedSoftwaresJob.class
+                   .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
        parser.parseArgument(args);
 
        final Boolean isSparkSessionManaged = Optional
@@ -57,18 +57,22 @@ public class PrepareRelatedSoftwaresJob {
 
        ClusterUtils.removeDir(spark, relsPath);
 
-       final Dataset<Software> softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class);
+       final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
+           .readPath(spark, graphPath + "/software", Software.class)
+           .filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
+           .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
 
-       final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+       final Dataset<Relation> rels = ClusterUtils
+           .readPath(spark, graphPath + "/relation", Relation.class)
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
        rels
-           .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner")
-           .map(
-               t -> new RelatedSoftware(
-                   t._1.getSource(),
-                   t._1.getRelType(),
-                   ConversionUtils.oafSoftwareToBrokerSoftware(t._2)),
-               Encoders.bean(RelatedSoftware.class))
+           .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
+           .map(t -> new RelatedSoftware(
+               t._1.getSource(),
+               t._1.getRelType(),
+               t._2), Encoders.bean(RelatedSoftware.class))
            .write()
            .mode(SaveMode.Overwrite)
            .json(relsPath);
@@ -74,6 +74,7 @@ public class PrepareSimpleEntititiesJob {
 
        return ClusterUtils
            .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
+           .filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
            .filter(r -> r.getDataInfo().getDeletedbyinference())
            .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
    }
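The same typed-conversion idiom recurs across these jobs: Dataset.map paired with Encoders.bean turns a Dataset of one bean type into another. Below is a self-contained sketch with placeholder types (not the dnet-hadoop classes); the MapFunction cast mirrors the one in ClusterUtils.readPath in the next hunk, where it selects the typed Java overload of map.

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class MapWithEncoderSketch {

    // Placeholder input bean.
    public static class Source implements Serializable {
        private String id;
        public String getId() { return id; }
        public void setId(final String id) { this.id = id; }
    }

    // Placeholder output bean, echoing the broker-side naming.
    public static class Target implements Serializable {
        private String openaireId;
        public String getOpenaireId() { return openaireId; }
        public void setOpenaireId(final String openaireId) { this.openaireId = openaireId; }
    }

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        final Source src = new Source();
        src.setId("50|example");

        final Dataset<Source> in = spark.createDataset(Arrays.asList(src), Encoders.bean(Source.class));

        // The cast disambiguates map(MapFunction, Encoder) from the Scala overload.
        final Dataset<Target> out = in.map((MapFunction<Source, Target>) s -> {
            final Target t = new Target();
            t.setOpenaireId(s.getId());
            return t;
        }, Encoders.bean(Target.class));

        out.show(false);
        spark.stop();
    }
}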
@@ -32,4 +32,8 @@ public class ClusterUtils {
            .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
    }
 
+   public static boolean isDedupRoot(final String id) {
+       return id.contains("dedup_wf_");
+   }
+
 }
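The new helper just looks for the dedup-workflow marker inside an identifier; every .filter(... !ClusterUtils.isDedupRoot(...)) call above drops such records before the join. A behavioral sketch, with sample ids invented for illustration:

public class IsDedupRootSketch {

    // Same logic as the ClusterUtils method added above.
    public static boolean isDedupRoot(final String id) {
        return id.contains("dedup_wf_");
    }

    public static void main(final String[] args) {
        System.out.println(isDedupRoot("50|dedup_wf_001::0123abcd"));   // true  -> filtered out
        System.out.println(isDedupRoot("50|someDatasource::0123abcd")); // false -> kept
    }
}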
@@ -88,11 +88,11 @@
        </action>
 
        <fork name="start_entities_and_rels">
            <path start="prepare_simple_entities"/>
+           <path start="prepare_related_softwares"/>
            <path start="prepare_related_datasets"/>
            <path start="prepare_related_projects"/>
            <path start="prepare_related_publications"/>
-           <path start="prepare_related_softwares"/>
        </fork>
 
        <action name="prepare_simple_entities">
@@ -119,6 +119,7 @@
            <error to="Kill"/>
        </action>
 
+
        <action name="prepare_related_datasets">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn</master>
@@ -0,0 +1,18 @@
+<configuration>
+   <property>
+       <name>jobTracker</name>
+       <value>yarnRM</value>
+   </property>
+   <property>
+       <name>nameNode</name>
+       <value>hdfs://nameservice1</value>
+   </property>
+   <property>
+       <name>oozie.use.system.libpath</name>
+       <value>true</value>
+   </property>
+   <property>
+       <name>oozie.action.sharelib.for.spark</name>
+       <value>spark2</value>
+   </property>
+</configuration>
@@ -0,0 +1,215 @@
+<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
+
+   <parameters>
+       <property>
+           <name>graphInputPath</name>
+           <description>the path where the graph is stored</description>
+       </property>
+       <property>
+           <name>workingPath</name>
+           <description>the path where the generated data will be stored</description>
+       </property>
+       <property>
+           <name>isLookupUrl</name>
+           <description>the address of the lookUp service</description>
+       </property>
+       <property>
+           <name>dedupConfProfId</name>
+           <description>the id of a valid Dedup Configuration Profile</description>
+       </property>
+
+       <property>
+           <name>sparkDriverMemory</name>
+           <description>memory for driver process</description>
+       </property>
+       <property>
+           <name>sparkExecutorMemory</name>
+           <description>memory for individual executor</description>
+       </property>
+       <property>
+           <name>sparkExecutorCores</name>
+           <description>number of cores used by single executor</description>
+       </property>
+       <property>
+           <name>oozieActionShareLibForSpark2</name>
+           <description>oozie action sharelib for spark 2.*</description>
+       </property>
+       <property>
+           <name>spark2ExtraListeners</name>
+           <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+           <description>spark 2.* extra listeners classname</description>
+       </property>
+       <property>
+           <name>spark2SqlQueryExecutionListeners</name>
+           <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+           <description>spark 2.* sql query execution listeners classname</description>
+       </property>
+       <property>
+           <name>spark2YarnHistoryServerAddress</name>
+           <description>spark 2.* yarn history server address</description>
+       </property>
+       <property>
+           <name>spark2EventLogDir</name>
+           <description>spark 2.* event log dir location</description>
+       </property>
+   </parameters>
+
+   <global>
+       <job-tracker>${jobTracker}</job-tracker>
+       <name-node>${nameNode}</name-node>
+       <configuration>
+           <property>
+               <name>mapreduce.job.queuename</name>
+               <value>${queueName}</value>
+           </property>
+           <property>
+               <name>oozie.launcher.mapred.job.queue.name</name>
+               <value>${oozieLauncherQueueName}</value>
+           </property>
+           <property>
+               <name>oozie.action.sharelib.for.spark</name>
+               <value>${oozieActionShareLibForSpark2}</value>
+           </property>
+       </configuration>
+   </global>
+
+   <start to="ensure_working_path"/>
+
+   <kill name="Kill">
+       <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+   </kill>
+
+   <action name="ensure_working_path">
+       <fs>
+           <mkdir path='${workingPath}'/>
+       </fs>
+       <ok to="prepare_related_publications"/>
+       <error to="Kill"/>
+   </action>
+
+   <action name="prepare_related_publications">
+       <spark xmlns="uri:oozie:spark-action:0.2">
+           <master>yarn</master>
+           <mode>cluster</mode>
+           <name>PrepareRelatedPublicationsJob</name>
+           <class>eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob</class>
+           <jar>dhp-broker-events-${projectVersion}.jar</jar>
+           <spark-opts>
+               --executor-cores=${sparkExecutorCores}
+               --executor-memory=${sparkExecutorMemory}
+               --driver-memory=${sparkDriverMemory}
+               --conf spark.extraListeners=${spark2ExtraListeners}
+               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+               --conf spark.sql.shuffle.partitions=3840
+           </spark-opts>
+           <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+           <arg>--workingPath</arg><arg>${workingPath}</arg>
+       </spark>
+       <ok to="prepare_related_datasets"/>
+       <error to="Kill"/>
+   </action>
+
+
+   <action name="prepare_related_datasets">
+       <spark xmlns="uri:oozie:spark-action:0.2">
+           <master>yarn</master>
+           <mode>cluster</mode>
+           <name>PrepareRelatedDatasetsJob</name>
+           <class>eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob</class>
+           <jar>dhp-broker-events-${projectVersion}.jar</jar>
+           <spark-opts>
+               --executor-cores=${sparkExecutorCores}
+               --executor-memory=${sparkExecutorMemory}
+               --driver-memory=${sparkDriverMemory}
+               --conf spark.extraListeners=${spark2ExtraListeners}
+               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+               --conf spark.sql.shuffle.partitions=3840
+           </spark-opts>
+           <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+           <arg>--workingPath</arg><arg>${workingPath}</arg>
+       </spark>
+       <ok to="join_entities"/>
+       <error to="Kill"/>
+   </action>
+
+   <action name="join_entities">
+       <spark xmlns="uri:oozie:spark-action:0.2">
+           <master>yarn</master>
+           <mode>cluster</mode>
+           <name>JoinEntitiesJob</name>
+           <class>eu.dnetlib.dhp.broker.oa.JoinEntitiesJob</class>
+           <jar>dhp-broker-events-${projectVersion}.jar</jar>
+           <spark-opts>
+               --executor-cores=${sparkExecutorCores}
+               --executor-memory=${sparkExecutorMemory}
+               --driver-memory=${sparkDriverMemory}
+               --conf spark.extraListeners=${spark2ExtraListeners}
+               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+               --conf spark.sql.shuffle.partitions=3840
+           </spark-opts>
+           <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+           <arg>--workingPath</arg><arg>${workingPath}</arg>
+       </spark>
+       <ok to="prepare_groups"/>
+       <error to="Kill"/>
+   </action>
+
+   <action name="prepare_groups">
+       <spark xmlns="uri:oozie:spark-action:0.2">
+           <master>yarn</master>
+           <mode>cluster</mode>
+           <name>PrepareGroupsJob</name>
+           <class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
+           <jar>dhp-broker-events-${projectVersion}.jar</jar>
+           <spark-opts>
+               --executor-cores=${sparkExecutorCores}
+               --executor-memory=${sparkExecutorMemory}
+               --driver-memory=${sparkDriverMemory}
+               --conf spark.extraListeners=${spark2ExtraListeners}
+               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+               --conf spark.sql.shuffle.partitions=3840
+           </spark-opts>
+           <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+           <arg>--workingPath</arg><arg>${workingPath}</arg>
+       </spark>
+       <ok to="generate_events"/>
+       <error to="Kill"/>
+   </action>
+
+   <action name="generate_events">
+       <spark xmlns="uri:oozie:spark-action:0.2">
+           <master>yarn</master>
+           <mode>cluster</mode>
+           <name>GenerateEventsJob</name>
+           <class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
+           <jar>dhp-broker-events-${projectVersion}.jar</jar>
+           <spark-opts>
+               --executor-cores=${sparkExecutorCores}
+               --executor-memory=${sparkExecutorMemory}
+               --driver-memory=${sparkDriverMemory}
+               --conf spark.extraListeners=${spark2ExtraListeners}
+               --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+               --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+               --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+               --conf spark.sql.shuffle.partitions=3840
+           </spark-opts>
+           <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+           <arg>--workingPath</arg><arg>${workingPath}</arg>
+           <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+           <arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
+       </spark>
+       <ok to="End"/>
+       <error to="Kill"/>
+   </action>
+
+   <end name="End"/>
+
+</workflow-app>