forked from D-Net/dnet-hadoop

partial refactoring of some joins

This commit is contained in:
parent 8a3bc7c183
commit af2f7705fc
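Each Prepare*Job touched below follows the same refactored shape: the OAF entities are filtered to drop dedup roots, converted to their OaBroker* beans before the join, and the join key moves from the OAF `id` column to the broker-side `openaireId` bean property. A condensed sketch of the resulting pipeline for the dataset case (types and helpers are the ones referenced in the hunks below; surrounding class, imports and argument parsing are omitted, so this is an illustration rather than the verbatim job code):

// Sketch only: the end state of the PrepareRelatedDatasetsJob pipeline after this commit.
final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
    .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
    .filter(d -> !ClusterUtils.isDedupRoot(d.getId()))     // skip dedup roots
    .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));

final Dataset<Relation> rels = ClusterUtils
    .readPath(spark, graphPath + "/relation", Relation.class)
    .filter(r -> !ClusterUtils.isDedupRoot(r.getSource())) // drop relations touching dedup roots
    .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));

rels
    // join on the broker-side bean property instead of the OAF id column
    .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
    .map(
        t -> new RelatedDataset(t._1.getSource(), t._1.getRelType(), t._2),
        Encoders.bean(RelatedDataset.class))
    .write()
    .mode(SaveMode.Overwrite)
    .json(relsPath);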
@@ -13,6 +13,7 @@ import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -52,18 +53,23 @@ public class PrepareRelatedDatasetsJob {
 
         ClusterUtils.removeDir(spark, relsPath);
 
-        final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = ClusterUtils
-            .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
+        final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
+            .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
+            .filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
+            .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
 
-        final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+        final Dataset<Relation> rels = ClusterUtils
+            .readPath(spark, graphPath + "/relation", Relation.class)
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
         rels
-            .joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner")
+            .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
             .map(
                 t -> new RelatedDataset(
                     t._1.getSource(),
                     t._1.getRelType(),
-                    ConversionUtils.oafDatasetToBrokerDataset(t._2)),
+                    t._2),
                 Encoders.bean(RelatedDataset.class))
             .write()
             .mode(SaveMode.Overwrite)
@@ -62,7 +62,9 @@ public class PrepareRelatedProjectsJob {
 
         final Dataset<Relation> rels = ClusterUtils
             .readPath(spark, graphPath + "/relation", Relation.class)
-            .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT));
+            .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
         rels
             .joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner")
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -31,9 +32,8 @@ public class PrepareRelatedPublicationsJob {
     public static void main(final String[] args) throws Exception {
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(
             IOUtils
-                .toString(
-                    PrepareRelatedPublicationsJob.class
-                        .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+                .toString(PrepareRelatedPublicationsJob.class
+                    .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
         parser.parseArgument(args);
 
         final Boolean isSparkSessionManaged = Optional
@@ -57,19 +57,22 @@ public class PrepareRelatedPublicationsJob {
 
         ClusterUtils.removeDir(spark, relsPath);
 
-        final Dataset<Publication> pubs = ClusterUtils
-            .readPath(spark, graphPath + "/publication", Publication.class);
+        final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
+            .readPath(spark, graphPath + "/publication", Publication.class)
+            .filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
+            .map(ConversionUtils::oafPublicationToBrokerPublication, Encoders.bean(OaBrokerRelatedPublication.class));
 
-        final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+        final Dataset<Relation> rels = ClusterUtils
+            .readPath(spark, graphPath + "/relation", Relation.class)
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
         rels
-            .joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner")
-            .map(
-                t -> new RelatedPublication(
-                    t._1.getSource(),
-                    t._1.getRelType(),
-                    ConversionUtils.oafPublicationToBrokerPublication(t._2)),
-                Encoders.bean(RelatedPublication.class))
+            .joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
+            .map(t -> new RelatedPublication(
+                t._1.getSource(),
+                t._1.getRelType(),
+                t._2), Encoders.bean(RelatedPublication.class))
             .write()
             .mode(SaveMode.Overwrite)
             .json(relsPath);
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -31,9 +32,8 @@ public class PrepareRelatedSoftwaresJob {
     public static void main(final String[] args) throws Exception {
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(
             IOUtils
-                .toString(
-                    PrepareRelatedSoftwaresJob.class
-                        .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+                .toString(PrepareRelatedSoftwaresJob.class
+                    .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
         parser.parseArgument(args);
 
         final Boolean isSparkSessionManaged = Optional
@@ -57,18 +57,22 @@ public class PrepareRelatedSoftwaresJob {
 
         ClusterUtils.removeDir(spark, relsPath);
 
-        final Dataset<Software> softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class);
+        final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
+            .readPath(spark, graphPath + "/software", Software.class)
+            .filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
+            .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
 
-        final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+        final Dataset<Relation> rels = ClusterUtils
+            .readPath(spark, graphPath + "/relation", Relation.class)
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
 
         rels
-            .joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner")
-            .map(
-                t -> new RelatedSoftware(
-                    t._1.getSource(),
-                    t._1.getRelType(),
-                    ConversionUtils.oafSoftwareToBrokerSoftware(t._2)),
-                Encoders.bean(RelatedSoftware.class))
+            .joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
+            .map(t -> new RelatedSoftware(
+                t._1.getSource(),
+                t._1.getRelType(),
+                t._2), Encoders.bean(RelatedSoftware.class))
             .write()
             .mode(SaveMode.Overwrite)
             .json(relsPath);
@@ -74,6 +74,7 @@ public class PrepareSimpleEntititiesJob {
 
         return ClusterUtils
             .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
+            .filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
             .filter(r -> r.getDataInfo().getDeletedbyinference())
             .map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
     }
@@ -32,4 +32,8 @@ public class ClusterUtils {
             .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
     }
 
+    public static boolean isDedupRoot(final String id) {
+        return id.contains("dedup_wf_");
+    }
+
 }
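The new helper is a plain substring test on the OpenAIRE identifier. A minimal usage sketch (the sample identifiers below are hypothetical; only the "dedup_wf_" check comes from the code above):

// Sketch only: exercising ClusterUtils.isDedupRoot with made-up identifiers.
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;

public class IsDedupRootSketch {
    public static void main(final String[] args) {
        final String dedupRootId = "50|dedup_wf_001::abc";  // hypothetical dedup root id
        final String plainId = "50|someDatasource::abc";    // hypothetical regular id

        System.out.println(ClusterUtils.isDedupRoot(dedupRootId)); // true
        System.out.println(ClusterUtils.isDedupRoot(plainId));     // false
    }
}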
@@ -88,11 +88,11 @@
     </action>
 
     <fork name="start_entities_and_rels">
-        <path start="prepare_simple_entities"/>
+        <path start="prepare_simple_entities"/>
+        <path start="prepare_related_softwares"/>
         <path start="prepare_related_datasets"/>
         <path start="prepare_related_projects"/>
         <path start="prepare_related_publications"/>
-        <path start="prepare_related_softwares"/>
     </fork>
 
     <action name="prepare_simple_entities">
@@ -119,6 +119,7 @@
         <error to="Kill"/>
     </action>
 
+
     <action name="prepare_related_datasets">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
@@ -190,7 +191,7 @@
         <ok to="wait_entities_and_rels"/>
         <error to="Kill"/>
     </action>
-
+
     <action name="prepare_related_softwares">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
@@ -214,7 +215,7 @@
         <ok to="wait_entities_and_rels"/>
         <error to="Kill"/>
     </action>
-
+
     <join name="wait_entities_and_rels" to="join_entities"/>
 
     <action name="join_entities">
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
@@ -0,0 +1,215 @@
+<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
+
+    <parameters>
+        <property>
+            <name>graphInputPath</name>
+            <description>the path where the graph is stored</description>
+        </property>
+        <property>
+            <name>workingPath</name>
+            <description>the path where the the generated data will be stored</description>
+        </property>
+        <property>
+            <name>isLookupUrl</name>
+            <description>the address of the lookUp service</description>
+        </property>
+        <property>
+            <name>dedupConfProfId</name>
+            <description>the id of a valid Dedup Configuration Profile</description>
+        </property>
+
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="ensure_working_path"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ensure_working_path">
+        <fs>
+            <mkdir path='${workingPath}'/>
+        </fs>
+        <ok to="prepare_related_publications"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="prepare_related_publications">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PrepareRelatedPublicationsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="prepare_related_datasets"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="prepare_related_datasets">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PrepareRelatedDatasetsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="join_entities"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="join_entities">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>JoinEntitiesJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.JoinEntitiesJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="prepare_groups"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="prepare_groups">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PrepareGroupsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+        </spark>
+        <ok to="generate_events"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="generate_events">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>GenerateEventsJob</name>
+            <class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
+            <jar>dhp-broker-events-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+
+</workflow-app>