partial refactoring of some joins

Michele Artini 2020-06-23 08:37:35 +02:00
parent 8a3bc7c183
commit af2f7705fc
9 changed files with 289 additions and 35 deletions

View File

@@ -13,6 +13,7 @@ import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -52,18 +53,23 @@ public class PrepareRelatedDatasetsJob {
ClusterUtils.removeDir(spark, relsPath);
-final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = ClusterUtils
-.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
+final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
+.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
+.filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
+.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
-final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+final Dataset<Relation> rels = ClusterUtils
+.readPath(spark, graphPath + "/relation", Relation.class)
+.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
-.joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner")
+.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
.map(
t -> new RelatedDataset(
t._1.getSource(),
t._1.getRelType(),
-ConversionUtils.oafDatasetToBrokerDataset(t._2)),
+t._2),
Encoders.bean(RelatedDataset.class))
.write()
.mode(SaveMode.Overwrite)
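Read together, the added lines above change the shape of the join: the OAF datasets are converted to OaBrokerRelatedDataset (and dedup roots dropped) before the join, relations touching dedup roots are filtered out, and the join key on the dataset side becomes the broker-level openaireId column, so the map step can pass t._2 through unchanged instead of calling ConversionUtils inside it. The sketch below reassembles that new version in one place; the class and method wrapper are illustrative and the import of RelatedDataset is omitted because its package is not shown in this hunk, while ClusterUtils, ConversionUtils and the openaireId property come from the diff itself. The publications and softwares jobs below follow the same pattern.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.schema.oaf.Relation;

// illustrative wrapper; RelatedDataset import omitted (its package is not shown in this hunk)
public class RelatedDatasetsJoinSketch {

    static void prepareRels(final SparkSession spark, final String graphPath, final String relsPath) {

        // datasets are mapped to broker objects *before* the join; dedup roots are skipped
        final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
            .readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
            .filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
            .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));

        // relations whose source or target is a dedup root are dropped as well
        final Dataset<Relation> rels = ClusterUtils
            .readPath(spark, graphPath + "/relation", Relation.class)
            .filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
            .filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));

        // the join key is the broker-side openaireId; the right side of the tuple is already converted
        rels
            .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
            .map(
                t -> new RelatedDataset(t._1.getSource(), t._1.getRelType(), t._2),
                Encoders.bean(RelatedDataset.class))
            .write()
            .mode(SaveMode.Overwrite)
            .json(relsPath);
    }
}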

View File

@@ -62,7 +62,9 @@ public class PrepareRelatedProjectsJob {
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
-.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT));
+.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
+.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
.joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner")

View File

@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -31,9 +32,8 @@ public class PrepareRelatedPublicationsJob {
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
-.toString(
-PrepareRelatedPublicationsJob.class
-.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+.toString(PrepareRelatedPublicationsJob.class
+.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@@ -57,19 +57,22 @@ public class PrepareRelatedPublicationsJob {
ClusterUtils.removeDir(spark, relsPath);
-final Dataset<Publication> pubs = ClusterUtils
-.readPath(spark, graphPath + "/publication", Publication.class);
+final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
+.readPath(spark, graphPath + "/publication", Publication.class)
+.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
+.map(ConversionUtils::oafPublicationToBrokerPublication, Encoders.bean(OaBrokerRelatedPublication.class));
-final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+final Dataset<Relation> rels = ClusterUtils
+.readPath(spark, graphPath + "/relation", Relation.class)
+.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
-.joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner")
-.map(
-t -> new RelatedPublication(
-t._1.getSource(),
-t._1.getRelType(),
-ConversionUtils.oafPublicationToBrokerPublication(t._2)),
-Encoders.bean(RelatedPublication.class))
+.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
+.map(t -> new RelatedPublication(
+t._1.getSource(),
+t._1.getRelType(),
+t._2), Encoders.bean(RelatedPublication.class))
.write()
.mode(SaveMode.Overwrite)
.json(relsPath);

View File

@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
@@ -31,9 +32,8 @@ public class PrepareRelatedSoftwaresJob {
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
-.toString(
-PrepareRelatedSoftwaresJob.class
-.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+.toString(PrepareRelatedSoftwaresJob.class
+.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@@ -57,18 +57,22 @@ public class PrepareRelatedSoftwaresJob {
ClusterUtils.removeDir(spark, relsPath);
-final Dataset<Software> softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class);
+final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
+.readPath(spark, graphPath + "/software", Software.class)
+.filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
+.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
-final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
+final Dataset<Relation> rels = ClusterUtils
+.readPath(spark, graphPath + "/relation", Relation.class)
+.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
+.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
-.joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner")
-.map(
-t -> new RelatedSoftware(
-t._1.getSource(),
-t._1.getRelType(),
-ConversionUtils.oafSoftwareToBrokerSoftware(t._2)),
-Encoders.bean(RelatedSoftware.class))
+.joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
+.map(t -> new RelatedSoftware(
+t._1.getSource(),
+t._1.getRelType(),
+t._2), Encoders.bean(RelatedSoftware.class))
.write()
.mode(SaveMode.Overwrite)
.json(relsPath);

View File

@@ -74,6 +74,7 @@ public class PrepareSimpleEntititiesJob {
return ClusterUtils
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
.filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
.filter(r -> r.getDataInfo().getDeletedbyinference())
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
}

View File

@@ -32,4 +32,8 @@ public class ClusterUtils {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
+public static boolean isDedupRoot(final String id) {
+return id.contains("dedup_wf_");
+}
}
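The filters used throughout the jobs above rely on this new helper: an identifier is considered a dedup root when it contains the dedup workflow marker. A minimal illustration of the intended behaviour, assuming JUnit 5 on the test classpath; the sample identifiers below are made up for the example:

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;

public class ClusterUtilsTest {

    @Test
    public void testIsDedupRoot() {
        // ids produced by the dedup workflow contain "dedup_wf_" and are filtered out by the Prepare*Job classes
        assertTrue(ClusterUtils.isDedupRoot("50|dedup_wf_001::0123456789abcdef"));

        // ordinary ids (here a hypothetical doi-based one) are kept
        assertFalse(ClusterUtils.isDedupRoot("50|doi_________::0123456789abcdef"));
    }
}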

View File

@@ -88,11 +88,11 @@
</action>
<fork name="start_entities_and_rels">
<path start="prepare_simple_entities"/>
<path start="prepare_simple_entities"/>
<path start="prepare_related_softwares"/>
<path start="prepare_related_datasets"/>
<path start="prepare_related_projects"/>
<path start="prepare_related_publications"/>
<path start="prepare_related_softwares"/>
</fork>
<action name="prepare_simple_entities">
@@ -119,6 +119,7 @@
<error to="Kill"/>
</action>
<action name="prepare_related_datasets">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>

View File

@@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@@ -0,0 +1,215 @@
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the path where the graph is stored</description>
</property>
<property>
<name>workingPath</name>
<description>the path where the generated data will be stored</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>dedupConfProfId</name>
<description>the id of a valid Dedup Configuration Profile</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="ensure_working_path"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ensure_working_path">
<fs>
<mkdir path='${workingPath}'/>
</fs>
<ok to="prepare_related_publications"/>
<error to="Kill"/>
</action>
<action name="prepare_related_publications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareRelatedPublicationsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="prepare_related_datasets"/>
<error to="Kill"/>
</action>
<action name="prepare_related_datasets">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareRelatedDatasetsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities"/>
<error to="Kill"/>
</action>
<action name="join_entities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinEntitiesJob</name>
<class>eu.dnetlib.dhp.broker.oa.JoinEntitiesJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="prepare_groups"/>
<error to="Kill"/>
</action>
<action name="prepare_groups">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareGroupsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="generate_events"/>
<error to="Kill"/>
</action>
<action name="generate_events">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEventsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>