Added a sequentialization step to the Spark job. Added a new parameter.

Miriam Baglioni 2020-05-05 17:03:43 +02:00
parent 252b219dd5
commit dd2e698a72
4 changed files with 133 additions and 47 deletions
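In outline, the sequentialization works by materializing the intermediate dataset on HDFS and reading it back before the graph-update stage runs, so the two stages execute strictly one after the other instead of sharing a broadcast variable. A minimal sketch of the pattern, assuming the ResultCountrySet bean from this package (the wrapper class and method name are illustrative, not part of the commit):

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SequentializationSketch {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    static Dataset<ResultCountrySet> sequentialize(
        SparkSession spark,
        Dataset<ResultCountrySet> potentialUpdates,
        String possibleUpdatesPath) {

        // Stage 1: the write is an action, so it forces full evaluation of the
        // preparation step; nothing downstream starts until it completes.
        potentialUpdates
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(possibleUpdatesPath);

        // Stage 2: read the materialized copy back, cutting the lineage so the
        // update stage neither recomputes nor broadcasts the first stage.
        return spark
            .read()
            .textFile(possibleUpdatesPath)
            .map(
                (MapFunction<String, ResultCountrySet>) value -> OBJECT_MAPPER
                    .readValue(value, ResultCountrySet.class),
                Encoders.bean(ResultCountrySet.class));
    }
}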

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.countrypropagation;
public class PrepareResultCountrySet {
}

View File

@@ -3,8 +3,10 @@ package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@@ -52,6 +54,11 @@ public class SparkCountryPropagationJob2 {
final String datasourcecountrypath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", datasourcecountrypath);
final String possibleUpdatesPath = datasourcecountrypath
.substring(0, datasourcecountrypath.lastIndexOf("/") + 1)
+ "possibleUpdates";
log.info("possibleUpdatesPath: {}", possibleUpdatesPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
@@ -70,13 +77,14 @@ public class SparkCountryPropagationJob2 {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, possibleUpdatesPath);
execPropagation(
spark,
datasourcecountrypath,
inputPath,
outputPath,
resultClazz,
saveGraph);
saveGraph,
possibleUpdatesPath);
});
}
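For context, removeOutputDir (called above before execPropagation) is presumably the usual dhp-common helper that clears a path up front; a sketch of the assumed behavior, not shown in this diff:

// Assumption: delete the intermediate path before writing so a re-run never
// mixes records from a previous, possibly failed, execution.
private static void removeOutputDir(SparkSession spark, String path) {
    HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}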
@@ -86,19 +94,30 @@ public class SparkCountryPropagationJob2 {
String inputPath,
String outputPath,
Class<R> resultClazz,
boolean saveGraph) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
boolean saveGraph,
String possibleUpdatesPath) {
// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
// Load file with preprocessed association datasource - country
Dataset<DatasourceCountry> datasourcecountryassoc = readAssocDatasourceCountry(spark, datasourcecountrypath);
// broadcasting the result of the preparation step
Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc = sc.broadcast(datasourcecountryassoc);
// Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc =
// sc.broadcast(datasourcecountryassoc);
Dataset<ResultCountrySet> potentialUpdates = getPotentialResultToUpdate(
spark, inputPath, resultClazz, broadcast_datasourcecountryassoc)
spark, inputPath, resultClazz, datasourcecountryassoc)
.as(Encoders.bean(ResultCountrySet.class));
potentialUpdates
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(possibleUpdatesPath);
if (saveGraph) {
// updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
potentialUpdates = spark
.read()
.textFile(possibleUpdatesPath)
.map(
(MapFunction<String, ResultCountrySet>) value -> OBJECT_MAPPER
.readValue(value, ResultCountrySet.class),
Encoders.bean(ResultCountrySet.class));
updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
}
}
@@ -113,69 +132,116 @@ public class SparkCountryPropagationJob2 {
log.info("Reading Graph table from: {}", inputPath);
Dataset<R> result = readPathEntity(spark, inputPath, resultClazz);
Dataset<Tuple2<String, R>> result_pair = result
.map(
r -> new Tuple2<>(r.getId(), r),
Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
Dataset<R> new_table = result_pair
.joinWith(
potentialUpdates,
result_pair.col("_1").equalTo(potentialUpdates.col("resultId")),
"left_outer")
.map(
(MapFunction<Tuple2<Tuple2<String, R>, ResultCountrySet>, R>) value -> {
R r = value._1()._2();
Dataset<R> new_table = result
.joinWith(
potentialUpdates, result
.col("id")
.equalTo(potentialUpdates.col("resultId")),
"left_outer")
.map((MapFunction<Tuple2<R, ResultCountrySet>, R>) value -> {
R r = value._1();
Optional<ResultCountrySet> potentialNewCountries = Optional.ofNullable(value._2());
if (potentialNewCountries.isPresent()) {
HashSet<String> countries = new HashSet<>();
for (Qualifier country : r.getCountry()) {
countries.add(country.getClassid());
}
Result res = new Result();
res.setId(r.getId());
List<Country> countryList = new ArrayList<>();
for (CountrySbs country : potentialNewCountries
.get()
.getCountrySet()) {
if (!countries.contains(country.getClassid())) {
countryList
.add(
getCountry(
country.getClassid(),
country.getClassname()));
}
}
res.setCountry(countryList);
r.mergeFrom(res);
HashSet<String> countries = r
.getCountry()
.stream()
.map(c -> c.getClassid())
.collect(Collectors.toCollection(HashSet::new));
r
.getCountry()
.addAll(
potentialNewCountries
.get()
.getCountrySet()
.stream()
.filter(c -> !countries.contains(c.getClassid()))
.map(c -> getCountry(c.getClassid(), c.getClassname()))
.collect(Collectors.toList()));
// Result res = new Result();
// res.setId(r.getId());
// List<Country> countryList = new ArrayList<>();
// for (CountrySbs country : potentialNewCountries
// .get()
// .getCountrySet()) {
// if (!countries.contains(country.getClassid())) {
// countryList
// .add(
// getCountry(
// country.getClassid(),
// country.getClassname()));
// }
// }
// res.setCountry(countryList);
// r.mergeFrom(res);
}
return r;
},
Encoders.bean(resultClazz));
}, Encoders.bean(resultClazz));
// Dataset<Tuple2<String, R>> result_pair = result
// .map(
// r -> new Tuple2<>(r.getId(), r),
// Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
//
// Dataset<R> new_table = result_pair
// .joinWith(
// potentialUpdates,
// result_pair.col("_1").equalTo(potentialUpdates.col("resultId")),
// "left_outer")
// .map(
// (MapFunction<Tuple2<Tuple2<String, R>, ResultCountrySet>, R>) value -> {
// R r = value._1()._2();
// Optional<ResultCountrySet> potentialNewCountries = Optional.ofNullable(value._2());
// if (potentialNewCountries.isPresent()) {
// HashSet<String> countries = new HashSet<>();
// for (Qualifier country : r.getCountry()) {
// countries.add(country.getClassid());
// }
// Result res = new Result();
// res.setId(r.getId());
// List<Country> countryList = new ArrayList<>();
// for (CountrySbs country : potentialNewCountries
// .get()
// .getCountrySet()) {
// if (!countries.contains(country.getClassid())) {
// countryList
// .add(
// getCountry(
// country.getClassid(),
// country.getClassname()));
// }
// }
// res.setCountry(countryList);
// r.mergeFrom(res);
// }
// return r;
// },
// Encoders.bean(resultClazz));
log.info("Saving graph table to path: {}", outputPath);
// log.info("number of saved recordsa: {}", new_table.count());
new_table.toJSON().write().option("compression", "gzip").text(outputPath);
new_table.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(outputPath);
}
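// Note: the save above changes from new_table.toJSON().write(...).text(outputPath)
// to new_table.write(...).json(outputPath). Both emit one JSON record per line,
// but the latter skips the intermediate Dataset<String> and, combined with
// SaveMode.Overwrite, makes the action safe to re-run.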
private static <R extends Result> Dataset<Row> getPotentialResultToUpdate(
SparkSession spark,
String inputPath,
Class<R> resultClazz,
Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
Dataset<DatasourceCountry> datasourcecountryassoc) {
Dataset<R> result = readPathEntity(spark, inputPath, resultClazz);
result.createOrReplaceTempView("result");
// log.info("number of results: {}", result.count());
createCfHbforresult(spark);
return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc);
return countryPropagationAssoc(spark, datasourcecountryassoc);
}
private static Dataset<Row> countryPropagationAssoc(
SparkSession spark,
Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
Dataset<DatasourceCountry> datasource_country) {
Dataset<DatasourceCountry> datasource_country = broadcast_datasourcecountryassoc.value();
// Dataset<DatasourceCountry> datasource_country = broadcast_datasourcecountryassoc.value();
datasource_country.createOrReplaceTempView("datasource_country");
log.info("datasource_country number : {}", datasource_country.count());

View File

@@ -140,6 +140,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -169,6 +170,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -198,6 +200,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@@ -227,6 +230,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>

View File

@@ -261,6 +261,9 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
@@ -289,6 +292,9 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
@@ -317,6 +323,9 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
@@ -345,6 +354,9 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>