added a sequentialization step on the spark job. Added new parameter

2020-05-05 17:03:43 +02:00 · 2020-05-05 17:03:43 +02:00 · dd2e698a72
parent 252b219dd5
commit dd2e698a72
4 changed files with 133 additions and 47 deletions
--- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java
@ -0,0 +1,4 @@
 package eu.dnetlib.dhp.countrypropagation;
 public class PrepareResultCountrySet {
 }
--- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java
@ -3,8 +3,10 @@ package eu.dnetlib.dhp.countrypropagation;
 import static eu.dnetlib.dhp.PropagationConstant.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
 import static jdk.nashorn.internal.objects.NativeDebug.map;
 import java.util.*;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@ -52,6 +54,11 @@ public class SparkCountryPropagationJob2 {
 		final String datasourcecountrypath = parser.get("preparedInfoPath");
 		log.info("preparedInfoPath: {}", datasourcecountrypath);
 		final String possibleUpdatesPath = datasourcecountrypath
 			.substring(0, datasourcecountrypath.lastIndexOf("/") + 1)
 			+ "possibleUpdates";
 		log.info("possibleUpdatesPath: {}", possibleUpdatesPath);
 		final String resultClassName = parser.get("resultTableName");
 		log.info("resultTableName: {}", resultClassName);
@ -70,13 +77,14 @@ public class SparkCountryPropagationJob2 {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				removeOutputDir(spark, possibleUpdatesPath);
 				execPropagation(
 					spark,
 					datasourcecountrypath,
 					inputPath,
 					outputPath,
 					resultClazz,
-					saveGraph);
+					saveGraph, possibleUpdatesPath);
 			});
 	}
@ -86,19 +94,30 @@ public class SparkCountryPropagationJob2 {
 		String inputPath,
 		String outputPath,
 		Class<R> resultClazz,
-		boolean saveGraph) {
+		boolean saveGraph, String possilbeUpdatesPath) {
-		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+		// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		// Load file with preprocessed association datasource - country
 		Dataset<DatasourceCountry> datasourcecountryassoc = readAssocDatasourceCountry(spark, datasourcecountrypath);
 		// broadcasting the result of the preparation step
-		Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc = sc.broadcast(datasourcecountryassoc);
+		// Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc =
 		// sc.broadcast(datasourcecountryassoc);
 		Dataset<ResultCountrySet> potentialUpdates = getPotentialResultToUpdate(
-			spark, inputPath, resultClazz, broadcast_datasourcecountryassoc)
+			spark, inputPath, resultClazz, datasourcecountryassoc)
 				.as(Encoders.bean(ResultCountrySet.class));
 		potentialUpdates.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(possilbeUpdatesPath);
 		if (saveGraph) {
 			// updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
 			potentialUpdates = spark
 				.read()
 				.textFile(possilbeUpdatesPath)
 				.map(
 					(MapFunction<String, ResultCountrySet>) value -> OBJECT_MAPPER
 						.readValue(value, ResultCountrySet.class),
 					Encoders.bean(ResultCountrySet.class));
 			updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
 		}
 	}
@ -113,69 +132,116 @@ public class SparkCountryPropagationJob2 {
 		log.info("Reading Graph table from: {}", inputPath);
 		Dataset<R> result = readPathEntity(spark, inputPath, resultClazz);
-		Dataset<Tuple2<String, R>> result_pair = result
+		Dataset<R> new_table = result
 			.map(
 				r -> new Tuple2<>(r.getId(), r),
 				Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
 		Dataset<R> new_table = result_pair
 			.joinWith(
-				potentialUpdates,
+				potentialUpdates, result
-				result_pair.col("_1").equalTo(potentialUpdates.col("resultId")),
+					.col("id")
 					.equalTo(potentialUpdates.col("resultId")),
 				"left_outer")
-			.map(
+			.map((MapFunction<Tuple2<R, ResultCountrySet>, R>) value -> {
-				(MapFunction<Tuple2<Tuple2<String, R>, ResultCountrySet>, R>) value -> {
+				R r = value._1();
-					R r = value._1()._2();
+				Optional<ResultCountrySet> potentialNewCountries = Optional.ofNullable(value._2());
-					Optional<ResultCountrySet> potentialNewCountries = Optional.ofNullable(value._2());
+				if (potentialNewCountries.isPresent()) {
-					if (potentialNewCountries.isPresent()) {
+					HashSet<String> countries = r
-						HashSet<String> countries = new HashSet<>();
+						.getCountry()
-						for (Qualifier country : r.getCountry()) {
+						.stream()
-							countries.add(country.getClassid());
+						.map(c -> c.getClassid())
-						}
+						.collect(Collectors.toCollection(HashSet::new));
-						Result res = new Result();
+
-						res.setId(r.getId());
+					r
-						List<Country> countryList = new ArrayList<>();
+						.getCountry()
-						for (CountrySbs country : potentialNewCountries
+						.addAll(
-							.get()
+							potentialNewCountries
-							.getCountrySet()) {
+								.get()
-							if (!countries.contains(country.getClassid())) {
+								.getCountrySet()
-								countryList
+								.stream()
-									.add(
+								.filter(c -> !countries.contains(c.getClassid()))
-										getCountry(
+								.map(c -> getCountry(c.getClassid(), c.getClassname()))
-											country.getClassid(),
+								.collect(Collectors.toList()));
-											country.getClassname()));
+
-							}
+//					Result res = new Result();
-						}
+//					res.setId(r.getId());
-						res.setCountry(countryList);
+//					List<Country> countryList = new ArrayList<>();
-						r.mergeFrom(res);
+//					for (CountrySbs country : potentialNewCountries
-					}
+//						.get()
-					return r;
+//						.getCountrySet()) {
-				},
+//						if (!countries.contains(country.getClassid())) {
-				Encoders.bean(resultClazz));
+//							countryList
 //								.add(
 //									getCountry(
 //										country.getClassid(),
 //										country.getClassname()));
 //						}
 //					}
 //					res.setCountry(countryList);
 //					r.mergeFrom(res);
 				}
 				return r;
 			}, Encoders.bean(resultClazz));
 //		Dataset<Tuple2<String, R>> result_pair = result
 //			.map(
 //				r -> new Tuple2<>(r.getId(), r),
 //				Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
 //
 //		Dataset<R> new_table = result_pair
 //			.joinWith(
 //				potentialUpdates,
 //				result_pair.col("_1").equalTo(potentialUpdates.col("resultId")),
 //				"left_outer")
 //			.map(
 //				(MapFunction<Tuple2<Tuple2<String, R>, ResultCountrySet>, R>) value -> {
 //					R r = value._1()._2();
 //					Optional<ResultCountrySet> potentialNewCountries = Optional.ofNullable(value._2());
 //					if (potentialNewCountries.isPresent()) {
 //						HashSet<String> countries = new HashSet<>();
 //						for (Qualifier country : r.getCountry()) {
 //							countries.add(country.getClassid());
 //						}
 //						Result res = new Result();
 //						res.setId(r.getId());
 //						List<Country> countryList = new ArrayList<>();
 //						for (CountrySbs country : potentialNewCountries
 //							.get()
 //							.getCountrySet()) {
 //							if (!countries.contains(country.getClassid())) {
 //								countryList
 //									.add(
 //										getCountry(
 //											country.getClassid(),
 //											country.getClassname()));
 //							}
 //						}
 //						res.setCountry(countryList);
 //						r.mergeFrom(res);
 //					}
 //					return r;
 //				},
 //				Encoders.bean(resultClazz));
 		log.info("Saving graph table to path: {}", outputPath);
 		// log.info("number of saved recordsa: {}", new_table.count());
-		new_table.toJSON().write().option("compression", "gzip").text(outputPath);
+		new_table.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(outputPath);
 	}
 	private static <R extends Result> Dataset<Row> getPotentialResultToUpdate(
 		SparkSession spark,
 		String inputPath,
 		Class<R> resultClazz,
-		Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
+		Dataset<DatasourceCountry> datasourcecountryassoc) {
 		Dataset<R> result = readPathEntity(spark, inputPath, resultClazz);
 		result.createOrReplaceTempView("result");
 		// log.info("number of results: {}", result.count());
 		createCfHbforresult(spark);
-		return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc);
+		return countryPropagationAssoc(spark, datasourcecountryassoc);
 	}
 	private static Dataset<Row> countryPropagationAssoc(
 		SparkSession spark,
-		Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
+		Dataset<DatasourceCountry> datasource_country) {
-		Dataset<DatasourceCountry> datasource_country = broadcast_datasourcecountryassoc.value();
+		// Dataset<DatasourceCountry> datasource_country = broadcast_datasourcecountryassoc.value();
 		datasource_country.createOrReplaceTempView("datasource_country");
 		log.info("datasource_country number : {}", datasource_country.count());
--- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml
@ -140,6 +140,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -169,6 +170,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -198,6 +200,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -227,6 +230,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
            <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
--- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml
@ -261,6 +261,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
@ -289,6 +292,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
@ -317,6 +323,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
@ -345,6 +354,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --conf spark.speculation=false
                --conf spark.hadoop.mapreduce.map.speculative=false
                --conf spark.hadoop.mapreduce.reduce.speculative=false
            </spark-opts>
            <arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>