forked from D-Net/dnet-hadoop
Removed the writeUpdate option; the update is available in the preparedInfo path.
This commit is contained in:
parent
8802e4126b
commit
95a54d5460
|
@ -50,16 +50,6 @@ public class SparkCountryPropagationJob2 {
|
||||||
final String resultClassName = parser.get("resultTableName");
|
final String resultClassName = parser.get("resultTableName");
|
||||||
log.info("resultTableName: {}", resultClassName);
|
log.info("resultTableName: {}", resultClassName);
|
||||||
|
|
||||||
final String resultType =
|
|
||||||
resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
|
|
||||||
log.info("resultType: {}", resultType);
|
|
||||||
|
|
||||||
final Boolean writeUpdates =
|
|
||||||
Optional.ofNullable(parser.get("writeUpdate"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
log.info("writeUpdate: {}", writeUpdates);
|
|
||||||
|
|
||||||
final Boolean saveGraph =
|
final Boolean saveGraph =
|
||||||
Optional.ofNullable(parser.get("saveGraph"))
|
Optional.ofNullable(parser.get("saveGraph"))
|
||||||
.map(Boolean::valueOf)
|
.map(Boolean::valueOf)
|
||||||
|
@ -76,17 +66,12 @@ public class SparkCountryPropagationJob2 {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
// createOutputDirs(outputPath,
|
|
||||||
// FileSystem.get(spark.sparkContext().hadoopConfiguration()));
|
|
||||||
removeOutputDir(spark, outputPath);
|
|
||||||
execPropagation(
|
execPropagation(
|
||||||
spark,
|
spark,
|
||||||
datasourcecountrypath,
|
datasourcecountrypath,
|
||||||
inputPath,
|
inputPath,
|
||||||
outputPath,
|
outputPath,
|
||||||
resultClazz,
|
resultClazz,
|
||||||
resultType,
|
|
||||||
writeUpdates,
|
|
||||||
saveGraph);
|
saveGraph);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -97,12 +82,10 @@ public class SparkCountryPropagationJob2 {
|
||||||
String inputPath,
|
String inputPath,
|
||||||
String outputPath,
|
String outputPath,
|
||||||
Class<R> resultClazz,
|
Class<R> resultClazz,
|
||||||
String resultType,
|
|
||||||
boolean writeUpdates,
|
|
||||||
boolean saveGraph) {
|
boolean saveGraph) {
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
// Load parque file with preprocessed association datasource - country
|
// Load file with preprocessed association datasource - country
|
||||||
Dataset<DatasourceCountry> datasourcecountryassoc =
|
Dataset<DatasourceCountry> datasourcecountryassoc =
|
||||||
readAssocDatasourceCountry(spark, datasourcecountrypath);
|
readAssocDatasourceCountry(spark, datasourcecountrypath);
|
||||||
// broadcasting the result of the preparation step
|
// broadcasting the result of the preparation step
|
||||||
|
@ -114,10 +97,6 @@ public class SparkCountryPropagationJob2 {
|
||||||
spark, inputPath, resultClazz, broadcast_datasourcecountryassoc)
|
spark, inputPath, resultClazz, broadcast_datasourcecountryassoc)
|
||||||
.as(Encoders.bean(ResultCountrySet.class));
|
.as(Encoders.bean(ResultCountrySet.class));
|
||||||
|
|
||||||
if (writeUpdates) {
|
|
||||||
writeUpdates(potentialUpdates, outputPath + "/update_" + resultType);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (saveGraph) {
|
if (saveGraph) {
|
||||||
updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
|
updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
|
||||||
}
|
}
|
||||||
|
@ -138,11 +117,6 @@ public class SparkCountryPropagationJob2 {
|
||||||
r -> new Tuple2<>(r.getId(), r),
|
r -> new Tuple2<>(r.getId(), r),
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
|
||||||
|
|
||||||
// Dataset<Tuple2<String, ResultCountrySet>> potential_update_pair =
|
|
||||||
// potentialUpdates.map(pu -> new Tuple2<>(pu.getResultId(),
|
|
||||||
// pu),
|
|
||||||
// Encoders.tuple(Encoders.STRING(), Encoders.bean(ResultCountrySet.class)));
|
|
||||||
|
|
||||||
Dataset<R> new_table =
|
Dataset<R> new_table =
|
||||||
result_pair
|
result_pair
|
||||||
.joinWith(
|
.joinWith(
|
||||||
|
@ -184,10 +158,6 @@ public class SparkCountryPropagationJob2 {
|
||||||
log.info("Saving graph table to path: {}", outputPath);
|
log.info("Saving graph table to path: {}", outputPath);
|
||||||
// log.info("number of saved recordsa: {}", new_table.count());
|
// log.info("number of saved recordsa: {}", new_table.count());
|
||||||
new_table.toJSON().write().option("compression", "gzip").text(outputPath);
|
new_table.toJSON().write().option("compression", "gzip").text(outputPath);
|
||||||
// .toJavaRDD()
|
|
||||||
// .map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
|
||||||
// .saveAsTextFile(outputPath , GzipCodec.class);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> Dataset<Row> getPotentialResultToUpdate(
|
private static <R extends Result> Dataset<Row> getPotentialResultToUpdate(
|
||||||
|
@ -203,18 +173,6 @@ public class SparkCountryPropagationJob2 {
|
||||||
return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc);
|
return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// private static void createCfHbforresult(SparkSession spark) {
|
|
||||||
// String query;
|
|
||||||
// query = "SELECT id, inst.collectedfrom.key cf , inst.hostedby.key hb " +
|
|
||||||
// "FROM ( SELECT id, instance " +
|
|
||||||
// "FROM result " +
|
|
||||||
// " WHERE datainfo.deletedbyinference = false) ds " +
|
|
||||||
// "LATERAL VIEW EXPLODE(instance) i AS inst";
|
|
||||||
// Dataset<Row> cfhb = spark.sql(query);
|
|
||||||
// cfhb.createOrReplaceTempView("cfhb");
|
|
||||||
// //log.info("cfhb_number : {}", cfhb.count());
|
|
||||||
// }
|
|
||||||
|
|
||||||
private static Dataset<Row> countryPropagationAssoc(
|
private static Dataset<Row> countryPropagationAssoc(
|
||||||
SparkSession spark,
|
SparkSession spark,
|
||||||
Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
|
Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
|
||||||
|
@ -248,16 +206,4 @@ public class SparkCountryPropagationJob2 {
|
||||||
value -> OBJECT_MAPPER.readValue(value, DatasourceCountry.class),
|
value -> OBJECT_MAPPER.readValue(value, DatasourceCountry.class),
|
||||||
Encoders.bean(DatasourceCountry.class));
|
Encoders.bean(DatasourceCountry.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void writeUpdates(
|
|
||||||
Dataset<ResultCountrySet> potentialUpdates, String outputPath) {
|
|
||||||
potentialUpdates
|
|
||||||
.toJSON()
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.text(outputPath);
|
|
||||||
// map(u -> OBJECT_MAPPER.writeValueAsString(u))
|
|
||||||
// .saveAsTextFile(outputPath, GzipCodec.class);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue