Removed the writeUpdate option. The update is available in the preparedInfo path.

2020-04-27 10:30:32 +02:00 · 2020-04-27 10:30:32 +02:00 · 95a54d5460
parent 8802e4126b
commit 95a54d5460
1 changed files with 1 additions and 55 deletions
--- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java
@ -50,16 +50,6 @@ public class SparkCountryPropagationJob2 {
        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);
        final String resultType =
                resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
        log.info("resultType: {}", resultType);
        final Boolean writeUpdates =
                Optional.ofNullable(parser.get("writeUpdate"))
                        .map(Boolean::valueOf)
                        .orElse(Boolean.TRUE);
        log.info("writeUpdate: {}", writeUpdates);
        final Boolean saveGraph =
                Optional.ofNullable(parser.get("saveGraph"))
                        .map(Boolean::valueOf)
@ -76,17 +66,12 @@ public class SparkCountryPropagationJob2 {
                conf,
                isSparkSessionManaged,
                spark -> {
                    // createOutputDirs(outputPath,
                    // FileSystem.get(spark.sparkContext().hadoopConfiguration()));
                    removeOutputDir(spark, outputPath);
                    execPropagation(
                            spark,
                            datasourcecountrypath,
                            inputPath,
                            outputPath,
                            resultClazz,
                            resultType,
                            writeUpdates,
                            saveGraph);
                });
    }
@ -97,12 +82,10 @@ public class SparkCountryPropagationJob2 {
            String inputPath,
            String outputPath,
            Class<R> resultClazz,
            String resultType,
            boolean writeUpdates,
            boolean saveGraph) {
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        // Load parque file with preprocessed association datasource - country
+        // Load file with preprocessed association datasource - country
        Dataset<DatasourceCountry> datasourcecountryassoc =
                readAssocDatasourceCountry(spark, datasourcecountrypath);
        // broadcasting the result of the preparation step
@ -114,10 +97,6 @@ public class SparkCountryPropagationJob2 {
                                spark, inputPath, resultClazz, broadcast_datasourcecountryassoc)
                        .as(Encoders.bean(ResultCountrySet.class));
        if (writeUpdates) {
            writeUpdates(potentialUpdates, outputPath + "/update_" + resultType);
        }
        if (saveGraph) {
            updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath);
        }
@ -138,11 +117,6 @@ public class SparkCountryPropagationJob2 {
                        r -> new Tuple2<>(r.getId(), r),
                        Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz)));
        //        Dataset<Tuple2<String, ResultCountrySet>> potential_update_pair =
        // potentialUpdates.map(pu -> new Tuple2<>(pu.getResultId(),
        //                        pu),
        //                Encoders.tuple(Encoders.STRING(), Encoders.bean(ResultCountrySet.class)));
        Dataset<R> new_table =
                result_pair
                        .joinWith(
@ -184,10 +158,6 @@ public class SparkCountryPropagationJob2 {
        log.info("Saving graph table to path: {}", outputPath);
        // log.info("number of saved recordsa: {}", new_table.count());
        new_table.toJSON().write().option("compression", "gzip").text(outputPath);
        //                    .toJavaRDD()
        //                    .map(r -> OBJECT_MAPPER.writeValueAsString(r))
        //                    .saveAsTextFile(outputPath , GzipCodec.class);
    }
    private static <R extends Result> Dataset<Row> getPotentialResultToUpdate(
@ -203,18 +173,6 @@ public class SparkCountryPropagationJob2 {
        return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc);
    }
    //    private static void createCfHbforresult(SparkSession spark) {
    //        String query;
    //        query = "SELECT id, inst.collectedfrom.key cf , inst.hostedby.key hb " +
    //                "FROM ( SELECT id, instance " +
    //                "FROM result " +
    //                " WHERE datainfo.deletedbyinference = false)  ds " +
    //                "LATERAL VIEW EXPLODE(instance) i AS inst";
    //        Dataset<Row> cfhb = spark.sql(query);
    //        cfhb.createOrReplaceTempView("cfhb");
    //        //log.info("cfhb_number : {}", cfhb.count());
    //    }
    private static Dataset<Row> countryPropagationAssoc(
            SparkSession spark,
            Broadcast<Dataset<DatasourceCountry>> broadcast_datasourcecountryassoc) {
@ -248,16 +206,4 @@ public class SparkCountryPropagationJob2 {
                        value -> OBJECT_MAPPER.readValue(value, DatasourceCountry.class),
                        Encoders.bean(DatasourceCountry.class));
    }
    /**
     * Converts every record of the given dataset to its JSON representation and stores the
     * resulting lines as gzip-compressed text, overwriting any output already present at the
     * destination.
     *
     * @param potentialUpdates the {@code ResultCountrySet} records to persist
     * @param outputPath the location the compressed text output is written to
     */
    private static void writeUpdates(
            Dataset<ResultCountrySet> potentialUpdates, String outputPath) {
        // One JSON document per record.
        final Dataset<String> jsonLines = potentialUpdates.toJSON();

        jsonLines.write().mode(SaveMode.Overwrite).option("compression", "gzip").text(outputPath);
    }
 }