Added documentation on a class, and reused ArgumetApplicationParser on dhp-aggregation

2019-10-07 17:02:53 +02:00 · 2019-10-07 17:02:53 +02:00 · 4b8c7c279d
parent a423a6ebfd
commit 4b8c7c279d
3 changed files with 48 additions and 114 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java
@ -7,21 +7,41 @@ import java.io.Serializable;


 /**
- * This class models a record inside the new MetadataStore
+ * This class models a record inside the new Metadata store collection on HDFS *
 *
 */
 public class MetadataRecord implements Serializable {

+    /**
+     * The D-Net Identifier associated to the record
+     */
    private String id;

+    /**
+     * The original Identifier of the record
+     */
    private String originalId;

+
+    /**
+     * The encoding of the record, should be JSON or XML
+     */
    private String encoding;

+    /**
+     * The information about the provenance of the record see @{@link Provenance}
+     * for the model of this information
+     */
    private Provenance provenance;

+    /**
+     * The content of the metadata
+     */
    private String body;

+    /**
+     * the date when the record has been stored
+     */
    private long dateOfCollection;


--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
@ -1,12 +1,14 @@
 package eu.dnetlib.dhp.collection;

 import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
 import eu.dnetlib.dhp.model.mdstore.Provenance;
 import eu.dnetlib.message.Message;
 import eu.dnetlib.message.MessageManager;
 import eu.dnetlib.message.MessageType;
 import org.apache.commons.cli.*;
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
@ -57,26 +59,11 @@ public class GenerateNativeStoreSparkJob {

    public static void main(String[] args) throws Exception {

-        Options options = generateApplicationArguments();
-
-
-        CommandLineParser parser = new DefaultParser();
-        CommandLine cmd = parser.parse( options, args);
-
-        final String encoding               = cmd.getOptionValue("e");
-        final long dateOfCollection         = new Long(cmd.getOptionValue("d"));
-        final String jsonProvenance         = cmd.getOptionValue("p");
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(GenerateNativeStoreSparkJob.class.getResourceAsStream("/eu/dnetlib/dhp/collection/collection_input_parameters.json")));
+        parser.parseArgument(args);
        final ObjectMapper jsonMapper       = new ObjectMapper();
-        final Provenance provenance         = jsonMapper.readValue(jsonProvenance, Provenance.class);
-        final String xpath                  = cmd.getOptionValue("x");
-        final String inputPath              = cmd.getOptionValue("i");
-        final String outputPath             = cmd.getOptionValue("o");
-        final String rabbitUser 			= cmd.getOptionValue("ru");
-        final String rabbitPassword		    = cmd.getOptionValue("rp");
-        final String rabbitHost 			= cmd.getOptionValue("rh");
-        final String rabbitOngoingQueue 	= cmd.getOptionValue("ro");
-        final String rabbitReportQueue  	= cmd.getOptionValue("rr");
-        final String workflowId 			= cmd.getOptionValue("w");
+        final Provenance provenance         = jsonMapper.readValue(parser.get("provenance"), Provenance.class);
+        final long dateOfCollection         = new Long(parser.get("dateOfCollection"));

        final SparkSession spark = SparkSession
                .builder()
@ -89,118 +76,31 @@ public class GenerateNativeStoreSparkJob {

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

-        final JavaPairRDD<IntWritable, Text> inputRDD = sc.sequenceFile(inputPath, IntWritable.class, Text.class);
+        final JavaPairRDD<IntWritable, Text> inputRDD = sc.sequenceFile(parser.get("input"), IntWritable.class, Text.class);

        final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");

        final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");

-        final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, false, null);
+        final MessageManager manager = new MessageManager(parser.get("rabbitHost"), parser.get("rabbitUser"), parser.get("rabbitPassword"), false, false, null);

-        final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), xpath, encoding, provenance, dateOfCollection, totalItems, invalidRecords))
+        final JavaRDD<MetadataRecord> mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"),provenance, dateOfCollection, totalItems, invalidRecords))
                .filter(Objects::nonNull).distinct();

        ongoingMap.put("ongoing", "0");
-        manager.sendMessage(new Message(workflowId,"DataFrameCreation", MessageType.ONGOING, ongoingMap ), rabbitOngoingQueue, true, false);
+        manager.sendMessage(new Message(parser.get("workflowId"),"DataFrameCreation", MessageType.ONGOING, ongoingMap ), parser.get("rabbitOngoingQueue"), true, false);

        final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
        final Dataset<MetadataRecord> mdstore = spark.createDataset(mappeRDD.rdd(), encoder);
        final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
        mdStoreRecords.add(mdstore.count());
        ongoingMap.put("ongoing", ""+ totalItems.value());
-        manager.sendMessage(new Message(workflowId,"DataFrameCreation", MessageType.ONGOING, ongoingMap ), rabbitOngoingQueue, true, false);
+        manager.sendMessage(new Message(parser.get("workflowId"),"DataFrameCreation", MessageType.ONGOING, ongoingMap ), parser.get("rabbitOngoingQueue"), true, false);

-        mdstore.write().format("parquet").save(outputPath);
+        mdstore.write().format("parquet").save(parser.get("output"));
        reportMap.put("inputItem" , ""+ totalItems.value());
        reportMap.put("invalidRecords", "" + invalidRecords.value());
        reportMap.put("mdStoreSize", "" + mdStoreRecords.value());
-        manager.sendMessage(new Message(workflowId,"Collection", MessageType.REPORT, reportMap ), rabbitReportQueue, true, false);
-    }
-
-    private static Options generateApplicationArguments() {
-        Options options = new Options();
-        options.addOption(Option.builder("e")
-                .longOpt("encoding")
-                .required(true)
-                .desc("the encoding type should be xml or json")
-                .hasArg() // This option has an argument.
-                .build());
-        options.addOption(Option.builder("d")
-                .longOpt("dateOfCollection")
-                .required(true)
-                .desc("the date of collection")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("p")
-                .longOpt("provenance")
-                .required(true)
-                .desc("the json Provenance information")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("x")
-                .longOpt("xpath")
-                .required(true)
-                .desc("xpath of the identifier")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("i")
-                .longOpt("input")
-                .required(true)
-                .desc("input path of the sequence file")
-                .hasArg() // This option has an argument.
-                .build());
-        options.addOption(Option.builder("o")
-                .longOpt("output")
-                .required(true)
-                .desc("output path of the mdstore")
-                .hasArg()
-                .build());
-
-        options.addOption(Option.builder("ru")
-                .longOpt("rabbitUser")
-                .required(true)
-                .desc("the user to connect with RabbitMq for messaging")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("rp")
-                .longOpt("rabbitPassWord")
-                .required(true)
-                .desc("the password to connect with RabbitMq for messaging")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("rh")
-                .longOpt("rabbitHost")
-                .required(true)
-                .desc("the host of the RabbitMq server")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("ro")
-                .longOpt("rabbitOngoingQueue")
-                .required(true)
-                .desc("the name of the ongoing queue")
-                .hasArg() // This option has an argument.
-                .build());
-
-        options.addOption(Option.builder("rr")
-                .longOpt("rabbitReportQueue")
-                .required(true)
-                .desc("the name of the report queue")
-                .hasArg() // This option has an argument.
-                .build());
-
-
-        options.addOption(Option.builder("w")
-                .longOpt("workflowId")
-                .required(true)
-                .desc("the identifier of the dnet Workflow")
-                .hasArg() // This option has an argument.
-                .build());
-        return options;
+        manager.sendMessage(new Message(parser.get("workflowId"),"Collection", MessageType.REPORT, reportMap ), parser.get("rabbitReportQueue"), true, false);
    }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json
@ -0,0 +1,14 @@
+[
+  {"paramName":"e",   "paramLongName":"encoding",           "paramDescription": "the encoding of the input record should be JSON or XML",   "paramRequired": true},
+  {"paramName":"d",   "paramLongName":"dateOfCollection",   "paramDescription": "the date when the record has been stored",                 "paramRequired": true},
+  {"paramName":"p",   "paramLongName":"provenance",         "paramDescription": "the infos about the provenance of the collected records",  "paramRequired": true},
+  {"paramName":"x",   "paramLongName":"xpath",              "paramDescription": "the xpath to identify the record ifentifier",              "paramRequired": true},
+  {"paramName":"i",   "paramLongName":"input",              "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true},
+  {"paramName":"o",   "paramLongName":"output",             "paramDescription": "the path of the result DataFrame on HDFS",                 "paramRequired": true},
+  {"paramName":"ru",  "paramLongName":"rabbitUser",         "paramDescription": "the user to connect with RabbitMq for messaging",          "paramRequired": true},
+  {"paramName":"rp",  "paramLongName":"rabbitPassword",     "paramDescription": "the password to connect with RabbitMq for messaging",      "paramRequired": true},
+  {"paramName":"rh",  "paramLongName":"rabbitHost",         "paramDescription": "the host of the RabbitMq server",                          "paramRequired": true},
+  {"paramName":"ro",  "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue",                            "paramRequired": true},
+  {"paramName":"rr",  "paramLongName":"rabbitReportQueue",  "paramDescription": "the name of the report queue",                             "paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workflowId",         "paramDescription": "the identifier of the dnet Workflow",                      "paramRequired": true}
+]