From cbe7c6734a1538c4f57ca7ae8e1649698c89ab14 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 2 Feb 2024 14:18:46 +0200 Subject: [PATCH] - Add documentation. - Code polishing/cleanup. --- .../dhp-continuous-validation/README.md | 28 +++++++- .../dhp-continuous-validation/pom.xml | 6 -- .../runOozieWorkflow.sh | 2 +- .../ContinuousValidation.java} | 64 +++++++++---------- ...put_continuous_validation_parameters.json} | 4 +- .../oozie_app/config-default.xml | 0 .../oozie_app/workflow.xml | 12 ++-- .../src/main/resources/logback.xml | 7 +- 8 files changed, 69 insertions(+), 54 deletions(-) rename dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/{continuous_validator/ContinuousValidator.java => continuous_validation/ContinuousValidation.java} (79%) rename dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/{continuous_validator/input_continuous_validator_parameters.json => continuous_validation/input_continuous_validation_parameters.json} (90%) rename dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/{continuous_validator => continuous_validation}/oozie_app/config-default.xml (100%) rename dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/{continuous_validator => continuous_validation}/oozie_app/workflow.xml (92%) diff --git a/dhp-workflows/dhp-continuous-validation/README.md b/dhp-workflows/dhp-continuous-validation/README.md index 7905a0e55..d1e8afb3a 100644 --- a/dhp-workflows/dhp-continuous-validation/README.md +++ b/dhp-workflows/dhp-continuous-validation/README.md @@ -2,10 +2,32 @@ This module is responsible for deploying an **Oozie Workflow** (on the desired cluster), which executes a **Spark** action.
This action takes the HDFS-path of a directory of parquet files containing metadata records, and applies the validation process on all of them, in parallel. Then it outputs the results, in json-format, in the given directory.
-The validation process is powered by the [**uoa-validator-engine2**](https://code-repo.d4science.org/MaDgIK/uoa-validator-engine2) software.
+The validation process is powered by the "[**uoa-validator-engine2**](https://code-repo.d4science.org/MaDgIK/uoa-validator-engine2)" software, +which is included as a dependency inside the main "pom.xml" file.
-### Install and run + +### Configure the workflow + +Set the desired values for each of the parameters defined in the "/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml" file.
+The most important parameters are the following: +- ***parquet_path***: the full path of the input parquet-file, or of a directory with parquet files, to be processed +- ***openaire_guidelines***: valid values: "4.0", "3.0", "2.0", "fair_data", "fair_literature_v4" +- ***output_path***: the path of the output-directory where the result-json-files will be stored. Be careful to use a base directory which is different from the one that this module is running on, as during a new deployment, that base directory will be deleted. + + +### Install the project and then deploy and run the workflow Run the **./installProject.sh** script and then the **./runOozieWorkflow.sh** script.
-[...] \ No newline at end of file +Use the "workflow-id" displayed by the "runOozieWorkflow.sh" script to check the running status and logs, on the remote machine, as follows: +- Check the status: `oozie job -oozie http://<oozie-server-host:port>/oozie -info <workflow-id>` +- Copy the "Job-id" from the output of the above command (numbers with ONE underscore between them). +- Check the job's logs (not the app's logs!): `yarn logs -applicationId application_<Job-id>` +

+ +**Note**:
+If you encounter any "java.lang.NoSuchFieldError" issues in the logs, rerun using the following steps: +- Delete some remote directories related to the workflow in your user's dir: /user/<your-username>/ + - ***.sparkStaging*** + - ***oozie-oozi*** +- Run the **./installProject.sh** script and then the **./runOozieWorkflow.sh** script. diff --git a/dhp-workflows/dhp-continuous-validation/pom.xml b/dhp-workflows/dhp-continuous-validation/pom.xml index 77c8969de..68d61f921 100644 --- a/dhp-workflows/dhp-continuous-validation/pom.xml +++ b/dhp-workflows/dhp-continuous-validation/pom.xml @@ -37,12 +37,6 @@ spark-sql_${scala.binary.version} - - - com.google.code.gson - gson - - \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh b/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh index 8ab67a39b..386f0b41f 100755 --- a/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh +++ b/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh @@ -7,7 +7,7 @@ CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} # Build and deploy this module. mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \ - -Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validator + -Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validation # Show the Oozie-job-ID. 
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n" diff --git a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validation/ContinuousValidation.java similarity index 79% rename from dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java rename to dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validation/ContinuousValidation.java index e7ce7d319..721cd23ba 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java +++ b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validation/ContinuousValidation.java @@ -1,13 +1,13 @@ -package eu.dnetlib.dhp.continuous_validator; +package eu.dnetlib.dhp.continuous_validation; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.util.Objects; +import java.util.Optional; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.validator2.validation.StandardValidationResult; -import eu.dnetlib.validator2.validation.XMLApplicationProfile; -import eu.dnetlib.validator2.validation.guideline.Guideline; -import eu.dnetlib.validator2.validation.guideline.StandardResult; -import eu.dnetlib.validator2.validation.guideline.openaire.*; -import eu.dnetlib.validator2.validation.utils.TestUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -16,23 +16,24 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.slf4j.LoggerFactory; -import java.io.File; -import java.nio.charset.StandardCharsets; -import java.util.Objects; -import 
java.util.Optional; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.validator2.validation.StandardValidationResult; +import eu.dnetlib.validator2.validation.XMLApplicationProfile; +import eu.dnetlib.validator2.validation.guideline.Guideline; +import eu.dnetlib.validator2.validation.guideline.StandardResult; +import eu.dnetlib.validator2.validation.guideline.openaire.*; +import eu.dnetlib.validator2.validation.utils.TestUtils; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +public class ContinuousValidation { -public class ContinuousValidator { - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class); - private static final String parametersFile = "input_continuous_validator_parameters.json"; + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidation.class); + private static final String parametersFile = "input_continuous_validation_parameters.json"; public static void main(String[] args) { ArgumentApplicationParser parser = null; Boolean isSparkSessionManaged = false; - String parquet_file_path = null; + String parquetPath = null; String guidelines = null; String outputPath = null; @@ -41,8 +42,8 @@ public class ContinuousValidator { .toString( Objects .requireNonNull( - ContinuousValidator.class - .getResourceAsStream("/eu/dnetlib/dhp/continuous_validator/" + parametersFile)), + ContinuousValidation.class + .getResourceAsStream("/eu/dnetlib/dhp/continuous_validation/" + parametersFile)), StandardCharsets.UTF_8); parser = new ArgumentApplicationParser(jsonConfiguration); @@ -62,22 +63,21 @@ public class ContinuousValidator { // unit test itself rather than inside the spark application" // Set the parquet input, either a parquet-file or a directory with parquet files. 
- parquet_file_path = parser.get("parquet_file_path"); - if (parquet_file_path == null) { - logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile); + parquetPath = parser.get("parquet_path"); + if (parquetPath == null) { + logger.error("The \"parquet_path\" was not retrieved from the parameters file: " + parametersFile); return; } guidelines = parser.get("openaire_guidelines"); if (guidelines == null) { - logger - .error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile); + logger.error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile); return; } - outputPath = parser.get("outputPath"); + outputPath = parser.get("output_path"); if (outputPath == null) { - logger.error("The \"outputPath\" was not retrieved from the parameters file: " + parametersFile); + logger.error("The \"output_path\" was not retrieved from the parameters file: " + parametersFile); return; } @@ -86,8 +86,8 @@ public class ContinuousValidator { logger .info( - "Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \"" - + guidelines + "\"" + " and will output the results in: " + outputPath); + "Will validate the contents of parquetFile: \"" + parquetPath + "\", against guidelines: \"" + + guidelines + "\"" + " and will output the results in the outputPath: " + outputPath); AbstractOpenAireProfile profile; switch (guidelines) { @@ -112,13 +112,13 @@ public class ContinuousValidator { } SparkConf conf = new SparkConf(); - conf.setAppName(ContinuousValidator.class.getSimpleName()); + conf.setAppName(ContinuousValidation.class.getSimpleName()); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.registerKryoClasses(new Class[] { XMLApplicationProfile.ValidationResult.class, Guideline.Result.class, StandardValidationResult.class, StandardResult.class }); - String finalParquet_file_path = 
parquet_file_path; + String finalParquetPath = parquetPath; String finalOutputPath = outputPath; runWithSparkSession(conf, isSparkSessionManaged, spark -> { @@ -132,7 +132,7 @@ public class ContinuousValidator { spark .read() - .parquet(finalParquet_file_path) + .parquet(finalParquetPath) .filter("encoding = 'XML' and id is not NULL and body is not NULL") .map(validateMapFunction, Encoders.bean(XMLApplicationProfile.ValidationResult.class)) .write() diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json similarity index 90% rename from dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json rename to dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json index c37683e82..98b0e791b 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json @@ -7,7 +7,7 @@ }, { "paramName": "prq_file", - "paramLongName": "parquet_file_path", + "paramLongName": "parquet_path", "paramDescription": "the full path of a parquet-file or a directory with parquet files, to be processed", "paramRequired": true }, @@ -19,7 +19,7 @@ }, { "paramName": "o", - "paramLongName": "outputPath", + "paramLongName": "output_path", "paramDescription": "the path of the output-directory where the result-json-files will be stored", "paramRequired": true } diff --git 
a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml rename to dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml similarity index 92% rename from dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml rename to dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml index 8296d5f5c..d0bb0a8bd 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml @@ -1,7 +1,7 @@ - parquet_file_path + parquet_path /var/lib/dnet/mdstore_PROD/md-7763c517-538d-4aa7-83f8-6096b3ce0d96/md-7763c517-538d-4aa7-83f8-6096b3ce0d96-1702622132535/store the full path of a parquet-file or a directory with parquet files, to be processed @@ -11,8 +11,8 @@ the version of the OpenAIRE Guidelines to validate the records against - outputPath - /user/lsmyrnaios/continuous_validator/output + output_path + /user/${dhp.hadoop.frontend.user.name}/continuous_validation/output the path of the output-directory where the result-json-files will be stored @@ -89,7 +89,7 @@ yarn cluster Validate multiple records against 
OpenAIRE Guidelines - eu.dnetlib.dhp.continuous_validator.ContinuousValidator + eu.dnetlib.dhp.continuous_validation.ContinuousValidation dhp-continuous-validation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -103,9 +103,9 @@ --conf spark.sql.shuffle.partitions=3840 - --parquet_file_path${parquet_file_path} + --parquet_path${parquet_path} --openaire_guidelines${openaire_guidelines} - --outputPath${outputPath} + --output_path${output_path} diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml index a5ea95f7d..a4c056149 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml @@ -4,10 +4,10 @@ - logs/ContinuousValidator.log + logs/ContinuousValidation.log - logs/ContinuousValidator.%i.log.zip + logs/ContinuousValidation.%i.log.zip 1 10 @@ -33,8 +33,7 @@ - - +