diff --git a/dhp-workflows/dhp-continuous-validation/README.md b/dhp-workflows/dhp-continuous-validation/README.md
index 7905a0e55..d1e8afb3a 100644
--- a/dhp-workflows/dhp-continuous-validation/README.md
+++ b/dhp-workflows/dhp-continuous-validation/README.md
@@ -2,10 +2,32 @@
This module is responsible for deploying an **Oozie Workflow** (on the desired cluster), which executes a **Spark** action.
This action takes the HDFS-path of a directory of parquet files containing metadata records, and applies the validation process on all of them, in parallel. Then it outputs the results, in json-format, in the given directory.
-The validation process is powered by the [**uoa-validator-engine2**](https://code-repo.d4science.org/MaDgIK/uoa-validator-engine2) software.
+The validation process is powered by the "[**uoa-validator-engine2**](https://code-repo.d4science.org/MaDgIK/uoa-validator-engine2)" software,
+which is included as a dependency inside the main "pom.xml" file.
-### Install and run
+
+### Configure the workflow
+
+Set the desired value for each of the parameters defined in the "/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml" file.
+The most important parameters are the following:
+- ***parquet_path***: the full path of the input parquet-file, or of a directory with parquet files, to be processed
+- ***openaire_guidelines***: valid values: "4.0", "3.0", "2.0", "fair_data", "fair_literature_v4"
+- ***output_path***: the path of the output-directory where the result-json-files will be stored. Be careful to use a base directory which is different from the one that this module is running on, because that base directory is deleted during a new deployment.
+
+
+### Install the project and then deploy and run the workflow
Run the **./installProject.sh** script and then the **./runOozieWorkflow.sh** script.
-[...]
\ No newline at end of file
+Use the "workflow-id" displayed by the "runOozieWorkflow.sh" script to check the running status and logs, on the remote machine, as follows:
+- Check the status: `oozie job -oozie http://<oozie-host>:<port>/oozie -info <workflow-id>`
+- Copy the "Job-id" from the output of the above command (numbers with ONE underscore between them).
+- Check the job's logs (not the app's logs!): `yarn logs -applicationId application_<numbers-of-the-Job-id>`
+
+
+**Note**:
+If you encounter any "java.lang.NoSuchFieldError" issues in the logs, rerun using the following steps:
+- Delete some remote directories related to the workflow in your user's dir: `/user/<username>/`
+ - ***.sparkStaging***
+ - ***oozie-oozi***
+- Run the **./installProject.sh** script and then the **./runOozieWorkflow.sh** script.
diff --git a/dhp-workflows/dhp-continuous-validation/pom.xml b/dhp-workflows/dhp-continuous-validation/pom.xml
index 77c8969de..68d61f921 100644
--- a/dhp-workflows/dhp-continuous-validation/pom.xml
+++ b/dhp-workflows/dhp-continuous-validation/pom.xml
@@ -37,12 +37,6 @@
spark-sql_${scala.binary.version}
-
-
- com.google.code.gson
- gson
-
-
\ No newline at end of file
diff --git a/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh b/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh
index 8ab67a39b..386f0b41f 100755
--- a/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh
+++ b/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh
@@ -7,7 +7,7 @@ CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Build and deploy this module.
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
- -Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validator
+ -Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validation
# Show the Oozie-job-ID.
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
diff --git a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validation/ContinuousValidation.java
similarity index 79%
rename from dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java
rename to dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validation/ContinuousValidation.java
index e7ce7d319..721cd23ba 100644
--- a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java
+++ b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validation/ContinuousValidation.java
@@ -1,13 +1,13 @@
-package eu.dnetlib.dhp.continuous_validator;
+package eu.dnetlib.dhp.continuous_validation;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+import java.util.Optional;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.validator2.validation.StandardValidationResult;
-import eu.dnetlib.validator2.validation.XMLApplicationProfile;
-import eu.dnetlib.validator2.validation.guideline.Guideline;
-import eu.dnetlib.validator2.validation.guideline.StandardResult;
-import eu.dnetlib.validator2.validation.guideline.openaire.*;
-import eu.dnetlib.validator2.validation.utils.TestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
@@ -16,23 +16,24 @@ import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.slf4j.LoggerFactory;
-import java.io.File;
-import java.nio.charset.StandardCharsets;
-import java.util.Objects;
-import java.util.Optional;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.validator2.validation.StandardValidationResult;
+import eu.dnetlib.validator2.validation.XMLApplicationProfile;
+import eu.dnetlib.validator2.validation.guideline.Guideline;
+import eu.dnetlib.validator2.validation.guideline.StandardResult;
+import eu.dnetlib.validator2.validation.guideline.openaire.*;
+import eu.dnetlib.validator2.validation.utils.TestUtils;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+public class ContinuousValidation {
-public class ContinuousValidator {
-
- private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class);
- private static final String parametersFile = "input_continuous_validator_parameters.json";
+ private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidation.class);
+ private static final String parametersFile = "input_continuous_validation_parameters.json";
public static void main(String[] args) {
ArgumentApplicationParser parser = null;
Boolean isSparkSessionManaged = false;
- String parquet_file_path = null;
+ String parquetPath = null;
String guidelines = null;
String outputPath = null;
@@ -41,8 +42,8 @@ public class ContinuousValidator {
.toString(
Objects
.requireNonNull(
- ContinuousValidator.class
- .getResourceAsStream("/eu/dnetlib/dhp/continuous_validator/" + parametersFile)),
+ ContinuousValidation.class
+ .getResourceAsStream("/eu/dnetlib/dhp/continuous_validation/" + parametersFile)),
StandardCharsets.UTF_8);
parser = new ArgumentApplicationParser(jsonConfiguration);
@@ -62,22 +63,21 @@ public class ContinuousValidator {
// unit test itself rather than inside the spark application"
// Set the parquet input, either a parquet-file or a directory with parquet files.
- parquet_file_path = parser.get("parquet_file_path");
- if (parquet_file_path == null) {
- logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile);
+ parquetPath = parser.get("parquet_path");
+ if (parquetPath == null) {
+ logger.error("The \"parquet_path\" was not retrieved from the parameters file: " + parametersFile);
return;
}
guidelines = parser.get("openaire_guidelines");
if (guidelines == null) {
- logger
- .error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile);
+ logger.error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile);
return;
}
- outputPath = parser.get("outputPath");
+ outputPath = parser.get("output_path");
if (outputPath == null) {
- logger.error("The \"outputPath\" was not retrieved from the parameters file: " + parametersFile);
+ logger.error("The \"output_path\" was not retrieved from the parameters file: " + parametersFile);
return;
}
@@ -86,8 +86,8 @@ public class ContinuousValidator {
logger
.info(
- "Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \""
- + guidelines + "\"" + " and will output the results in: " + outputPath);
+ "Will validate the contents of parquetFile: \"" + parquetPath + "\", against guidelines: \""
+ + guidelines + "\"" + " and will output the results in the outputPath: " + outputPath);
AbstractOpenAireProfile profile;
switch (guidelines) {
@@ -112,13 +112,13 @@ public class ContinuousValidator {
}
SparkConf conf = new SparkConf();
- conf.setAppName(ContinuousValidator.class.getSimpleName());
+ conf.setAppName(ContinuousValidation.class.getSimpleName());
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(new Class[] {
XMLApplicationProfile.ValidationResult.class, Guideline.Result.class, StandardValidationResult.class,
StandardResult.class
});
- String finalParquet_file_path = parquet_file_path;
+ String finalParquetPath = parquetPath;
String finalOutputPath = outputPath;
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@@ -132,7 +132,7 @@ public class ContinuousValidator {
spark
.read()
- .parquet(finalParquet_file_path)
+ .parquet(finalParquetPath)
.filter("encoding = 'XML' and id is not NULL and body is not NULL")
.map(validateMapFunction, Encoders.bean(XMLApplicationProfile.ValidationResult.class))
.write()
diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json
similarity index 90%
rename from dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json
rename to dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json
index c37683e82..98b0e791b 100644
--- a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json
+++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json
@@ -7,7 +7,7 @@
},
{
"paramName": "prq_file",
- "paramLongName": "parquet_file_path",
+ "paramLongName": "parquet_path",
"paramDescription": "the full path of a parquet-file or a directory with parquet files, to be processed",
"paramRequired": true
},
@@ -19,7 +19,7 @@
},
{
"paramName": "o",
- "paramLongName": "outputPath",
+ "paramLongName": "output_path",
"paramDescription": "the path of the output-directory where the result-json-files will be stored",
"paramRequired": true
}
diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/config-default.xml
similarity index 100%
rename from dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml
rename to dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/config-default.xml
diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml
similarity index 92%
rename from dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml
rename to dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml
index 8296d5f5c..d0bb0a8bd 100644
--- a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml
@@ -1,7 +1,7 @@
- parquet_file_path
+ parquet_path
/var/lib/dnet/mdstore_PROD/md-7763c517-538d-4aa7-83f8-6096b3ce0d96/md-7763c517-538d-4aa7-83f8-6096b3ce0d96-1702622132535/store
the full path of a parquet-file or a directory with parquet files, to be processed
@@ -11,8 +11,8 @@
the version of the OpenAIRE Guidelines to validate the records against
- outputPath
- /user/lsmyrnaios/continuous_validator/output
+ output_path
+ /user/${dhp.hadoop.frontend.user.name}/continuous_validation/output
the path of the output-directory where the result-json-files will be stored
@@ -89,7 +89,7 @@
yarn
cluster
Validate multiple records against OpenAIRE Guidelines
- eu.dnetlib.dhp.continuous_validator.ContinuousValidator
+ eu.dnetlib.dhp.continuous_validation.ContinuousValidation
dhp-continuous-validation-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
@@ -103,9 +103,9 @@
--conf spark.sql.shuffle.partitions=3840
- --parquet_file_path${parquet_file_path}
+ --parquet_path${parquet_path}
--openaire_guidelines${openaire_guidelines}
- --outputPath${outputPath}
+ --output_path${output_path}
diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml
index a5ea95f7d..a4c056149 100644
--- a/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml
+++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml
@@ -4,10 +4,10 @@
- logs/ContinuousValidator.log
+ logs/ContinuousValidation.log
- logs/ContinuousValidator.%i.log.zip
+ logs/ContinuousValidation.%i.log.zip
1
10
@@ -33,8 +33,7 @@
-
-
+