parent b5f4d37827
commit cbe7c6734a
@@ -2,10 +2,32 @@
 
 This module is responsible for deploying an **Oozie Workflow** (on the desired cluster), which executes a **Spark** action.<br>
 This action takes the HDFS-path of a directory of parquet files containing metadata records, and applies the validation process on all of them, in parallel. Then it outputs the results, in json-format, in the given directory.<br>
-The validation process is powered by the [**uoa-validator-engine2**](https://code-repo.d4science.org/MaDgIK/uoa-validator-engine2) software.<br>
+The validation process is powered by the "[**uoa-validator-engine2**](https://code-repo.d4science.org/MaDgIK/uoa-validator-engine2)" software,
+which is included as a dependency inside the main "pom.xml" file.<br>
 
-### Install and run
+### Configure the workflow
 
+Add the wanted values for each of the parameters, defined in the "/src/main/resources/eu/dnetlib/dhp/continuous_validation/oozie_app/workflow.xml" file.<br>
+The most important parameters are the following:
+- ***parquet_path***: the input parquet
+- ***openaire_guidelines***: valid values: "4.0", "3.0", "2.0", "fair_data", "fair_literature_v4"
+- ***output_path***: Be careful to use a base directory which is different from the one that this module is running on, as during a new deployment, that base directory will be deleted.
+
+
+### Install the project and then deploy and run the workflow
+
 Run the **./installProject.sh** script and then the **./runOozieWorkflow.sh** script.<br>
 
-[...]
+Use the "workflow-id" displayed by the "runOozieWorkflow.sh" script to check the running status and logs, in the remote machine, as follows:
+- Check the status: `oozie job -oozie http://<cluster's domain and port>/oozie -info <Workflow-ID>`
+- Copy the "Job-id" from the output of the above command (numbers with ONE underscore between them).
+- Check the job's logs (not the app's logs!): `yarn logs -applicationId application_<Job-ID>`
+
+<br><br>
+
+**Note**:<br>
+If you encounter any "java.lang.NoSuchFieldError" issues in the logs, rerun using the following steps:
+- Delete some remote directories related to the workflow in your user's dir: /user/<userName>/
+  - ***.sparkStaging***
+  - ***oozie-oozi***
+- Run the **./installProject.sh** script and then the **./runOozieWorkflow.sh** script.
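For context, the Spark action this README describes boils down to: read the parquet records, keep the XML ones, validate each against the chosen guidelines, and write json results. A minimal self-contained sketch of that flow (only the filter expression and the parquet-in/json-out shape come from this commit; class and variable names are illustrative, and the per-record validation step is left out because it depends on the uoa-validator-engine2 API):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Minimal sketch of the flow described in the README: read parquet records,
// keep well-formed XML rows, and write the (validated) results as json.
public class ValidationFlowSketch {
	public static void main(String[] args) {
		String parquetPath = args[0]; // HDFS path of a parquet file or directory
		String outputPath = args[1]; // output directory for the json results

		SparkSession spark = SparkSession.builder().appName("ValidationFlowSketch").getOrCreate();

		Dataset<Row> records = spark
			.read()
			.parquet(parquetPath)
			.filter("encoding = 'XML' and id is not NULL and body is not NULL");

		// The real module maps each record through the guideline-validation engine here.
		records.write().json(outputPath);

		spark.stop();
	}
}
```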
@@ -37,12 +37,6 @@
 			<artifactId>spark-sql_${scala.binary.version}</artifactId>
 		</dependency>
 
-		<!-- Other dependencies. -->
-		<dependency>
-			<groupId>com.google.code.gson</groupId>
-			<artifactId>gson</artifactId>
-		</dependency>
-
 	</dependencies>
 
 </project>
@@ -7,7 +7,7 @@ CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
 
 # Build and deploy this module.
 mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
--Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validator
+-Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validation
 
 # Show the Oozie-job-ID.
 echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
@@ -1,13 +1,13 @@
 
-package eu.dnetlib.dhp.continuous_validator;
+package eu.dnetlib.dhp.continuous_validation;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.validator2.validation.StandardValidationResult;
-import eu.dnetlib.validator2.validation.XMLApplicationProfile;
-import eu.dnetlib.validator2.validation.guideline.Guideline;
-import eu.dnetlib.validator2.validation.guideline.StandardResult;
-import eu.dnetlib.validator2.validation.guideline.openaire.*;
-import eu.dnetlib.validator2.validation.utils.TestUtils;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -16,23 +16,24 @@ import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.nio.charset.StandardCharsets;
-import java.util.Objects;
-import java.util.Optional;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-public class ContinuousValidator {
-	private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class);
-	private static final String parametersFile = "input_continuous_validator_parameters.json";
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.validator2.validation.StandardValidationResult;
+import eu.dnetlib.validator2.validation.XMLApplicationProfile;
+import eu.dnetlib.validator2.validation.guideline.Guideline;
+import eu.dnetlib.validator2.validation.guideline.StandardResult;
+import eu.dnetlib.validator2.validation.guideline.openaire.*;
+import eu.dnetlib.validator2.validation.utils.TestUtils;
+
+public class ContinuousValidation {
+
+	private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidation.class);
+	private static final String parametersFile = "input_continuous_validation_parameters.json";
 
 	public static void main(String[] args) {
 
 		ArgumentApplicationParser parser = null;
 		Boolean isSparkSessionManaged = false;
-		String parquet_file_path = null;
+		String parquetPath = null;
 		String guidelines = null;
 		String outputPath = null;
 
@@ -41,8 +42,8 @@ public class ContinuousValidator {
 				.toString(
 					Objects
 						.requireNonNull(
-							ContinuousValidator.class
-								.getResourceAsStream("/eu/dnetlib/dhp/continuous_validator/" + parametersFile)),
+							ContinuousValidation.class
+								.getResourceAsStream("/eu/dnetlib/dhp/continuous_validation/" + parametersFile)),
 					StandardCharsets.UTF_8);
 
 			parser = new ArgumentApplicationParser(jsonConfiguration);
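As a quick cross-check of the renamed resource path above, the same load can be exercised outside Spark; a minimal sketch (the resource path and file name follow this diff, the class around them is illustrative):

```java
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Objects;

import org.apache.commons.io.IOUtils;

// Sketch: load the parameters definition from the renamed resource directory,
// exactly as the code above does before feeding it to ArgumentApplicationParser.
public class ParametersResourceCheck {
	public static void main(String[] args) throws Exception {
		InputStream in = Objects
			.requireNonNull(
				ParametersResourceCheck.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/continuous_validation/input_continuous_validation_parameters.json"));
		System.out.println(IOUtils.toString(in, StandardCharsets.UTF_8));
	}
}
```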
@@ -62,22 +63,21 @@ public class ContinuousValidator {
 		// unit test itself rather than inside the spark application"
 
 		// Set the parquet input, either a parquet-file or a directory with parquet files.
-		parquet_file_path = parser.get("parquet_file_path");
-		if (parquet_file_path == null) {
-			logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile);
+		parquetPath = parser.get("parquet_path");
+		if (parquetPath == null) {
+			logger.error("The \"parquet_path\" was not retrieved from the parameters file: " + parametersFile);
 			return;
 		}
 
 		guidelines = parser.get("openaire_guidelines");
 		if (guidelines == null) {
-			logger
-				.error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile);
+			logger.error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile);
 			return;
 		}
 
-		outputPath = parser.get("outputPath");
+		outputPath = parser.get("output_path");
 		if (outputPath == null) {
-			logger.error("The \"outputPath\" was not retrieved from the parameters file: " + parametersFile);
+			logger.error("The \"output_path\" was not retrieved from the parameters file: " + parametersFile);
 			return;
 		}
 
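The three look-ups above repeat one fetch-log-return pattern. If a later cleanup is ever wanted, it could be factored along these lines (a sketch only, not part of this commit; `getRequired` is a hypothetical helper):

```java
import java.util.Optional;
import java.util.function.UnaryOperator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical helper for the repeated "get parameter or log and bail out" pattern above.
public final class RequiredParams {
	private static final Logger logger = LoggerFactory.getLogger(RequiredParams.class);

	private RequiredParams() {
	}

	public static Optional<String> getRequired(UnaryOperator<String> getter, String name, String parametersFile) {
		String value = getter.apply(name);
		if (value == null)
			logger.error("The \"" + name + "\" was not retrieved from the parameters file: " + parametersFile);
		return Optional.ofNullable(value);
	}
}
```

Call sites would then shrink to, e.g., `parquetPath = RequiredParams.getRequired(parser::get, "parquet_path", parametersFile).orElse(null);`.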
@@ -86,8 +86,8 @@ public class ContinuousValidator {
 
 		logger
 			.info(
-				"Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \""
-					+ guidelines + "\"" + " and will output the results in: " + outputPath);
+				"Will validate the contents of parquetFile: \"" + parquetPath + "\", against guidelines: \""
+					+ guidelines + "\"" + " and will output the results in the outputPath: " + outputPath);
 
 		AbstractOpenAireProfile profile;
 		switch (guidelines) {
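The `switch (guidelines)` entered above maps the `openaire_guidelines` value to a concrete profile. Only `AbstractOpenAireProfile` is visible in this diff, so the following sketch uses stand-in types; it merely illustrates the dispatch over the five values the README lists (the human-readable descriptions are assumptions):

```java
import java.util.Map;

// Stand-in sketch of the guideline dispatch: the five documented values map to
// concrete profiles (in the real code, subclasses of AbstractOpenAireProfile).
public class GuidelinesDispatchSketch {
	interface Profile {
		String describe();
	}

	private static final Map<String, Profile> PROFILES = Map.of(
		"4.0", () -> "OpenAIRE literature guidelines v4", // descriptions are assumptions
		"3.0", () -> "OpenAIRE literature guidelines v3",
		"2.0", () -> "OpenAIRE data-archive guidelines v2",
		"fair_data", () -> "FAIR guidelines for data",
		"fair_literature_v4", () -> "FAIR guidelines for literature (v4)");

	public static void main(String[] args) {
		Profile profile = PROFILES.get(args.length > 0 ? args[0] : "4.0");
		if (profile == null)
			throw new IllegalArgumentException("Unknown openaire_guidelines value");
		System.out.println(profile.describe());
	}
}
```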
@@ -112,13 +112,13 @@ public class ContinuousValidator {
 		}
 
 		SparkConf conf = new SparkConf();
-		conf.setAppName(ContinuousValidator.class.getSimpleName());
+		conf.setAppName(ContinuousValidation.class.getSimpleName());
 		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
 		conf.registerKryoClasses(new Class[] {
 			XMLApplicationProfile.ValidationResult.class, Guideline.Result.class, StandardValidationResult.class,
 			StandardResult.class
 		});
-		String finalParquet_file_path = parquet_file_path;
+		String finalParquetPath = parquetPath;
 		String finalOutputPath = outputPath;
 
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@@ -132,7 +132,7 @@ public class ContinuousValidator {
 
 			spark
 				.read()
-				.parquet(finalParquet_file_path)
+				.parquet(finalParquetPath)
 				.filter("encoding = 'XML' and id is not NULL and body is not NULL")
 				.map(validateMapFunction, Encoders.bean(XMLApplicationProfile.ValidationResult.class))
 				.write()
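The `validateMapFunction` used above is defined outside this hunk, but the `.map(..., Encoders.bean(XMLApplicationProfile.ValidationResult.class))` call pins down its shape: a `MapFunction<Row, XMLApplicationProfile.ValidationResult>`. A stand-in sketch of that shape (the bean type and the validation body are illustrative):

```java
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

// Sketch of the map step's shape: a Row goes in, a bean-encodable result comes out.
// ResultBean stands in for XMLApplicationProfile.ValidationResult; the real body
// applies the chosen guideline profile to the record's "body" XML.
public class ValidateMapSketch {
	public static class ResultBean {
		private String id;
		private boolean valid;

		public String getId() { return id; }
		public void setId(String id) { this.id = id; }
		public boolean isValid() { return valid; }
		public void setValid(boolean valid) { this.valid = valid; }
	}

	public static final MapFunction<Row, ResultBean> VALIDATE = row -> {
		ResultBean result = new ResultBean();
		result.setId(row.getAs("id"));
		result.setValid(row.getAs("body") != null); // placeholder for the real validation
		return result;
	};
}
```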
@@ -7,7 +7,7 @@
 	},
 	{
 		"paramName": "prq_file",
-		"paramLongName": "parquet_file_path",
+		"paramLongName": "parquet_path",
 		"paramDescription": "the full path of a parquet-file or a directory with parquet files, to be processed",
 		"paramRequired": true
 	},
@@ -19,7 +19,7 @@
 	},
 	{
 		"paramName": "o",
-		"paramLongName": "outputPath",
+		"paramLongName": "output_path",
 		"paramDescription": "the path of the output-directory where the result-json-files will be stored",
 		"paramRequired": true
 	}
@@ -1,7 +1,7 @@
 <workflow-app name="Validate metadata records against OpenAIRE Guidelines" xmlns="uri:oozie:workflow:0.5">
 	<parameters>
 		<property>
-			<name>parquet_file_path</name>
+			<name>parquet_path</name>
 			<value>/var/lib/dnet/mdstore_PROD/md-7763c517-538d-4aa7-83f8-6096b3ce0d96/md-7763c517-538d-4aa7-83f8-6096b3ce0d96-1702622132535/store</value>
 			<description>the full path of a parquet-file or a directory with parquet files, to be processed</description>
 		</property>
@@ -11,8 +11,8 @@
 			<description>the version of the OpenAIRE Guidelines to validate the records against</description>
 		</property>
 		<property>
-			<name>outputPath</name>
-			<value>/user/lsmyrnaios/continuous_validator/output</value>
+			<name>output_path</name>
+			<value>/user/${dhp.hadoop.frontend.user.name}/continuous_validation/output</value>
 			<description>the path of the output-directory where the result-json-files will be stored</description>
 		</property>
 		<property>
@@ -89,7 +89,7 @@
 			<master>yarn</master>
 			<mode>cluster</mode>
 			<name>Validate multiple records against OpenAIRE Guidelines</name>
-			<class>eu.dnetlib.dhp.continuous_validator.ContinuousValidator</class>
+			<class>eu.dnetlib.dhp.continuous_validation.ContinuousValidation</class>
 			<jar>dhp-continuous-validation-${projectVersion}.jar</jar>
 			<spark-opts>
 				--executor-memory=${sparkExecutorMemory}
@@ -103,9 +103,9 @@
 				--conf spark.sql.shuffle.partitions=3840
 			</spark-opts>
 			<!-- Arguments passed to the "main" method of the class defined above. -->
-			<arg>--parquet_file_path</arg><arg>${parquet_file_path}</arg>
+			<arg>--parquet_path</arg><arg>${parquet_path}</arg>
 			<arg>--openaire_guidelines</arg><arg>${openaire_guidelines}</arg>
-			<arg>--outputPath</arg><arg>${outputPath}</arg>
+			<arg>--output_path</arg><arg>${output_path}</arg>
 		</spark>
 		<ok to="End"/>
 		<error to="Kill"/>
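For reference, the three renamed `<arg>` pairs above reach the application's `main(String[])` as one flat array; an illustrative invocation (the parquet value is the workflow default shown in this diff, the other two are sample values):

```java
// Illustrative only: how the Oozie <arg> elements line up as main(String[] args).
public class InvocationSketch {
	public static void main(String[] ignored) {
		String[] args = {
			"--parquet_path",
			"/var/lib/dnet/mdstore_PROD/md-7763c517-538d-4aa7-83f8-6096b3ce0d96/md-7763c517-538d-4aa7-83f8-6096b3ce0d96-1702622132535/store",
			"--openaire_guidelines", "4.0",
			"--output_path", "/user/<userName>/continuous_validation/output" // sample path
		};
		eu.dnetlib.dhp.continuous_validation.ContinuousValidation.main(args);
	}
}
```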
@@ -4,10 +4,10 @@
 
 	<!-- <Appenders>-->
 	<appender name="RollingFile" class="ch.qos.logback.core.rolling.RollingFileAppender">
-		<file>logs/ContinuousValidator.log</file>
+		<file>logs/ContinuousValidation.log</file>
 
 		<rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
-			<fileNamePattern>logs/ContinuousValidator.%i.log.zip</fileNamePattern>
+			<fileNamePattern>logs/ContinuousValidation.%i.log.zip</fileNamePattern>
 			<minIndex>1</minIndex>
 			<maxIndex>10</maxIndex>
 		</rollingPolicy>
@@ -33,8 +33,7 @@
 		<appender-ref ref="Console" />
 	</root>
 
-	<!-- TODO - Change the level below to "debug" -->
-	<logger name="eu.dnetlib.dhp.continuous_validator" level="trace"/>
+	<logger name="eu.dnetlib.dhp.continuous_validation" level="debug"/>
 	<logger name="eu.dnetlib.validator2" level="error"/>
 	<logger name="org.sparkproject" level="info"/>
 	<logger name="org.apache.spark" level="info"/>