Continuous Validation Workflow #388

Open
lsmyrnaios wants to merge 18 commits from lsmyrnaios/dnet-hadoop:continuous_validation2 into beta
4 changed files with 28 additions and 172 deletions
Showing only changes of commit b71633fd7f


@@ -18,10 +18,14 @@
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<jackson.version>2.14.3</jackson.version> <!-- Scala doesn't want a higher version than 2.14.x -->
</properties>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>uoa-validator-engine2</artifactId>
@@ -33,10 +37,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<!--<scope>provided</scope>-->
<!--
<scope>compile</scope>
-->
<scope>provided</scope>
<exclusions>
<!-- This is an older version which causes problems. We have to add the latest version independently. -->
@@ -70,8 +71,8 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<!--<scope>provided</scope>-->
<!--<scope>compile</scope>-->
<scope>provided</scope>
<exclusions>
<!-- This exclusion is a must for scala 2.11 and spark 2.4.0.cloudera2 -->
<exclusion>
@@ -94,8 +95,8 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<!--<version>${dhp.hadoop.version}</version>-->
<!--<scope>compile</scope>-->
<version>${dhp.hadoop.version}</version>
<scope>provided</scope> <!-- This is required here when setting the "version" above, even if that version is the same as the one used in the main pom, where the dependency already has the "provided" scope. -->
<exclusions>
<exclusion>
@@ -144,40 +145,12 @@
</exclusions>
</dependency>
<!-- https://mvnrepository.com/artifact/io.dropwizard.metrics/metrics-core -->
<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
<version>4.2.22</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-app -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-app</artifactId>
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
@@ -206,56 +179,13 @@
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
<!--<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${dhp.hadoop.version}</version>
&lt;!&ndash; <scope>compile</scope>&ndash;&gt;
<exclusions>
<exclusion>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-reload4j</artifactId>
</exclusion>
<exclusion>
<groupId>ch.qos.reload4j</groupId>
<artifactId>reload4j</artifactId>
</exclusion>
&lt;!&ndash; Vulnerable dependencies: &ndash;&gt;
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty</artifactId>
</exclusion>
</exclusions>
</dependency>-->
<!-- Add back some updated version of the needed dependencies. -->
<!-- This should be enabled only when using Hadoop 3.0.0+ -->
<!--<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
</dependency>-->
<dependency> <!-- Newer versions (>=0.18.X) are not compatible with Java 8. -->
<groupId>org.apache.thrift</groupId>
@@ -273,7 +203,7 @@
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
<version>${google.gson.version}</version>
</dependency>
<!-- logback versions 1.4.X require Java 11 -->
@@ -308,68 +238,8 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
</dependencies>
<!--<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.5.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<transformers>
&lt;!&ndash; add Main-Class to manifest file &ndash;&gt;
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.continuous_validator.ContinuousValidator</mainClass>
</transformer>
</transformers>
<filters>
&lt;!&ndash; filter manifest signature files when creating uber-jar &ndash;&gt;
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/INDEX.LIST</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>LICENSE*</exclude>
</excludes>
</filter>
</filters>
<createDependencyReducedPom>false</createDependencyReducedPom>
&lt;!&ndash; The "minimize" config breaks things at runtime. &ndash;&gt;
&lt;!&ndash;<minimizeJar>true</minimizeJar>&ndash;&gt;
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.2</version>
<configuration>
&lt;!&ndash;<excludes>
<exclude>some test to exclude here</exclude>
</excludes>&ndash;&gt;
</configuration>
</plugin>
</plugins>
</build>-->
<repositories>
<repository>
<id>libs</id>


@@ -33,7 +33,7 @@ import scala.Option;
public class ContinuousValidator {
public static final String TEST_FILES_V4_DIR = TestUtils.TEST_FILES_BASE_DIR + "openaireguidelinesV4/";
public static final String RESULTS_FILE = "results.json";
public static final String RESULTS_FILE_NAME = "results.json";
private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class);
private static final String parametersFile = "input_continuous_validator_parameters.json";
@@ -60,32 +60,20 @@ public class ContinuousValidator {
parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
String isSParkSessionManagedStr = parser.get("isSparkSessionManaged");
if (isSParkSessionManagedStr == null) {
logger
.error(
"The \"isSParkSessionManagedStr\" was not retrieved from the parameters file: "
+ parametersFile);
return;
}
// This "is needed to implement a unit test in which the spark session is created in the context of the
// unit test itself rather than inside the spark application"
isSparkSessionManaged = Optional
.of(isSParkSessionManagedStr)
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// TODO - If the above is tru,e then the Spark-session defined in the unit-test should be used..
} catch (Exception e) {
logger.error("Error when parsing the parameters!", e);
return;
}
isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) // This param is not mandatory, so it may be null.
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// This is needed to implement a unit test in which the spark session is created in the context of the
// unit test itself rather than inside the spark application
parquet_file_path = parser.get("parquet_file_path");
if (parquet_file_path == null) {
logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile);
@@ -125,10 +113,8 @@ public class ContinuousValidator {
logger
.info(
"Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \""
+ guidelines + "\"" + " and will output the results in: " + outputPath + RESULTS_FILE);
+ guidelines + "\"" + " and will output the results in: " + outputPath + RESULTS_FILE_NAME);
// TODO - USE THE "runWithSparkSession" METHOD TO RUN THE SPARK CODE INSIDE!!
AbstractOpenAireProfile profile = new LiteratureGuidelinesV4Profile();
SparkConf conf = new SparkConf();
@@ -168,7 +154,7 @@ public class ContinuousValidator {
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(finalOutputPath + RESULTS_FILE); // The filename should be the name of the input-file or the
.json(finalOutputPath + RESULTS_FILE_NAME); // The filename should be the name of the input-file or the
// input-directory.
if (logger.isDebugEnabled()) {
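
Reviewer note: the TODO above asks for the Spark code to be wrapped in the "runWithSparkSession" helper. Below is a minimal, self-contained sketch of that pattern using plain Spark APIs only; the helper name "withSparkSession", the class name, and the input/output paths are illustrative placeholders, not the actual dhp-common API, which should be preferred here.

import java.util.function.Consumer;

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SparkSessionSketch {

	// Stand-in for the "runWithSparkSession" idea: create (or reuse) a session, run the
	// given code, and stop the session only when this method manages its lifecycle.
	// In a unit test, "isSparkSessionManaged" would be false: the test creates the session
	// beforehand, getOrCreate() reuses it here, and the test closes it afterwards.
	public static void withSparkSession(SparkConf conf, boolean isSparkSessionManaged,
		Consumer<SparkSession> sparkCode) {
		SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
		try {
			sparkCode.accept(spark);
		} finally {
			if (isSparkSessionManaged)
				spark.stop();
		}
	}

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("ContinuousValidatorSketch").setMaster("local[*]");
		String parquetFilePath = "/tmp/input_records.parquet"; // hypothetical path, for illustration only
		String outputPath = "/tmp/validation_output/"; // hypothetical path, for illustration only
		withSparkSession(conf, true, spark -> {
			spark
				.read()
				.parquet(parquetFilePath)
				.write()
				.option("compression", "gzip")
				.mode(SaveMode.Overwrite)
				.json(outputPath + "results.json");
		});
	}
}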


@@ -17,7 +17,7 @@ public class ReadResultsTest {
try {
List standardValidationResultList = new Gson()
.fromJson(new BufferedReader(new FileReader(ContinuousValidator.RESULTS_FILE)), List.class);
.fromJson(new BufferedReader(new FileReader(ContinuousValidator.RESULTS_FILE_NAME)), List.class);
if (standardValidationResultList == null)
logger.error("Could not map the json to a \"List\" object.");
else if (standardValidationResultList.isEmpty())
@@ -25,9 +25,9 @@ public class ReadResultsTest {
else
logger.info(standardValidationResultList.toString());
} catch (FileNotFoundException fnfe) {
logger.error("The results-file \"" + ContinuousValidator.RESULTS_FILE + "\" does not exist!");
logger.error("The results-file \"" + ContinuousValidator.RESULTS_FILE_NAME + "\" does not exist!");
} catch (Exception e) {
logger.error("Error when reading the json-results-file \"" + ContinuousValidator.RESULTS_FILE + "\"", e);
logger.error("Error when reading the json-results-file \"" + ContinuousValidator.RESULTS_FILE_NAME + "\"", e);
}
}
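
Reviewer note: reading the results into a raw List loses the element type (Gson falls back to LinkedTreeMap for each entry). Below is a small sketch of a typed alternative, assuming the results file is a single JSON array readable from the working directory; the file name "results.json" and the Map-based element type are placeholders, since the actual validation-result class is not shown in this diff.

import java.io.BufferedReader;
import java.io.FileReader;
import java.lang.reflect.Type;
import java.util.List;
import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class ReadResultsTypedSketch {

	public static void main(String[] args) throws Exception {
		// Parameterized target type, so Gson produces List<Map<String, Object>> instead of a raw List.
		Type resultListType = new TypeToken<List<Map<String, Object>>>() {
		}.getType();

		try (BufferedReader reader = new BufferedReader(new FileReader("results.json"))) {
			List<Map<String, Object>> results = new Gson().fromJson(reader, resultListType);
			if (results == null || results.isEmpty())
				System.err.println("No validation results could be read from the file.");
			else
				results.forEach(System.out::println);
		}
	}
}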