- Fix the location of the "input_continuous_validator_parameters.json" file.

- Fix handling of the "isSparkSessionManaged" parameter (see the sketch after this list).
- Add the "provided" scope to some dependencies; they do not inherit it from the main pom once a "version" tag is declared, even though the value is the same as the one in the main pom.
- Code polishing / cleanup.
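
The gist of the "isSparkSessionManaged" fix, as a minimal sketch mirroring the change in "ContinuousValidator" below (the parameter is optional, so a missing value now falls back to "true" instead of aborting):

    // The param is not mandatory, so "parser.get()" may return null; default to a managed session.
    Boolean isSparkSessionManaged = Optional
        .ofNullable(parser.get("isSparkSessionManaged"))
        .map(Boolean::valueOf)
        .orElse(Boolean.TRUE);
    logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
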
This commit is contained in:
Lampros Smyrnaios 2023-12-15 18:29:38 +02:00
parent 9e6a03e4e2
commit b71633fd7f
4 changed files with 28 additions and 172 deletions

View File

@@ -18,10 +18,14 @@
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<jackson.version>2.14.3</jackson.version> <!-- Scala doesn't want a higher version than 2.14.x -->
</properties>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>uoa-validator-engine2</artifactId>
@@ -33,10 +37,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<!--<scope>provided</scope>-->
<!--
<scope>compile</scope>
-->
<scope>provided</scope>
<exclusions>
<!-- This is an older version which causes problems. We have to add the latest version independently. -->
@@ -70,8 +71,8 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<!--<scope>provided</scope>-->
<!--<scope>compile</scope>-->
<scope>provided</scope>
<exclusions>
<!-- This exclusion is a must for scala 2.11 and spark 2.4.0.cloudera2 -->
<exclusion>
@@ -94,8 +95,8 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<!--<version>${dhp.hadoop.version}</version>-->
<!--<scope>compile</scope>-->
<version>${dhp.hadoop.version}</version>
<scope>provided</scope> <!-- This is required here when setting the "version" above, even if that version is the same as the one used in the main pom, where the dependency includes the "provided" scope. -->
<exclusions>
<exclusion>
@@ -144,40 +145,12 @@
</exclusions>
</dependency>
<!-- https://mvnrepository.com/artifact/io.dropwizard.metrics/metrics-core -->
<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
<version>4.2.22</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-app -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-app</artifactId>
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
@@ -206,56 +179,13 @@
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
<!--<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${dhp.hadoop.version}</version>
&lt;!&ndash; <scope>compile</scope>&ndash;&gt;
<exclusions>
<exclusion>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-reload4j</artifactId>
</exclusion>
<exclusion>
<groupId>ch.qos.reload4j</groupId>
<artifactId>reload4j</artifactId>
</exclusion>
&lt;!&ndash; Vulnerable dependencies: &ndash;&gt;
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty</artifactId>
</exclusion>
</exclusions>
</dependency>-->
<!-- Add back some updated version of the needed dependencies. -->
<!-- This should be enabled only when using Hadoop 3.0.0+ -->
<!--<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
</dependency>-->
<dependency> <!-- Newer versions (>=0.18.X) are not compatible with JAVA 8. -->
<groupId>org.apache.thrift</groupId>
@@ -273,7 +203,7 @@
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
<version>${google.gson.version}</version>
</dependency>
<!-- logback versions 1.4.X require Java-11 -->
@@ -308,68 +238,8 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<!-- <scope>compile</scope>-->
</dependency>
</dependencies>
<!--<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.5.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<transformers>
&lt;!&ndash; add Main-Class to manifest file &ndash;&gt;
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.continuous_validator.ContinuousValidator</mainClass>
</transformer>
</transformers>
<filters>
&lt;!&ndash; filter manifest signature files when creating uber-jar &ndash;&gt;
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/INDEX.LIST</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>LICENSE*</exclude>
</excludes>
</filter>
</filters>
<createDependencyReducedPom>false</createDependencyReducedPom>
&lt;!&ndash; The "minimize" config breaks things at runtime. &ndash;&gt;
&lt;!&ndash;<minimizeJar>true</minimizeJar>&ndash;&gt;
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.2</version>
<configuration>
&lt;!&ndash;<excludes>
<exclude>some test to exclude here</exclude>
</excludes>&ndash;&gt;
</configuration>
</plugin>
</plugins>
</build>-->
<repositories>
<repository>
<id>libs</id>

View File

@@ -33,7 +33,7 @@ import scala.Option;
public class ContinuousValidator {
public static final String TEST_FILES_V4_DIR = TestUtils.TEST_FILES_BASE_DIR + "openaireguidelinesV4/";
public static final String RESULTS_FILE = "results.json";
public static final String RESULTS_FILE_NAME = "results.json";
private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class);
private static final String parametersFile = "input_continuous_validator_parameters.json";
@@ -60,32 +60,20 @@ public class ContinuousValidator {
parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
String isSParkSessionManagedStr = parser.get("isSparkSessionManaged");
if (isSParkSessionManagedStr == null) {
logger
.error(
"The \"isSParkSessionManagedStr\" was not retrieved from the parameters file: "
+ parametersFile);
return;
}
// This "is needed to implement a unit test in which the spark session is created in the context of the
// unit test itself rather than inside the spark application"
isSparkSessionManaged = Optional
.of(isSParkSessionManagedStr)
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// TODO - If the above is tru,e then the Spark-session defined in the unit-test should be used..
} catch (Exception e) {
logger.error("Error when parsing the parameters!", e);
return;
}
isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) // This param is not mandatory, so it may be null.
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// This is needed to implement a unit test in which the spark session is created in the context of the
// unit test itself rather than inside the spark application.
parquet_file_path = parser.get("parquet_file_path");
if (parquet_file_path == null) {
logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile);
@@ -125,10 +113,8 @@ public class ContinuousValidator {
logger
.info(
"Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \""
+ guidelines + "\"" + " and will output the results in: " + outputPath + RESULTS_FILE);
+ guidelines + "\"" + " and will output the results in: " + outputPath + RESULTS_FILE_NAME);
// TODO - USE THE "runWithSparkSession" METHOD TO RUN THE SPARK CODE INSIDE!!
AbstractOpenAireProfile profile = new LiteratureGuidelinesV4Profile();
SparkConf conf = new SparkConf();
@@ -168,7 +154,7 @@ public class ContinuousValidator {
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(finalOutputPath + RESULTS_FILE); // The filename should be the name of the input-file or the
.json(finalOutputPath + RESULTS_FILE_NAME); // The filename should be the name of the input-file or the
// input-directory.
if (logger.isDebugEnabled()) {

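Regarding the "runWithSparkSession" TODO above, a hedged sketch of what that wrapping could look like, assuming dhp-common's "SparkSessionSupport.runWithSparkSession(SparkConf, Boolean, consumer-of-SparkSession)" helper (the exact class and signature should be verified against dhp-common); the variable names come from the existing code:

    // Run the validation logic inside a session that is either created here or
    // injected by a unit test (when isSparkSessionManaged is false).
    SparkSessionSupport.runWithSparkSession(conf, isSparkSessionManaged, spark -> {
        spark
            .read()
            .parquet(parquet_file_path)
            // ... apply the guidelines-profile validation on each row here ...
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(finalOutputPath + RESULTS_FILE_NAME);
    });
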
View File

@@ -17,7 +17,7 @@ public class ReadResultsTest {
try {
List standardValidationResultList = new Gson()
.fromJson(new BufferedReader(new FileReader(ContinuousValidator.RESULTS_FILE)), List.class);
.fromJson(new BufferedReader(new FileReader(ContinuousValidator.RESULTS_FILE_NAME)), List.class);
if (standardValidationResultList == null)
logger.error("Could not map the json to a \"List\" object.");
else if (standardValidationResultList.isEmpty())
@@ -25,9 +25,9 @@ public class ReadResultsTest {
else
logger.info(standardValidationResultList.toString());
} catch (FileNotFoundException fnfe) {
logger.error("The results-file \"" + ContinuousValidator.RESULTS_FILE + "\" does not exist!");
logger.error("The results-file \"" + ContinuousValidator.RESULTS_FILE_NAME + "\" does not exist!");
} catch (Exception e) {
logger.error("Error when reading the json-results-file \"" + ContinuousValidator.RESULTS_FILE + "\"", e);
logger.error("Error when reading the json-results-file \"" + ContinuousValidator.RESULTS_FILE_NAME + "\"", e);
}
}