diff --git a/dhp-workflows/dhp-continuous-validation/README.md b/dhp-workflows/dhp-continuous-validation/README.md new file mode 100644 index 000000000..7dc65f8e8 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/README.md @@ -0,0 +1,3 @@ +# Continuous Validation + +[...] \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/installAndRun.sh b/dhp-workflows/dhp-continuous-validation/installAndRun.sh new file mode 100755 index 000000000..5ef4e86fc --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/installAndRun.sh @@ -0,0 +1,38 @@ +# This script installs and runs the project. + +DEFAULT_PROFILE='' # It's the empty profile. +NEWER_VERSIONS_PROFILE='-Pscala-2.12' +CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} + + +# For error-handling, we cannot use the "set -e" since: it has problems https://mywiki.wooledge.org/BashFAQ/105 +# So we have our own function, for use when a single command fails. +handle_error () { + echo -e "\n\n$1\n\n"; exit $2 +} + +# Change the working directory to the script's directory, when running from another location. +cd "${0%/*}" || handle_error "Could not change-dir to this script's dir!" 1 + +if [[ $# -eq 0 ]]; then + echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: installAndRun.sh "; exit 2 +fi + +sparkMaster="" +justRun=0 + +if [[ $# -eq 1 ]]; then + sparkMaster=$1 +elif [[ $# -eq 2 ]]; then + sparkMaster=$1 + justRun=$2 +elif [[ $# -gt 2 ]]; then + echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: installAndRun.sh "; exit 3 +fi + +if [[ justRun -eq 0 ]]; then + mvn clean install ${CHOSEN_MAVEN_PROFILE} +fi +ContinuousValidator +test_parquet_file="./src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet" +java -jar ./target/dhp-continuous-validation-1.0.0-SNAPSHOT.jar ${sparkMaster} ${test_parquet_file} 1 diff --git a/dhp-workflows/dhp-continuous-validation/libs/uoa-validator-engine2-0.9.0.jar b/dhp-workflows/dhp-continuous-validation/libs/uoa-validator-engine2-0.9.0.jar new file mode 100644 index 000000000..e59797ee0 Binary files /dev/null and b/dhp-workflows/dhp-continuous-validation/libs/uoa-validator-engine2-0.9.0.jar differ diff --git a/dhp-workflows/dhp-continuous-validation/pom.xml b/dhp-workflows/dhp-continuous-validation/pom.xml new file mode 100644 index 000000000..585e273b2 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/pom.xml @@ -0,0 +1,380 @@ + + + 4.0.0 + + + dhp-workflows + eu.dnetlib.dhp + 1.2.5-SNAPSHOT + ../pom.xml + + + dhp-continuous-validation + + + + 8 + 8 + UTF-8 + 2.14.3 + + + + + eu.dnetlib + uoa-validator-engine2 + 0.9.0 + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${dhp.spark.version} + + + + + + + + + ch.qos.logback + logback-classic + + + org.apache.logging.log4j + log4j-api + + + org.slf4j + slf4j-log4j12 + + + org.slf4j + slf4j-api + + + + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${dhp.spark.version} + + + + + + com.twitter + parquet-format + + + + + + + + org.apache.parquet + parquet-avro + 1.13.1 + + + + + + org.apache.hadoop + hadoop-common + + + + + + org.apache.parquet + parquet-avro + + + org.apache.avro + avro + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-reload4j + + + ch.qos.reload4j + reload4j + + + + + com.google.protobuf + protobuf-java + + + org.codehaus.jackson + jackson-core-asl + + + org.codehaus.jackson + jackson-mapper-asl + + + com.fasterxml.woodstox + woodstox-core + + + + + + + + + io.dropwizard.metrics + metrics-core + 4.2.22 + + + + + 
com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + + + + + + org.apache.hadoop + hadoop-mapreduce-client-app + ${dhp.hadoop.version} + + + + org.apache.avro + avro + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + com.google.protobuf + protobuf-java + + + io.netty + netty + + + + + + + + + + + + + + + org.apache.thrift + libthrift + 0.17.0 + + + com.fasterxml.woodstox + woodstox-core + 6.5.1 + + + + + + com.google.code.gson + gson + 2.10.1 + + + + + + + + ch.qos.logback + logback-core + 1.2.13 + + + + + org.slf4j + slf4j-api + 1.7.36 + + + + + ch.qos.logback + logback-classic + 1.2.13 + + + + + org.junit.jupiter + junit-jupiter-engine + 5.10.1 + test + + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + + + + + + + + libs + file:///${project.basedir}/libs + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh b/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh new file mode 100755 index 000000000..b5eb7a07b --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/runOozieWorkflow.sh @@ -0,0 +1,9 @@ +# This script deploys and runs the oozie workflow. + +DEFAULT_PROFILE='' # It's the empty profile. +NEWER_VERSIONS_PROFILE='-Pscala-2.12' +CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} + + +mvn clean package ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \ + -Dworkflow.source.dir=eu/dnetlib/dhp/continuous_validator diff --git a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java new file mode 100644 index 000000000..00493530a --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java @@ -0,0 +1,211 @@ + +package eu.dnetlib.dhp.continuous_validator; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.BufferedWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +import javax.jws.WebParam; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.validator2.validation.XMLApplicationProfile; +import eu.dnetlib.validator2.validation.guideline.openaire.AbstractOpenAireProfile; +import eu.dnetlib.validator2.validation.guideline.openaire.LiteratureGuidelinesV4Profile; +import eu.dnetlib.validator2.validation.utils.TestUtils; +import scala.Option; + +public class ContinuousValidator { + + public static final String TEST_FILES_V4_DIR = TestUtils.TEST_FILES_BASE_DIR + "openaireguidelinesV4/"; + public static final String RESULTS_FILE = "results.json"; + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class); + private static final String parametersFile = "input_continuous_validator_parameters.json"; + + private static final boolean 
SHOULD_GET_ARGUMENTS_FROM_FILE = true; // It throws an error for now.. + + public static void main(String[] args) { + + ArgumentApplicationParser parser = null; + String sparkMaster = null; + Boolean isSparkSessionManaged = false; + String parquet_file_path = null; + String guidelines = null; + String outputPath = null; + + if (SHOULD_GET_ARGUMENTS_FROM_FILE) { + try { + String jsonConfiguration = IOUtils + .toString( + Objects + .requireNonNull( + ContinuousValidator.class + .getResourceAsStream("/eu/dnetlib/dhp/continuous_validator/" + parametersFile)), + StandardCharsets.UTF_8); + + parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + String isSParkSessionManagedStr = parser.get("isSparkSessionManaged"); + if (isSParkSessionManagedStr == null) { + logger + .error( + "The \"isSParkSessionManagedStr\" was not retrieved from the parameters file: " + + parametersFile); + return; + } + + // This "is needed to implement a unit test in which the spark session is created in the context of the + // unit test itself rather than inside the spark application" + isSparkSessionManaged = Optional + .of(isSParkSessionManagedStr) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + // TODO - If the above is tru,e then the Spark-session defined in the unit-test should be used.. + + } catch (Exception e) { + logger.error("Error when parsing the parameters!", e); + return; + } + + parquet_file_path = parser.get("parquet_file_path"); + if (parquet_file_path == null) { + logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile); + return; + } + + guidelines = parser.get("guidelines"); + if (guidelines == null) { + logger.error("The \"guidelines\" was not retrieved from the parameters file: " + parametersFile); + return; + } + + outputPath = parser.get("outputPath"); + if (outputPath == null) { + logger.error("The \"outputPath\" was not retrieved from the parameters file: " + parametersFile); + return; + } + + sparkMaster = "local[*]"; + } else { + if (args.length != 4) { + String errorMsg = "Wrong number of arguments given! Please run the app like so: java -jar target/dhp-continuous-validation-1.0.0-SNAPSHOT.jar "; + System.err.println(errorMsg); + logger.error(errorMsg); + System.exit(1); + } + sparkMaster = args[0]; + logger.info("Will use this Spark master: \"" + sparkMaster + "\"."); + + parquet_file_path = args[1]; + guidelines = args[2]; + outputPath = args[3]; + if (!outputPath.endsWith("/")) + outputPath += "/"; + } + + logger + .info( + "Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \"" + + guidelines + "\"" + " and will output the results in: " + outputPath + RESULTS_FILE); + + + // TODO - USE THE "runWithSparkSession" METHOD TO RUN THE SPARK CODE INSIDE!! + AbstractOpenAireProfile profile = new LiteratureGuidelinesV4Profile(); + + SparkConf conf = new SparkConf(); + conf.setAppName(ContinuousValidator.class.getSimpleName()); + String finalParquet_file_path = parquet_file_path; + String finalOutputPath = outputPath; + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + Dataset parquetFileDF = spark.read().parquet(finalParquet_file_path); + parquetFileDF.show(5); + + // Filter the results based on the XML-encoding and non-null id and body. 
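				// An equivalent filter could also be expressed as a single Spark SQL string (a sketch only,
				// assuming the same "encoding", "id" and "body" columns; the Column-based form below is what
				// this job actually uses):
				// parquetFileDF = parquetFileDF.filter("encoding <=> 'XML' AND id IS NOT NULL AND body IS NOT NULL");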
+ parquetFileDF = parquetFileDF + .filter( + parquetFileDF + .col("encoding") + .eqNullSafe("XML") + .and(parquetFileDF.col("id").isNotNull()) + .and(parquetFileDF.col("body").isNotNull())); + + // Use a new instance of Document Builder in each worker, as it is not thread-safe. + MapFunction validateMapFunction = row -> profile + .validate( + row.getAs("id").toString(), + TestUtils + .getDocumentBuilder() + .parse(IOUtils.toInputStream(row.getAs("body").toString(), StandardCharsets.UTF_8))); + + Dataset validationResultsDataset = parquetFileDF + .map(validateMapFunction, Encoders.bean(XMLApplicationProfile.ValidationResult.class)); + + logger.info("Showing a few validation-results.. just for checking"); + validationResultsDataset.show(5); + + // Write the results to json file immediately, without converting them to a list. + validationResultsDataset + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(finalOutputPath + RESULTS_FILE); // The filename should be the name of the input-file or the + // input-directory. + + if (logger.isDebugEnabled()) { + List validationResultsList = validationResultsDataset + .javaRDD() + .collect(); + + if (validationResultsList.isEmpty()) { + logger.error("The \"validationResultsList\" was empty!"); + return; + } + + validationResultsList.forEach(vr -> logger.debug(vr.id() + " | score:" + vr.score())); + for (XMLApplicationProfile.ValidationResult result : validationResultsList) + logger.debug(result.toString()); + } + + // TODO - REMOVE THIS WHEN THE WRITE FROM ABOVE IS OK + /* + * try (BufferedWriter writer = Files .newBufferedWriter(Paths.get(outputPath + RESULTS_FILE), + * StandardCharsets.UTF_8)) { writer.write(new Gson().toJson(validationResultsList)); } catch (Exception e) + * { logger.error("Error when writing the \"validationResultsList\" as json into the results-file: " + + * outputPath + RESULTS_FILE); return; } + */ + + Option uiWebUrl = spark.sparkContext().uiWebUrl(); + if (uiWebUrl.isDefined()) { + logger + .info( + "Waiting 60 seconds, before shutdown, for the user to check the jobs' status at: " + + uiWebUrl.get()); + try { + Thread.sleep(60_000); + } catch (InterruptedException ignored) { + } + } else + logger.info("The \"uiWebUrl\" is not defined, in order to check the jobs' status. 
Shutting down.."); + }); + } +} diff --git a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/utils/ParquetUtils.java b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/utils/ParquetUtils.java new file mode 100644 index 000000000..e36d82a65 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/utils/ParquetUtils.java @@ -0,0 +1,101 @@ + +package eu.dnetlib.dhp.continuous_validator.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.slf4j.LoggerFactory; + +public class ParquetUtils { + + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ParquetUtils.class); + + private static final Configuration parquetConfig = new Configuration(); + + public static List getParquetRecords(String fullFilePath) { + InputFile inputFile; + try { // TODO - Verify that this will create any directories which do not exist in the provided path. Currently + // we create the directories beforehand. + inputFile = HadoopInputFile.fromPath(new Path(fullFilePath), parquetConfig); + // logger.trace("Created the parquet " + outputFile); // DEBUG! + } catch (Throwable e) { // The simple "Exception" may not be thrown here, but an "Error" may be thrown. + // "Throwable" catches EVERYTHING! + logger.error("", e); + return null; + } + + List records = new ArrayList<>(); + GenericRecord record; + try (ParquetReader reader = AvroParquetReader. builder(inputFile).build()) { + while ((record = reader.read()) != null) { + records.add(record); + } + } catch (Throwable e) { // The simple "Exception" may not be thrown here, but an "Error" may be thrown. + // "Throwable" catches EVERYTHING! + logger.error("Problem when creating the \"ParquetWriter\" object or when writing the records with it!", e); + + // At some point, I got an "NoSuchMethodError", because of a problem in the AvroSchema file: + // (java.lang.NoSuchMethodError: org.apache.avro.Schema.getLogicalType()Lorg/apache/avro/LogicalType;). + // The error was with the schema: {"name": "date", "type" : ["null", {"type" : "long", "logicalType" : + // "timestamp-millis"}]}, + return null; + } + + return records; // It may be empty. + } + + public static Map getIdXmlMapFromParquetFile(String parquetFileFullPath) { + List recordList = ParquetUtils.getParquetRecords(parquetFileFullPath); + if (recordList == null) + return null; // The error is already logged. 
+ else if (recordList.isEmpty()) { + logger.error("The parquet-file \"" + parquetFileFullPath + "\" had no records inside!"); + return null; + } + + Map idXmlMap = new HashMap<>(); + + for (GenericRecord record : recordList) { + if (logger.isTraceEnabled()) + logger.trace(record.toString()); + + Object id = record.get("id"); + if (id == null) + continue; + String idStr = id.toString(); + + Object encoding = record.get("encoding"); + if (encoding == null) { + logger.warn("Record with id = \"" + idStr + "\" does not provide the encoding for its body!"); + continue; + } + String encodingStr = encoding.toString(); + if (!encodingStr.equals("XML")) { + logger.warn("Record with id = \"" + idStr + "\" does not have XML encoding for its body!"); + continue; + } + + Object body = record.get("body"); + if (body == null) { + logger.warn("Record with id = \"" + idStr + "\" does not have a body!"); + continue; + } + String bodyStr = body.toString(); + + idXmlMap.put(idStr, bodyStr); + // logger.debug(idStr + " | " + idXmlMap.get(idStr)); + } + + return idXmlMap; + } + +} diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml new file mode 100644 index 000000000..0980c3731 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + sparkExecutorMemoryOverhead + 1G + + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml new file mode 100644 index 000000000..60c70bd14 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml @@ -0,0 +1,111 @@ + + + + parquet_file_path + ./src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet + the full path of the parquet file + + + openaire_guidelines + 4.0 + the version of the OpenAIRE Guidelines to validate the records against + + + outputPath + . 
+ the (float) version of the OpenAIRE Guidelines to validate the records against + + + sparkDriverMemory + 4096M + memory for driver process + + + sparkExecutorMemory + 2048m + memory for individual executor + + + sparkExecutorCores + 4 + number of cores used by single executor + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + + spark2EventLogDir + spark2/logs + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Validate multiple records against OpenAIRE Guidelines + eu.dnetlib.dhp.continuous_validator.ContinuousValidator + dhp-continuous-validation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemoryOverhead} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + + --parquet_file_path${parquet_file_path} + --openaire_guidelines${openaire_guidelines} + --outputPath${outputPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/input_continuous_validator_parameters.json b/dhp-workflows/dhp-continuous-validation/src/main/resources/input_continuous_validator_parameters.json new file mode 100644 index 000000000..bf057a0d0 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/input_continuous_validator_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "prq_file", + "paramLongName": "parquet_file_path", + "paramDescription": "the full path for the parquet file to be processed", + "paramRequired": true + }, + { + "paramName": "guidelines", + "paramLongName": "openaire_guidelines", + "paramDescription": "the version of the OpenAIRE Guidelines to validate the records against", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the output-directory where the result-json-files will be stored", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml new file mode 100644 index 000000000..7e37889ac --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/logback.xml @@ -0,0 +1,43 @@ + + + + + + + logs/ContinuousValidator.log + + + logs/ContinuousValidator.%i.log.zip + 1 + 10 + + + + 100MB + + + UTF-8 + %d{yyyy-MM-dd HH:mm:ss.SSS z} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n + + + + + + UTF-8 + %d{yyyy-MM-dd HH:mm:ss.SSS z} [%thread] %highlight(%-5level) %cyan(%logger{36}.%M\(@%line\)) - %msg%n + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/java/ReadParquetDataTest.java 
b/dhp-workflows/dhp-continuous-validation/src/test/java/ReadParquetDataTest.java new file mode 100644 index 000000000..8c46777f0 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/java/ReadParquetDataTest.java @@ -0,0 +1,33 @@ +import java.util.Map; + +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.continuous_validator.utils.ParquetUtils; +import eu.dnetlib.validator2.validation.utils.TestUtils; + +public class ReadParquetDataTest { + + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ReadParquetDataTest.class); + + private static final String parquetFileFullPath = TestUtils.TEST_FILES_BASE_DIR + + "part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet"; + + public static void main(String[] args) { + + testParquetRead(); + } + + @Test + public static void testParquetRead() { + Map idXmlMap = ParquetUtils.getIdXmlMapFromParquetFile(parquetFileFullPath); + if (idXmlMap == null) { + logger.error("Could not create the \"idXmlMap\" from parquet-file: " + parquetFileFullPath); + System.exit(99); + } else if (idXmlMap.isEmpty()) + logger.warn("The generated \"idXmlMap\" was empty, for parquet-file: " + parquetFileFullPath); + else + logger.info("The \"idXmlMap\" was successfully generated, for parquet-file: " + parquetFileFullPath); + } + +} diff --git a/dhp-workflows/dhp-continuous-validation/src/test/java/ReadResultsTest.java b/dhp-workflows/dhp-continuous-validation/src/test/java/ReadResultsTest.java new file mode 100644 index 000000000..b1b0c6af4 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/java/ReadResultsTest.java @@ -0,0 +1,34 @@ +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.util.List; + +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.continuous_validator.ContinuousValidator; + +public class ReadResultsTest { + + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class); + + public static void main(String[] args) { + + try { + List standardValidationResultList = new Gson() + .fromJson(new BufferedReader(new FileReader(ContinuousValidator.RESULTS_FILE)), List.class); + if (standardValidationResultList == null) + logger.error("Could not map the json to a \"List\" object."); + else if (standardValidationResultList.isEmpty()) + logger.warn("The \"standardValidationResultList\" is empty!"); + else + logger.info(standardValidationResultList.toString()); + } catch (FileNotFoundException fnfe) { + logger.error("The results-file \"" + ContinuousValidator.RESULTS_FILE + "\" does not exist!"); + } catch (Exception e) { + logger.error("Error when reading the json-results-file \"" + ContinuousValidator.RESULTS_FILE + "\"", e); + } + } + +} diff --git a/dhp-workflows/dhp-continuous-validation/src/test/java/ValidateTestFiles.java b/dhp-workflows/dhp-continuous-validation/src/test/java/ValidateTestFiles.java new file mode 100644 index 000000000..072098cdf --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/java/ValidateTestFiles.java @@ -0,0 +1,126 @@ +import static eu.dnetlib.dhp.continuous_validator.ContinuousValidator.TEST_FILES_V4_DIR; + +import java.io.BufferedWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; 
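// Note: this test driver uses the classic RDD API (JavaSparkContext / JavaPairRDD / wholeTextFiles),
// whereas the main ContinuousValidator job above works with the Dataset API via runWithSparkSession.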
+import org.apache.spark.api.java.JavaSparkContext; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.JsonIOException; + +import eu.dnetlib.dhp.continuous_validator.ContinuousValidator; +import eu.dnetlib.validator2.validation.XMLApplicationProfile; +import eu.dnetlib.validator2.validation.guideline.openaire.LiteratureGuidelinesV4Profile; +import eu.dnetlib.validator2.validation.utils.TestUtils; +import scala.Option; + +public class ValidateTestFiles { + + private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class); + + public static final String RESULTS_FILE = "results.json"; + + public static void main(String[] args) { + if (args.length != 3) { + String errorMsg = "Wrong number of arguments given! PLease run the app like so: java -jar build/libs/continuous-validator-1.0.0-SNAPSHOT.jar "; + System.err.println(errorMsg); + logger.error(errorMsg); + System.exit(1); + } + String sparkMaster = args[0]; + logger.info("Will use this Spark master: \"" + sparkMaster + "\"."); + + String parquetFileFullPath = args[1]; + String guidelines = args[2]; + logger + .info( + "Will validate the contents of parquetFile: \"" + parquetFileFullPath + "\", against guidelines: \"" + + guidelines + "\"."); + + SparkConf sparkConf = new SparkConf(); + sparkConf.setAppName("Continuous-Validator"); + sparkConf.setMaster(sparkMaster); // Run on the Spark Cluster. + sparkConf.set("spark.driver.memory", "4096M"); + sparkConf + .set("spark.executor.instances", "4") // 4 executors + .set("spark.executor.cores", "1"); // 1 core per executor + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.rdd.compress", "true"); + + String appVersion = "1.0.0-SNAPSHOT"; + /* + * try { Class klass = Class.forName("eu.dnetlib.continuous_validator.BuildConfig"); appVersion = (String) + * klass.getDeclaredField("version").get(null); if ( logger.isTraceEnabled() ) + * logger.trace("The app's version is: " + appVersion); } catch (Exception e) { + * logger.error("Error when acquiring the \"appVersion\"!", e); System.exit(1); } + */ + + sparkConf.setJars(new String[] { + "build/libs/continuous-validator-" + appVersion + "-all.jar" + }); // This is the "fat-Jar". + sparkConf.validateSettings(); + + logger.debug("Spark custom configurations: " + sparkConf.getAll().toString()); + + LiteratureGuidelinesV4Profile profile = new LiteratureGuidelinesV4Profile(); + + try (JavaSparkContext sc = new JavaSparkContext(sparkConf)) { + JavaPairRDD jprdd = sc.wholeTextFiles(TEST_FILES_V4_DIR); + + logger.info("Showing the validation-results.."); + + // Use a new instance of Document Builder in each worker, as it is not thread-safe. + // The "x._1" is the filename and the "x._2" in the content of the file. 
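			// (Alternative sketch, not used here: jprdd.mapPartitions(...) would allow one DocumentBuilder
			// per partition instead of one per record, while still keeping the builder confined to a single thread.)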
+ List validationResultsList = jprdd + .map( + x -> profile + .validate( + x._1, + TestUtils.getDocumentBuilder().parse(IOUtils.toInputStream(x._2, StandardCharsets.UTF_8)))) + .collect(); + + if (validationResultsList.isEmpty()) { + logger.error("The \"validationResultsList\" was empty!"); + return; + } + + if (logger.isDebugEnabled()) + validationResultsList.forEach(vr -> logger.debug(vr.id() + " | score:" + vr.score())); + + logger.debug(validationResultsList.toString()); + + try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(RESULTS_FILE), StandardCharsets.UTF_8)) { + writer.write(new Gson().toJson(validationResultsList)); + } catch (Exception e) { + logger + .error( + "Error when writing the \"validationResultsList\" as json into the results-file: " + + RESULTS_FILE); + return; + } + + Option uiWebUrl = sc.sc().uiWebUrl(); + if (uiWebUrl.isDefined()) { + logger + .info( + "Waiting 60 seconds, before shutdown, for the user to check the jobs' status at: " + + uiWebUrl.get()); + Thread.sleep(60_000); + } else + logger.info("The \"uiWebUrl\" is not defined, in order to check the jobs' status. Shutting down.."); + + } catch (JsonIOException jie) { + logger.error("Error when writing the validation results to the json file: " + jie.getMessage()); + } catch (Exception e) { + logger.error("Error validating directory: " + TEST_FILES_V4_DIR, e); + } + } + +} diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/01_gv4.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/01_gv4.xml new file mode 100644 index 000000000..732a9f7e4 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/01_gv4.xml @@ -0,0 +1,52 @@ + + + + + Journal bearings subjected to dynamic loads: the analytical mobility method + + + + Flores, Paulo + + + Claro, José Carlos Pimenta + + + Ambrósio, Jorge + + + + + Universidade do Minho + repositorium@sdum.uminho.pt + + + + + 2005-06-16 + 2005-06-16 + + eng + conference paper + The main purpose of this work is to use the analytical mobility method to analyze journal bearings subjected to dynamic loads, with the intent to include it in a general computational program that has been developed for the dynamic analysis of general mechanical systems. A simple journal bearing subjected to a dynamic load is chosen as a demonstrative example, in order to provide the necessary results for a comprehensive discussion of the methodology presented. + Fundação para a Ciência e a Tecnologia (FCT) + Fundo Comunitário Europeu FEDER under project POCTI/EME/2001/38281, entitled ‘Dynamic of Mechanical Systems with joint Clearances and Imperfections’ + application/pdf + https://hdl.handle.net/1822/18042 + restricted access + + Dynamic bearings + Hydrodynamic lubrication + + + 294443 bytes + + https://repositorium.sdum.uminho.pt/bitstream/1822/18042/1/CI-2005_02.pdf + IberTrib - 3º Congresso Ibérico de Tribologia + 1 + 15 + Guimarães, Portugal + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/oai_mediarep_org_doc_2534.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/oai_mediarep_org_doc_2534.xml new file mode 100644 index 000000000..1edfb4608 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/oai_mediarep_org_doc_2534.xml @@ -0,0 +1,59 @@ + +
+        <identifier>oai:mediarep.org:doc/2534</identifier>
+        <datestamp>2020-03-27T11:35:21Z</datestamp>
+        <setSpec>com_doc_568</setSpec>
+        <setSpec>com_doc_179</setSpec>
+        <setSpec>col_doc_1963</setSpec>
+        <setSpec>openaire</setSpec>
+    </header>
+ + + Die autogerechte Stadt (Rückführung) + + + Schmitt, Arne + + + + https://mediarep.org/handle/doc/2534 + + + 1869-1722 + + deu + diaphanes + 2015 + + journal article + + Bildstrecke ausgewählter Abbildungen aus Hans Bernhard Reichows Buch DIE AUTOGERECHTE STADT. + application/pdf + http://dx.doi.org/10.25969/mediarep/1436 + + open access + + + Infrastruktur + Urbanität + Mobilität + 300 + + + VoR + + https://mediarep.org/bitstream/doc/2534/1/ZfM_12_104-133_Schmitt_Die_autogerechte_Stadt.pdf + Zeitschrift für Medienwissenschaft + 7 + 1 + 104 + 113 + Researchers + Students + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_guidelines_record.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_guidelines_record.xml new file mode 100644 index 000000000..1c8ef0d3a --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_guidelines_record.xml @@ -0,0 +1,251 @@ + + + + + + + National Institute for Environmental Studies and Center + for Climate System Research Japan + + A survey + + + + + + Evans, R.J. + Robert + Evans + Institute of Science and Technology + + 1234-1234-1234-1234 + + + + + + + + i-5uvLfGD2R.y8fXv442y8o.D6v1 + givenName3 + familyName3 + MuDRo.ymY7qG1Or.9Ny + a1Sp + affiliation5 + affiliation6 + + + ZxHQr4TGMeqqEOTbMcNd_EMFvIf + givenName4 + familyName4 + nJNWaCv + RobhZUU3Obaqq-3UlrgxKNe-c0 + affiliation7 + affiliation8 + + + + + + + pZT7SjviMp4yodvIG + ff238yCNbhv5k5Y8AmbpyzYt + svI_6A + E- + award + + + funderName + QwqkD4y + yyADU1SFmiS-meqQ6 + JyORkpwzt + gyg2F9EKT7Gp.mSBU8.Drqf + + + + + + 10.1002/chem.201701589 + PMC5574022 + + + + + + 0947-6539 + + 1521-3765 + + + + + 2011-12-01 + 2012-12-31 + + 2011 + + + + eng + deu + nld + + + + Loughborough University. Department of Computer Science + + John Wiley & Sons, Inc. (US) + + + report + + + + Foreword [by] Hazel Anderson; Introduction; The scientific heresy: + transformation of a society; Consciousness as causal reality [etc] + + + A number of problems in quantum state and system identification are + addressed. + + + + video/quicktime + application/pdf + application/xml + + + http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-160648 + + + embargoed access + + + Ecology Letters (1461023X) vol.4 (2001) + Ecology Letters (1461025X) vol.5 (2002) + + + + ft12Zy + Bf6AElMCkh.mqutOmETp0 + + + + Creative Commons Attribution-NonCommercial + + + 2000-2010 + + scheme=historic; content=Ming Dynasty + + + + + 15 pages + 6 MB + + + + + + Atlantic Ocean + + 31.233 + -67.302 + + + -71.032 + -68.211 + 41.090 + 42.893 + + + + 88.557 + -0.604 + + + -143.373 + 88.832 + + + 87.3 + -36.556 + + + 129.128 + 4.616 + + + -17.547 + -47.629 + + + + + 78.121 + 19.341 + + + -118.035 + 53.647 + + + -49.07 + -45.561 + + + 132.484 + -41.146 + + + 179.293 + 15.364 + + + + + + + NA + + + http://link-to-the-fulltext.org + http://europepmc.org/articles/PMC5574022?pdf=render + + + some Journal Title + + + 10 + + + 1 + + + 100 + + + 105 + + + 2 + + + Berlin + + + 2013-10-22 - 2013-10-23 + + + Researchers + Students + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_invalid_guidelines_record.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_invalid_guidelines_record.xml new file mode 100644 index 000000000..32b9a4711 --- /dev/null +++ b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_invalid_guidelines_record.xml @@ -0,0 +1,197 @@ + + + + + + + National Institute for Environmental Studies and Center + for Climate System Research Japan + + + + + + + + + + + + + + + + + + + + + + + MuDRo.ymY7qG1Or.9Ny + a1Sp + affiliation5 + affiliation6 + + + + + + + + ff238yCNbhv5k5Y8AmbpyzYt + svI_6A + E- + + + + funderName + QwqkD4y + yyADU1SFmiS-meqQ6 + JyORkpwzt + gyg2F9EKT7Gp.mSBU8.Drqf + + + + + + 10.1002/chem.201701589 + PMC5574022 + + + + + + 0947-6539 + + 
1521-3765 + + + + + 2011-12-01 + 2012-12-01 + + 2011 + + + + eng + deu + invalidTag + + + + Loughborough University. Department of Computer Science + + John Wiley & Sons, Inc. (US) + + + report + + + + Foreword [by] Hazel Anderson; Introduction; The scientific heresy: + transformation of a society; Consciousness as causal reality [etc] + + + A number of problems in quantum state and system identification are + addressed. + + + + video/quicktime + application/pdf + application/invalid + + + http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-160648 + + + invalid + + + Ecology Letters (1461023X) vol.4 (2001) + Ecology Letters (1461025X) vol.5 (2002) + + + + ft12Zy + Bf6AElMCkh.mqutOmETp0 + + + + Creative Commons Attribution-NonCommercial + + + 2000-2010 + + scheme=historic; content=Ming Dynasty + + + + + + + + + + + + Atlantic Ocean + + 31.233 + -67.302 + + + -71.032 + -68.211 + 41.090 + 42.893 + + + + + + irrelevant + + + http://link-to-the-fulltext.org + http://europepmc.org/articles/PMC5574022?pdf=render + + + some Journal Title + + + 10 + + + 1 + + + 100 + + + 105 + + + 2 + + + Berlin + + + 2013-02-29 + + + Researchers + Students + \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet b/dhp-workflows/dhp-continuous-validation/src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet new file mode 100644 index 000000000..f9e316930 Binary files /dev/null and b/dhp-workflows/dhp-continuous-validation/src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet differ diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 369c71b5b..1f323f659 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -39,6 +39,7 @@ dhp-doiboost dhp-impact-indicators dhp-swh + dhp-continuous-validation
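
With the module registered in the parent dhp-workflows/pom.xml, it can also be built on its own from the repository root. This is only a sketch of one possible invocation (the -pl/-am reactor options are generic Maven flags, not something the scripts in this change rely on):

    # Build just dhp-continuous-validation plus the reactor modules it depends on (e.g. dhp-common).
    mvn clean install -pl dhp-workflows/dhp-continuous-validation -am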