diff --git a/dhp-workflows/dhp-continuous-validation/installAndRun.sh b/dhp-workflows/dhp-continuous-validation/installAndRun.sh deleted file mode 100755 index 5ef4e86fc..000000000 --- a/dhp-workflows/dhp-continuous-validation/installAndRun.sh +++ /dev/null @@ -1,38 +0,0 @@ -# This script installs and runs the project. - -DEFAULT_PROFILE='' # It's the empty profile. -NEWER_VERSIONS_PROFILE='-Pscala-2.12' -CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} - - -# For error-handling, we cannot use the "set -e" since: it has problems https://mywiki.wooledge.org/BashFAQ/105 -# So we have our own function, for use when a single command fails. -handle_error () { - echo -e "\n\n$1\n\n"; exit $2 -} - -# Change the working directory to the script's directory, when running from another location. -cd "${0%/*}" || handle_error "Could not change-dir to this script's dir!" 1 - -if [[ $# -eq 0 ]]; then - echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: installAndRun.sh "; exit 2 -fi - -sparkMaster="" -justRun=0 - -if [[ $# -eq 1 ]]; then - sparkMaster=$1 -elif [[ $# -eq 2 ]]; then - sparkMaster=$1 - justRun=$2 -elif [[ $# -gt 2 ]]; then - echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: installAndRun.sh "; exit 3 -fi - -if [[ justRun -eq 0 ]]; then - mvn clean install ${CHOSEN_MAVEN_PROFILE} -fi -ContinuousValidator -test_parquet_file="./src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet" -java -jar ./target/dhp-continuous-validation-1.0.0-SNAPSHOT.jar ${sparkMaster} ${test_parquet_file} 1 diff --git a/dhp-workflows/dhp-continuous-validation/pom.xml b/dhp-workflows/dhp-continuous-validation/pom.xml index 5eaecf2d1..77c8969de 100644 --- a/dhp-workflows/dhp-continuous-validation/pom.xml +++ b/dhp-workflows/dhp-continuous-validation/pom.xml @@ -14,7 +14,6 @@ dhp-continuous-validation - eu.dnetlib.dhp diff --git a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java index 9cf676e76..c514d382c 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java +++ b/dhp-workflows/dhp-continuous-validation/src/main/java/eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.continuous_validator; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.io.File; import java.nio.charset.StandardCharsets; import java.util.Objects; import java.util.Optional; @@ -16,7 +17,10 @@ import org.apache.spark.sql.SaveMode; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.validator2.validation.StandardValidationResult; import eu.dnetlib.validator2.validation.XMLApplicationProfile; +import eu.dnetlib.validator2.validation.guideline.Guideline; +import eu.dnetlib.validator2.validation.guideline.StandardResult; import eu.dnetlib.validator2.validation.guideline.openaire.*; import eu.dnetlib.validator2.validation.utils.TestUtils; @@ -61,6 +65,7 @@ public class ContinuousValidator { // This is needed to implement a unit test in which the spark session is created in the context of the // unit test itself rather than inside the spark application" + // Set the parquet input, either a parquet-file or a directory with parquet files. parquet_file_path = parser.get("parquet_file_path"); if (parquet_file_path == null) { logger.error("The \"parquet_file_path\" was not retrieved from the parameters file: " + parametersFile); @@ -80,8 +85,8 @@ public class ContinuousValidator { return; } - if (!outputPath.endsWith("/")) - outputPath += "/"; + if (!outputPath.endsWith(File.separator)) + outputPath += File.separator; logger .info( @@ -112,11 +117,15 @@ public class ContinuousValidator { SparkConf conf = new SparkConf(); conf.setAppName(ContinuousValidator.class.getSimpleName()); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(new Class[] { + XMLApplicationProfile.ValidationResult.class, Guideline.Result.class, StandardValidationResult.class, + StandardResult.class + }); String finalParquet_file_path = parquet_file_path; String finalOutputPath = outputPath; runWithSparkSession(conf, isSparkSessionManaged, spark -> { - // Use a new instance of Document Builder in each worker, as it is not thread-safe. MapFunction validateMapFunction = row -> profile .validate( diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json index bf057a0d0..c37683e82 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/input_continuous_validator_parameters.json @@ -8,7 +8,7 @@ { "paramName": "prq_file", "paramLongName": "parquet_file_path", - "paramDescription": "the full path for the parquet file to be processed", + "paramDescription": "the full path of a parquet-file or a directory with parquet files, to be processed", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml index 137c5c8cc..0f967d2f3 100644 --- a/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-continuous-validation/src/main/resources/eu/dnetlib/dhp/continuous_validator/oozie_app/workflow.xml @@ -2,8 +2,8 @@ parquet_file_path - ./src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet - the full path of the parquet file + /var/lib/dnet/mdstore_PROD/md-7763c517-538d-4aa7-83f8-6096b3ce0d96/md-7763c517-538d-4aa7-83f8-6096b3ce0d96-1702622132535/store + the full path of a parquet-file or a directory with parquet files, to be processed openaire_guidelines @@ -12,7 +12,7 @@ outputPath - . + /home/lsmyrnaios/continuous_validation/output the path of the output-directory where the result-json-files will be stored diff --git a/dhp-workflows/dhp-continuous-validation/src/test/java/ReadResultsTest.java b/dhp-workflows/dhp-continuous-validation/src/test/java/ReadResultsTest.java deleted file mode 100644 index c1bc48c82..000000000 --- a/dhp-workflows/dhp-continuous-validation/src/test/java/ReadResultsTest.java +++ /dev/null @@ -1,35 +0,0 @@ -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.util.List; - -import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; - -import eu.dnetlib.dhp.continuous_validator.ContinuousValidator; - -public class ReadResultsTest { - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class); - - public static void main(String[] args) { - - try { - List standardValidationResultList = new Gson() - .fromJson(new BufferedReader(new FileReader(ContinuousValidator.RESULTS_FILE_NAME)), List.class); - if (standardValidationResultList == null) - logger.error("Could not map the json to a \"List\" object."); - else if (standardValidationResultList.isEmpty()) - logger.warn("The \"standardValidationResultList\" is empty!"); - else - logger.info(standardValidationResultList.toString()); - } catch (FileNotFoundException fnfe) { - logger.error("The results-file \"" + ContinuousValidator.RESULTS_FILE_NAME + "\" does not exist!"); - } catch (Exception e) { - logger - .error("Error when reading the json-results-file \"" + ContinuousValidator.RESULTS_FILE_NAME + "\"", e); - } - } - -} diff --git a/dhp-workflows/dhp-continuous-validation/src/test/java/ValidateTestFiles.java b/dhp-workflows/dhp-continuous-validation/src/test/java/ValidateTestFiles.java deleted file mode 100644 index 072098cdf..000000000 --- a/dhp-workflows/dhp-continuous-validation/src/test/java/ValidateTestFiles.java +++ /dev/null @@ -1,126 +0,0 @@ -import static eu.dnetlib.dhp.continuous_validator.ContinuousValidator.TEST_FILES_V4_DIR; - -import java.io.BufferedWriter; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.slf4j.LoggerFactory; - -import com.google.gson.Gson; -import com.google.gson.JsonIOException; - -import eu.dnetlib.dhp.continuous_validator.ContinuousValidator; -import eu.dnetlib.validator2.validation.XMLApplicationProfile; -import eu.dnetlib.validator2.validation.guideline.openaire.LiteratureGuidelinesV4Profile; -import eu.dnetlib.validator2.validation.utils.TestUtils; -import scala.Option; - -public class ValidateTestFiles { - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class); - - public static final String RESULTS_FILE = "results.json"; - - public static void main(String[] args) { - if (args.length != 3) { - String errorMsg = "Wrong number of arguments given! PLease run the app like so: java -jar build/libs/continuous-validator-1.0.0-SNAPSHOT.jar "; - System.err.println(errorMsg); - logger.error(errorMsg); - System.exit(1); - } - String sparkMaster = args[0]; - logger.info("Will use this Spark master: \"" + sparkMaster + "\"."); - - String parquetFileFullPath = args[1]; - String guidelines = args[2]; - logger - .info( - "Will validate the contents of parquetFile: \"" + parquetFileFullPath + "\", against guidelines: \"" - + guidelines + "\"."); - - SparkConf sparkConf = new SparkConf(); - sparkConf.setAppName("Continuous-Validator"); - sparkConf.setMaster(sparkMaster); // Run on the Spark Cluster. - sparkConf.set("spark.driver.memory", "4096M"); - sparkConf - .set("spark.executor.instances", "4") // 4 executors - .set("spark.executor.cores", "1"); // 1 core per executor - sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - sparkConf.set("spark.rdd.compress", "true"); - - String appVersion = "1.0.0-SNAPSHOT"; - /* - * try { Class klass = Class.forName("eu.dnetlib.continuous_validator.BuildConfig"); appVersion = (String) - * klass.getDeclaredField("version").get(null); if ( logger.isTraceEnabled() ) - * logger.trace("The app's version is: " + appVersion); } catch (Exception e) { - * logger.error("Error when acquiring the \"appVersion\"!", e); System.exit(1); } - */ - - sparkConf.setJars(new String[] { - "build/libs/continuous-validator-" + appVersion + "-all.jar" - }); // This is the "fat-Jar". - sparkConf.validateSettings(); - - logger.debug("Spark custom configurations: " + sparkConf.getAll().toString()); - - LiteratureGuidelinesV4Profile profile = new LiteratureGuidelinesV4Profile(); - - try (JavaSparkContext sc = new JavaSparkContext(sparkConf)) { - JavaPairRDD jprdd = sc.wholeTextFiles(TEST_FILES_V4_DIR); - - logger.info("Showing the validation-results.."); - - // Use a new instance of Document Builder in each worker, as it is not thread-safe. - // The "x._1" is the filename and the "x._2" in the content of the file. - List validationResultsList = jprdd - .map( - x -> profile - .validate( - x._1, - TestUtils.getDocumentBuilder().parse(IOUtils.toInputStream(x._2, StandardCharsets.UTF_8)))) - .collect(); - - if (validationResultsList.isEmpty()) { - logger.error("The \"validationResultsList\" was empty!"); - return; - } - - if (logger.isDebugEnabled()) - validationResultsList.forEach(vr -> logger.debug(vr.id() + " | score:" + vr.score())); - - logger.debug(validationResultsList.toString()); - - try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(RESULTS_FILE), StandardCharsets.UTF_8)) { - writer.write(new Gson().toJson(validationResultsList)); - } catch (Exception e) { - logger - .error( - "Error when writing the \"validationResultsList\" as json into the results-file: " - + RESULTS_FILE); - return; - } - - Option uiWebUrl = sc.sc().uiWebUrl(); - if (uiWebUrl.isDefined()) { - logger - .info( - "Waiting 60 seconds, before shutdown, for the user to check the jobs' status at: " - + uiWebUrl.get()); - Thread.sleep(60_000); - } else - logger.info("The \"uiWebUrl\" is not defined, in order to check the jobs' status. Shutting down.."); - - } catch (JsonIOException jie) { - logger.error("Error when writing the validation results to the json file: " + jie.getMessage()); - } catch (Exception e) { - logger.error("Error validating directory: " + TEST_FILES_V4_DIR, e); - } - } - -} diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/01_gv4.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/01_gv4.xml deleted file mode 100644 index 732a9f7e4..000000000 --- a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/01_gv4.xml +++ /dev/null @@ -1,52 +0,0 @@ - - - - - Journal bearings subjected to dynamic loads: the analytical mobility method - - - - Flores, Paulo - - - Claro, José Carlos Pimenta - - - Ambrósio, Jorge - - - - - Universidade do Minho - repositorium@sdum.uminho.pt - - - - - 2005-06-16 - 2005-06-16 - - eng - conference paper - The main purpose of this work is to use the analytical mobility method to analyze journal bearings subjected to dynamic loads, with the intent to include it in a general computational program that has been developed for the dynamic analysis of general mechanical systems. A simple journal bearing subjected to a dynamic load is chosen as a demonstrative example, in order to provide the necessary results for a comprehensive discussion of the methodology presented. - Fundação para a Ciência e a Tecnologia (FCT) - Fundo Comunitário Europeu FEDER under project POCTI/EME/2001/38281, entitled ‘Dynamic of Mechanical Systems with joint Clearances and Imperfections’ - application/pdf - https://hdl.handle.net/1822/18042 - restricted access - - Dynamic bearings - Hydrodynamic lubrication - - - 294443 bytes - - https://repositorium.sdum.uminho.pt/bitstream/1822/18042/1/CI-2005_02.pdf - IberTrib - 3º Congresso Ibérico de Tribologia - 1 - 15 - Guimarães, Portugal - \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/oai_mediarep_org_doc_2534.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/oai_mediarep_org_doc_2534.xml deleted file mode 100644 index 1edfb4608..000000000 --- a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/oai_mediarep_org_doc_2534.xml +++ /dev/null @@ -1,59 +0,0 @@ - -
- oai:mediarep.org:doc/2534 - 2020-03-27T11:35:21Z - com_doc_568 - com_doc_179 - col_doc_1963 - openaire -
- - - Die autogerechte Stadt (Rückführung) - - - Schmitt, Arne - - - - https://mediarep.org/handle/doc/2534 - - - 1869-1722 - - deu - diaphanes - 2015 - - journal article - - Bildstrecke ausgewählter Abbildungen aus Hans Bernhard Reichows Buch DIE AUTOGERECHTE STADT. - application/pdf - http://dx.doi.org/10.25969/mediarep/1436 - - open access - - - Infrastruktur - Urbanität - Mobilität - 300 - - - VoR - - https://mediarep.org/bitstream/doc/2534/1/ZfM_12_104-133_Schmitt_Die_autogerechte_Stadt.pdf - Zeitschrift für Medienwissenschaft - 7 - 1 - 104 - 113 - Researchers - Students - - -
\ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_guidelines_record.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_guidelines_record.xml deleted file mode 100644 index 1c8ef0d3a..000000000 --- a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_guidelines_record.xml +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - - National Institute for Environmental Studies and Center - for Climate System Research Japan - - A survey - - - - - - Evans, R.J. - Robert - Evans - Institute of Science and Technology - - 1234-1234-1234-1234 - - - - - - - - i-5uvLfGD2R.y8fXv442y8o.D6v1 - givenName3 - familyName3 - MuDRo.ymY7qG1Or.9Ny - a1Sp - affiliation5 - affiliation6 - - - ZxHQr4TGMeqqEOTbMcNd_EMFvIf - givenName4 - familyName4 - nJNWaCv - RobhZUU3Obaqq-3UlrgxKNe-c0 - affiliation7 - affiliation8 - - - - - - - pZT7SjviMp4yodvIG - ff238yCNbhv5k5Y8AmbpyzYt - svI_6A - E- - award - - - funderName - QwqkD4y - yyADU1SFmiS-meqQ6 - JyORkpwzt - gyg2F9EKT7Gp.mSBU8.Drqf - - - - - - 10.1002/chem.201701589 - PMC5574022 - - - - - - 0947-6539 - - 1521-3765 - - - - - 2011-12-01 - 2012-12-31 - - 2011 - - - - eng - deu - nld - - - - Loughborough University. Department of Computer Science - - John Wiley & Sons, Inc. (US) - - - report - - - - Foreword [by] Hazel Anderson; Introduction; The scientific heresy: - transformation of a society; Consciousness as causal reality [etc] - - - A number of problems in quantum state and system identification are - addressed. - - - - video/quicktime - application/pdf - application/xml - - - http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-160648 - - - embargoed access - - - Ecology Letters (1461023X) vol.4 (2001) - Ecology Letters (1461025X) vol.5 (2002) - - - - ft12Zy - Bf6AElMCkh.mqutOmETp0 - - - - Creative Commons Attribution-NonCommercial - - - 2000-2010 - - scheme=historic; content=Ming Dynasty - - - - - 15 pages - 6 MB - - - - - - Atlantic Ocean - - 31.233 - -67.302 - - - -71.032 - -68.211 - 41.090 - 42.893 - - - - 88.557 - -0.604 - - - -143.373 - 88.832 - - - 87.3 - -36.556 - - - 129.128 - 4.616 - - - -17.547 - -47.629 - - - - - 78.121 - 19.341 - - - -118.035 - 53.647 - - - -49.07 - -45.561 - - - 132.484 - -41.146 - - - 179.293 - 15.364 - - - - - - - NA - - - http://link-to-the-fulltext.org - http://europepmc.org/articles/PMC5574022?pdf=render - - - some Journal Title - - - 10 - - - 1 - - - 100 - - - 105 - - - 2 - - - Berlin - - - 2013-10-22 - 2013-10-23 - - - Researchers - Students - \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_invalid_guidelines_record.xml b/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_invalid_guidelines_record.xml deleted file mode 100644 index 32b9a4711..000000000 --- a/dhp-workflows/dhp-continuous-validation/src/test/resources/openaireguidelinesV4/v4_literature_all_invalid_guidelines_record.xml +++ /dev/null @@ -1,197 +0,0 @@ - - - - - - - National Institute for Environmental Studies and Center - for Climate System Research Japan - - - - - - - - - - - - - - - - - - - - - - - MuDRo.ymY7qG1Or.9Ny - a1Sp - affiliation5 - affiliation6 - - - - - - - - ff238yCNbhv5k5Y8AmbpyzYt - svI_6A - E- - - - - funderName - QwqkD4y - yyADU1SFmiS-meqQ6 - JyORkpwzt - gyg2F9EKT7Gp.mSBU8.Drqf - - - - - - 10.1002/chem.201701589 - PMC5574022 - - - - - - 0947-6539 - - 1521-3765 - - - - - 2011-12-01 - 2012-12-01 - - 2011 - - - - eng - deu - invalidTag - - - - Loughborough University. Department of Computer Science - - John Wiley & Sons, Inc. (US) - - - report - - - - Foreword [by] Hazel Anderson; Introduction; The scientific heresy: - transformation of a society; Consciousness as causal reality [etc] - - - A number of problems in quantum state and system identification are - addressed. - - - - video/quicktime - application/pdf - application/invalid - - - http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-160648 - - - invalid - - - Ecology Letters (1461023X) vol.4 (2001) - Ecology Letters (1461025X) vol.5 (2002) - - - - ft12Zy - Bf6AElMCkh.mqutOmETp0 - - - - Creative Commons Attribution-NonCommercial - - - 2000-2010 - - scheme=historic; content=Ming Dynasty - - - - - - - - - - - - Atlantic Ocean - - 31.233 - -67.302 - - - -71.032 - -68.211 - 41.090 - 42.893 - - - - - - irrelevant - - - http://link-to-the-fulltext.org - http://europepmc.org/articles/PMC5574022?pdf=render - - - some Journal Title - - - 10 - - - 1 - - - 100 - - - 105 - - - 2 - - - Berlin - - - 2013-02-29 - - - Researchers - Students - \ No newline at end of file diff --git a/dhp-workflows/dhp-continuous-validation/src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet b/dhp-workflows/dhp-continuous-validation/src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet deleted file mode 100644 index f9e316930..000000000 Binary files a/dhp-workflows/dhp-continuous-validation/src/test/resources/part-00589-733117df-3822-4fce-bded-17289cc5959a-c000.snappy.parquet and /dev/null differ