forked from D-Net/dnet-hadoop
- Fix acquiring the "openaire_guidelines" parameter.
- Use the right Guidelines profile, depending on the "openaire_guidelines" version.
- Update log levels.
- Optimize imports.
This commit is contained in:
parent b71633fd7f
commit a2feda6c07
eu/dnetlib/dhp/continuous_validator/ContinuousValidator.java

@@ -1,34 +1,26 @@
 package eu.dnetlib.dhp.continuous_validator;
 
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.validator2.validation.XMLApplicationProfile;
+import eu.dnetlib.validator2.validation.guideline.openaire.*;
+import eu.dnetlib.validator2.validation.utils.TestUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.slf4j.LoggerFactory;
+import scala.Option;
+
 import java.io.BufferedWriter;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 
-import javax.jws.WebParam;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.*;
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.Gson;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.validator2.validation.XMLApplicationProfile;
-import eu.dnetlib.validator2.validation.guideline.openaire.AbstractOpenAireProfile;
-import eu.dnetlib.validator2.validation.guideline.openaire.LiteratureGuidelinesV4Profile;
-import eu.dnetlib.validator2.validation.utils.TestUtils;
-
-import scala.Option;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
 public class ContinuousValidator {
 
@@ -80,9 +72,9 @@ public class ContinuousValidator {
             return;
         }
 
-        guidelines = parser.get("guidelines");
+        guidelines = parser.get("openaire_guidelines");
         if (guidelines == null) {
-            logger.error("The \"guidelines\" was not retrieved from the parameters file: " + parametersFile);
+            logger.error("The \"openaire_guidelines\" was not retrieved from the parameters file: " + parametersFile);
             return;
         }
 
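Note: "openaire_guidelines" has to match the paramLongName declared in the JSON parameters file that ArgumentApplicationParser is initialized with; parser.get("guidelines") returned null because nothing was declared under that name. A minimal sketch of the surrounding dnet-hadoop parsing pattern, assuming the usual resource-based setup (the resource path below is hypothetical, not the project's actual file):

    // Hedged sketch; only parser.get("openaire_guidelines") is taken from the commit itself.
    String jsonConf = IOUtils
        .toString(
            ContinuousValidator.class
                .getResourceAsStream("/eu/dnetlib/dhp/continuous_validator/input_parameters.json"), // hypothetical path
            StandardCharsets.UTF_8);
    ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
    parser.parseArgument(args);
    String guidelines = parser.get("openaire_guidelines"); // looked up by its long name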
@@ -115,7 +107,27 @@ public class ContinuousValidator {
                 "Will validate the contents of parquetFile: \"" + parquet_file_path + "\", against guidelines: \""
                     + guidelines + "\"" + " and will output the results in: " + outputPath + RESULTS_FILE_NAME);
 
-        AbstractOpenAireProfile profile = new LiteratureGuidelinesV4Profile();
+        AbstractOpenAireProfile profile;
+        switch (guidelines) {
+            case "4.0":
+                profile = new LiteratureGuidelinesV4Profile();
+                break;
+            case "3.0":
+                profile = new LiteratureGuidelinesV3Profile();
+                break;
+            case "2.0":
+                profile = new DataArchiveGuidelinesV2Profile();
+                break;
+            case "fair_data":
+                profile = new FAIR_Data_GuidelinesProfile();
+                break;
+            case "fair_literature_v4":
+                profile = new FAIR_Literature_GuidelinesV4Profile();
+                break;
+            default:
+                logger.error("Invalid OpenAIRE Guidelines were given: " + guidelines);
+                return;
+        }
 
         SparkConf conf = new SparkConf();
         conf.setAppName(ContinuousValidator.class.getSimpleName());
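Note: the switch is the commit's version-to-profile dispatch; every supported "openaire_guidelines" value maps to one concrete AbstractOpenAireProfile, and anything else aborts the job early. The same dispatch could be written as a lookup table; a sketch for illustration only (not part of the commit, assuming the no-arg constructors shown above; needs java.util.HashMap, java.util.Map and java.util.function.Supplier):

    // Table-driven alternative to the switch; suppliers defer construction until needed.
    Map<String, Supplier<AbstractOpenAireProfile>> profileSuppliers = new HashMap<>();
    profileSuppliers.put("4.0", LiteratureGuidelinesV4Profile::new);
    profileSuppliers.put("3.0", LiteratureGuidelinesV3Profile::new);
    profileSuppliers.put("2.0", DataArchiveGuidelinesV2Profile::new);
    profileSuppliers.put("fair_data", FAIR_Data_GuidelinesProfile::new);
    profileSuppliers.put("fair_literature_v4", FAIR_Literature_GuidelinesV4Profile::new);

    Supplier<AbstractOpenAireProfile> supplier = profileSuppliers.get(guidelines);
    if (supplier == null) {
        logger.error("Invalid OpenAIRE Guidelines were given: " + guidelines);
        return;
    }
    AbstractOpenAireProfile profile = supplier.get();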
@@ -146,8 +158,10 @@ public class ContinuousValidator {
         Dataset<XMLApplicationProfile.ValidationResult> validationResultsDataset = parquetFileDF
             .map(validateMapFunction, Encoders.bean(XMLApplicationProfile.ValidationResult.class));
 
-        logger.info("Showing a few validation-results.. just for checking");
-        validationResultsDataset.show(5);
+        if (logger.isTraceEnabled()) {
+            logger.trace("Showing a few validation-results.. just for checking");
+            validationResultsDataset.show(5);
+        }
 
         // Write the results to json file immediately, without converting them to a list.
         validationResultsDataset
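Note: Dataset.show(5) is a Spark action that triggers an extra job just to print sample rows, so gating it behind logger.isTraceEnabled() removes that cost from normal runs instead of merely hiding the output. The same guard fits any log-only Spark action; a sketch (the count() here is illustrative, not part of the commit):

    // Only pay for the extra Spark job when trace logging is actually enabled.
    if (logger.isTraceEnabled()) {
        logger.trace("Produced {} validation-results in total.", validationResultsDataset.count());
    }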
@@ -155,9 +169,9 @@ public class ContinuousValidator {
             .option("compression", "gzip")
             .mode(SaveMode.Overwrite)
             .json(finalOutputPath + RESULTS_FILE_NAME); // The filename should be the name of the input-file or the
                                                         // input-directory.
 
-        if (logger.isDebugEnabled()) {
+        if (logger.isTraceEnabled()) {
             List<XMLApplicationProfile.ValidationResult> validationResultsList = validationResultsDataset
                 .javaRDD()
                 .collect();
@@ -167,9 +181,9 @@ public class ContinuousValidator {
                 return;
             }
 
-            validationResultsList.forEach(vr -> logger.debug(vr.id() + " | score:" + vr.score()));
+            validationResultsList.forEach(vr -> logger.trace(vr.id() + " | score:" + vr.score()));
             for (XMLApplicationProfile.ValidationResult result : validationResultsList)
-                logger.debug(result.toString());
+                logger.trace(result.toString());
         }
 
         // TODO - REMOVE THIS WHEN THE WRITE FROM ABOVE IS OK
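Note: the collect() inside this guard materializes every validation result on the driver, which is only safe for small datasets; demoting the block from debug to trace keeps it out of routine runs. A bounded sketch that caps driver memory instead (illustrative, not in the commit):

    // takeAsList(20) ships at most 20 results to the driver, unlike collect().
    if (logger.isTraceEnabled()) {
        for (XMLApplicationProfile.ValidationResult vr : validationResultsDataset.takeAsList(20))
            logger.trace(vr.id() + " | score:" + vr.score());
    }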
@@ -180,6 +194,7 @@ public class ContinuousValidator {
          * outputPath + RESULTS_FILE); return; }
          */
 
+        // TODO - Maybe the following section is not needed, when ran as an oozie workflow..
         Option<String> uiWebUrl = spark.sparkContext().uiWebUrl();
         if (uiWebUrl.isDefined()) {
             logger
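Note: sparkContext().uiWebUrl() returns a scala.Option<String> even when called from Java, hence the isDefined() check before any get(). A minimal sketch of the interop (the log message is illustrative; the commit's own message lies outside this hunk):

    // scala.Option interop from Java: test isDefined() before unwrapping with get().
    Option<String> uiWebUrl = spark.sparkContext().uiWebUrl();
    if (uiWebUrl.isDefined())
        logger.info("Spark UI available at: " + uiWebUrl.get());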
Logback configuration:

@@ -33,6 +33,7 @@
         <appender-ref ref="Console" />
     </root>
 
+    <!-- TODO - Change the level below to "debug" -->
     <logger name="eu.dnetlib.dhp.continuous_validator" level="trace"/>
     <logger name="eu.dnetlib.validator2" level="error"/>
     <logger name="org.sparkproject" level="info"/>
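Note: this logger level is what the new logger.isTraceEnabled() guards in ContinuousValidator react to. With level="trace", the sample show(5) and the collect() diagnostics run; following the TODO and lowering it to <logger name="eu.dnetlib.dhp.continuous_validator" level="debug"/> disables them again without any code change.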
ReadResultsTest.java

@@ -1,14 +1,12 @@
+import com.google.gson.Gson;
+import eu.dnetlib.dhp.continuous_validator.ContinuousValidator;
+import org.slf4j.LoggerFactory;
+
 import java.io.BufferedReader;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.util.List;
 
-import org.slf4j.LoggerFactory;
-
-import com.google.gson.Gson;
-
-import eu.dnetlib.dhp.continuous_validator.ContinuousValidator;
-
 public class ReadResultsTest {
 
     private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ContinuousValidator.class);