[aggregator graph] save invalid records aside for further inspection

2022-09-16 14:06:28 +02:00 · 2022-09-16 14:06:28 +02:00 · e370e940d8
parent 1e42d984e1
commit e370e940d8
6 changed files with 191 additions and 46 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@ -79,9 +79,6 @@ public class GenerateEntitiesApplication {
 		final String targetPath = parser.get("targetPath");
 		log.info("targetPath: {}", targetPath);
 		final String invalidPath = parser.get("invalidPath");
 		log.info("invalidPath: {}", invalidPath);
 		final String isLookupUrl = parser.get("isLookupUrl");
 		log.info("isLookupUrl: {}", isLookupUrl);
@ -103,8 +100,7 @@ public class GenerateEntitiesApplication {
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
-			HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration());
+			generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId, mode);
 			generateEntities(spark, vocs, sourcePaths, targetPath, invalidPath, shouldHashId, mode);
 		});
 	}
@ -113,7 +109,6 @@ public class GenerateEntitiesApplication {
 		final VocabularyGroup vocs,
 		final String sourcePaths,
 		final String targetPath,
 		final String invalidPath,
 		final boolean shouldHashId,
 		final Mode mode) {
@ -126,21 +121,6 @@ public class GenerateEntitiesApplication {
 		log.info("Generate entities from files:");
 		existingSourcePaths.forEach(log::info);
 		for (final String sp : existingSourcePaths) {
 			RDD<String> invalidRecords = sc
 				.sequenceFile(sp, Text.class, Text.class)
 				.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
 				.map(k -> tryApplyMapping(k._1(), k._2(), shouldHashId, vocs))
 				.filter(Objects::nonNull)
 				.rdd();
 			spark
 				.createDataset(invalidRecords, Encoders.STRING())
 				.write()
 				.mode(SaveMode.Append)
 				.option("compression", "gzip")
 				.text(invalidPath);
 		}
 		JavaRDD<Oaf> inputRdd = sc.emptyRDD();
 		for (final String sp : existingSourcePaths) {
@ -178,7 +158,7 @@ public class GenerateEntitiesApplication {
 			.saveAsTextFile(targetPath, GzipCodec.class);
 	}
-	private static List<Oaf> convertToListOaf(
+	public static List<Oaf> convertToListOaf(
 		final String id,
 		final String s,
 		final boolean shouldHashId,
@ -219,19 +199,6 @@ public class GenerateEntitiesApplication {
 		}
 	}
 	private static String tryApplyMapping(
 		final String id,
 		final String s,
 		final boolean shouldHashId,
 		final VocabularyGroup vocs) {
 		final List<Oaf> oaf = convertToListOaf(id, s, shouldHashId, vocs);
 		if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) {
 			return s;
 		}
 		return null;
 	}
 	private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
 		try {
 			return OBJECT_MAPPER.readValue(s, clazz);
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/VerifyRecordsApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/VerifyRecordsApplication.java
@ -0,0 +1,108 @@
 package eu.dnetlib.dhp.oa.graph.raw;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import scala.Tuple2;
 public class VerifyRecordsApplication {
 	private static final Logger log = LoggerFactory.getLogger(VerifyRecordsApplication.class);
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					VerifyRecordsApplication.class
 						.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json")));
 		parser.parseArgument(args);
 		final Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String sourcePaths = parser.get("sourcePaths");
 		log.info("sourcePaths: {}", sourcePaths);
 		final String invalidPath = parser.get("invalidPath");
 		log.info("invalidPath: {}", invalidPath);
 		final String isLookupUrl = parser.get("isLookupUrl");
 		log.info("isLookupUrl: {}", isLookupUrl);
 		final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
 		final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 			HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration());
 			validateRecords(spark, sourcePaths, invalidPath, vocs);
 		});
 	}
 	private static void validateRecords(SparkSession spark, String sourcePaths, String invalidPath,
 		VocabularyGroup vocs) {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		final List<String> existingSourcePaths = Arrays
 			.stream(sourcePaths.split(","))
 			.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
 			.collect(Collectors.toList());
 		log.info("Verify records in files:");
 		existingSourcePaths.forEach(log::info);
 		for (final String sp : existingSourcePaths) {
 			RDD<String> invalidRecords = sc
 				.sequenceFile(sp, Text.class, Text.class)
 				.map(k -> tryApplyMapping(k._1().toString(), k._2().toString(), true, vocs))
 				.filter(Objects::nonNull)
 				.rdd();
 			spark
 				.createDataset(invalidRecords, Encoders.STRING())
 				.write()
 				.mode(SaveMode.Append)
 				.option("compression", "gzip")
 				.text(invalidPath);
 		}
 	}
 	private static String tryApplyMapping(
 		final String id,
 		final String xmlRecord,
 		final boolean shouldHashId,
 		final VocabularyGroup vocs) {
 		final List<Oaf> oaf = GenerateEntitiesApplication.convertToListOaf(id, xmlRecord, shouldHashId, vocs);
 		if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) {
 			return xmlRecord;
 		}
 		return null;
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
@ -24,8 +25,11 @@ import org.apache.http.impl.client.HttpClients;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.graph.raw.OafToOafMapper;
 import eu.dnetlib.dhp.oa.graph.raw.OdfToOafMapper;
 import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.utils.DHPUtils;
 public class AbstractMigrationApplication implements Closeable {
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json
@ -17,12 +17,6 @@
 		"paramDescription": "the path of the target file",
 		"paramRequired": true
 	},
 	{
 		"paramName": "i",
 		"paramLongName": "invalidPath",
 		"paramDescription": "the path of the invalid records file",
 		"paramRequired": false
 	},
 	{
 		"paramName": "isu",
 		"paramLongName": "isLookupUrl",
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
@ -446,10 +446,34 @@
    <join name="wait_import" to="fork_generate_entities"/>
    <fork name="fork_generate_entities">
-        <path start="GenerateEntities_claim"/>
+        <path start="VerifyRecords_claim"/>
-        <path start="GenerateEntities"/>
+        <path start="VerifyRecords"/>
    </fork>
    <action name="VerifyRecords_claim">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>VerifyRecords_claim</name>
            <class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
            <arg>--invalidPath</arg><arg>${workingDir}/invalid_records_claim</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
        </spark>
        <ok to="GenerateEntities_claim"/>
        <error to="Kill"/>
    </action>
    <action name="GenerateEntities_claim">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -468,7 +492,6 @@
            </spark-opts>
            <arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
            <arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
            <arg>--invalidPath</arg><arg>${workingDir}/invalid_records_claim</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--shouldHashId</arg><arg>${shouldHashId}</arg>
            <arg>--mode</arg><arg>claim</arg>
@ -500,6 +523,30 @@
        <error to="Kill"/>
    </action>
    <action name="VerifyRecords">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>VerifyRecords</name>
            <class>eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePaths</arg><arg>${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible</arg>
            <arg>--invalidPath</arg><arg>${workingDir}/invalid_records</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
        </spark>
        <ok to="GenerateEntities"/>
        <error to="Kill"/>
    </action>
    <action name="GenerateEntities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -518,7 +565,6 @@
            </spark-opts>
            <arg>--sourcePaths</arg><arg>${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible</arg>
            <arg>--targetPath</arg><arg>${workingDir}/entities</arg>
            <arg>--invalidPath</arg><arg>${workingDir}/invalid_records</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--shouldHashId</arg><arg>${shouldHashId}</arg>
        </spark>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json
@ -0,0 +1,26 @@
 [
 	{
 		"paramName": "issm",
 		"paramLongName": "isSparkSessionManaged",
 		"paramDescription": "when true will stop SparkSession after job execution",
 		"paramRequired": false
 	},
 	{
 		"paramName": "s",
 		"paramLongName": "sourcePaths",
 		"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
 		"paramRequired": true
 	},
 	{
 		"paramName": "i",
 		"paramLongName": "invalidPath",
 		"paramDescription": "the path of the invalid records file",
 		"paramRequired": false
 	},
 	{
 		"paramName": "isu",
 		"paramLongName": "isLookupUrl",
 		"paramDescription": "the url of the ISLookupService",
 		"paramRequired": true
 	}
 ]