forked from D-Net/dnet-hadoop
WIP: transformation workflow error reporting
This commit is contained in:
parent
cc88701f29
commit
58467aaf1e
|
@ -8,13 +8,13 @@ import java.util.LinkedHashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.message.MessageSender;
|
import eu.dnetlib.dhp.message.MessageSender;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
|
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
|
||||||
|
|
||||||
|
|
|
@ -21,11 +21,14 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.message.MessageSender;
|
||||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import parquet.hadoop.ParquetReader;
|
||||||
|
|
||||||
public class TransformSparkJobNode {
|
public class TransformSparkJobNode {
|
||||||
|
|
||||||
|
@ -54,7 +57,7 @@ public class TransformSparkJobNode {
|
||||||
final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class);
|
final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class);
|
||||||
final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH;
|
final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH;
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
ParquetReader
|
||||||
final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class);
|
final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class);
|
||||||
final String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
|
final String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
|
||||||
log.info("outputBasePath: {}", outputBasePath);
|
log.info("outputBasePath: {}", outputBasePath);
|
||||||
|
@ -91,23 +94,42 @@ public class TransformSparkJobNode {
|
||||||
final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems);
|
final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems);
|
||||||
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
||||||
|
|
||||||
final Dataset<MetadataRecord> mdstore = spark
|
final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL);
|
||||||
.read()
|
log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
|
||||||
.format("parquet")
|
|
||||||
.load(inputPath)
|
|
||||||
.as(encoder)
|
|
||||||
.map(
|
|
||||||
TransformationFactory.getTransformationPlugin(args, ct, isLookUpService),
|
|
||||||
encoder);
|
|
||||||
saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH);
|
|
||||||
|
|
||||||
log.info("Transformed item " + ct.getProcessedItems().count());
|
final String workflowId = args.get("workflowId");
|
||||||
log.info("Total item " + ct.getTotalItems().count());
|
log.info("workflowId is {}", workflowId);
|
||||||
log.info("Transformation Error item " + ct.getErrorItems().count());
|
|
||||||
|
|
||||||
writeHdfsFile(
|
final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId);
|
||||||
spark.sparkContext().hadoopConfiguration(),
|
try (AggregatorReport report = new AggregatorReport(messageSender)) {
|
||||||
"" + spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(), outputBasePath + MDSTORE_SIZE_PATH);
|
try {
|
||||||
|
final Dataset<MetadataRecord> mdstore = spark
|
||||||
|
.read()
|
||||||
|
.format("parquet")
|
||||||
|
.load(inputPath)
|
||||||
|
.as(encoder)
|
||||||
|
.map(
|
||||||
|
TransformationFactory.getTransformationPlugin(args, ct, isLookUpService),
|
||||||
|
encoder);
|
||||||
|
saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH);
|
||||||
|
|
||||||
|
log.info("Transformed item " + ct.getProcessedItems().count());
|
||||||
|
log.info("Total item " + ct.getTotalItems().count());
|
||||||
|
log.info("Transformation Error item " + ct.getErrorItems().count());
|
||||||
|
|
||||||
|
final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count();
|
||||||
|
writeHdfsFile(
|
||||||
|
spark.sparkContext().hadoopConfiguration(),
|
||||||
|
"" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
log.error("error during record transformation", e);
|
||||||
|
report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage());
|
||||||
|
report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString());
|
||||||
|
report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString());
|
||||||
|
report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString());
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,14 @@
|
||||||
<name>isLookupUrl</name>
|
<name>isLookupUrl</name>
|
||||||
<description>The IS lookUp service endopoint</description>
|
<description>The IS lookUp service endopoint</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>workflowId</name>
|
||||||
|
<description>The identifier of the workflow</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dnetMessageManagerURL</name>
|
||||||
|
<description>The URI of the Dnet Message Manager</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="BeginRead"/>
|
<start to="BeginRead"/>
|
||||||
|
@ -95,6 +103,8 @@
|
||||||
<arg>--transformationPlugin</arg><arg>${transformationPlugin}</arg>
|
<arg>--transformationPlugin</arg><arg>${transformationPlugin}</arg>
|
||||||
<arg>--transformationRuleId</arg><arg>${transformationRuleId}</arg>
|
<arg>--transformationRuleId</arg><arg>${transformationRuleId}</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--workflowId</arg><arg>${workflowId}</arg>
|
||||||
|
<arg>--dnetMessageManagerURL</arg><arg>${dnetMessageManagerURL}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="EndRead"/>
|
<ok to="EndRead"/>
|
||||||
<error to="EndReadRollBack"/>
|
<error to="EndReadRollBack"/>
|
||||||
|
|
|
@ -36,9 +36,18 @@
|
||||||
"paramDescription": "the Information System Service LookUp URL",
|
"paramDescription": "the Information System Service LookUp URL",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "dm",
|
||||||
|
"paramLongName": "dnetMessageManagerURL",
|
||||||
|
"paramDescription": "the End point URL to send Messages",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "w",
|
||||||
|
"paramLongName": "workflowId",
|
||||||
|
"paramDescription": "the identifier of the dnet Workflow",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "tp",
|
"paramName": "tp",
|
||||||
"paramLongName": "transformationPlugin",
|
"paramLongName": "transformationPlugin",
|
||||||
|
|
Loading…
Reference in New Issue