forked from D-Net/dnet-hadoop
Added to CreateSimRel a feature to write a time log
This commit is contained in:
parent
b195da3a83
commit
bd17c3edc8
|
@ -4,7 +4,10 @@ package eu.dnetlib.dhp.oa.dedup;
|
|||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
|
||||
import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
@ -20,8 +23,6 @@ import org.xml.sax.SAXException;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
|
@ -73,9 +74,16 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
log.info("actionSetId: '{}'", actionSetId);
|
||||
log.info("workingPath: '{}'", workingPath);
|
||||
|
||||
|
||||
final String dfLogPath = parser.get("dataframeLog");
|
||||
final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN");
|
||||
|
||||
|
||||
// for each dedup configuration
|
||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||
|
||||
final long start = System.currentTimeMillis();
|
||||
|
||||
final String entity = dedupConf.getWf().getEntityType();
|
||||
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
||||
log.info("Creating simrels for: '{}'", subEntity);
|
||||
|
@ -85,6 +93,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
|
||||
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||
.repartition(numPartitions)
|
||||
|
@ -109,6 +118,13 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
Encoders.bean(Relation.class));
|
||||
|
||||
saveParquet(simRels, outputPath, SaveMode.Overwrite);
|
||||
final long end = System.currentTimeMillis();
|
||||
if (StringUtils.isNotBlank(dfLogPath)) {
|
||||
final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(),entity, start, end, end-start);
|
||||
new DedupLogWriter(dfLogPath).appendLog(model, spark);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -28,5 +28,17 @@
|
|||
"paramLongName": "numPartitions",
|
||||
"paramDescription": "number of partitions for the similarity relations intermediate phases",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "dl",
|
||||
"paramLongName": "dataframeLog",
|
||||
"paramDescription": "the path of the dataframe Log",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rt",
|
||||
"paramLongName": "runTAG",
|
||||
"paramDescription": "the label of the current run",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
Loading…
Reference in New Issue