forked from D-Net/dnet-hadoop
Added to SparkCreateSimRels the ability to write a timing log for each dedup configuration
This commit is contained in:
parent
b195da3a83
commit
bd17c3edc8
|
@ -4,7 +4,10 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
|
||||||
|
import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
@ -20,8 +23,6 @@ import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
@ -73,9 +74,16 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
log.info("workingPath: '{}'", workingPath);
|
log.info("workingPath: '{}'", workingPath);
|
||||||
|
|
||||||
|
|
||||||
|
final String dfLogPath = parser.get("dataframeLog");
|
||||||
|
final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN");
|
||||||
|
|
||||||
|
|
||||||
// for each dedup configuration
|
// for each dedup configuration
|
||||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||||
|
|
||||||
|
final long start = System.currentTimeMillis();
|
||||||
|
|
||||||
final String entity = dedupConf.getWf().getEntityType();
|
final String entity = dedupConf.getWf().getEntityType();
|
||||||
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
||||||
log.info("Creating simrels for: '{}'", subEntity);
|
log.info("Creating simrels for: '{}'", subEntity);
|
||||||
|
@ -85,6 +93,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
|
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
|
||||||
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
||||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||||
.repartition(numPartitions)
|
.repartition(numPartitions)
|
||||||
|
@ -109,6 +118,13 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
|
||||||
saveParquet(simRels, outputPath, SaveMode.Overwrite);
|
saveParquet(simRels, outputPath, SaveMode.Overwrite);
|
||||||
|
final long end = System.currentTimeMillis();
|
||||||
|
if (StringUtils.isNotBlank(dfLogPath)) {
|
||||||
|
final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(),entity, start, end, end-start);
|
||||||
|
new DedupLogWriter(dfLogPath).appendLog(model, spark);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,5 +28,17 @@
|
||||||
"paramLongName": "numPartitions",
|
"paramLongName": "numPartitions",
|
||||||
"paramDescription": "number of partitions for the similarity relations intermediate phases",
|
"paramDescription": "number of partitions for the similarity relations intermediate phases",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "dl",
|
||||||
|
"paramLongName": "dataframeLog",
|
||||||
|
"paramDescription": "the path of the dataframe Log",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "rt",
|
||||||
|
"paramLongName": "runTAG",
|
||||||
|
"paramDescription": "the label of the current run",
|
||||||
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
Loading…
Reference in New Issue