forked from D-Net/dnet-hadoop
added to CreateSimRel the feature to write time log
parent 2717edafb7
commit 9910ce06ae
@@ -1,3 +1,10 @@
 package eu.dnetlib.dhp.application.dedup.log
 
-case class DedupLogModel(tag:String, configuration:String, entity:String, startTS:Long, endTS:Long, totalMs:Long ) {}
+case class DedupLogModel(
+  tag: String,
+  configuration: String,
+  entity: String,
+  startTS: Long,
+  endTS: Long,
+  totalMs: Long
+) {}
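For context, a minimal sketch (not part of this commit) of how one record of this model is meant to be filled: the timestamps follow the System.currentTimeMillis() pattern used in SparkCreateSimRels below, and totalMs is their difference; the tag, configuration and entity values here are purely illustrative.

    // illustrative values only; timing fields mirror the pattern used in SparkCreateSimRels
    val start = System.currentTimeMillis()
    // ... the step being measured ...
    val end = System.currentTimeMillis()
    val record = DedupLogModel(
      tag = "run-1",              // hypothetical run tag
      configuration = "{ ... }",  // dedup configuration serialized to a string
      entity = "publication",     // hypothetical entity name
      startTS = start,
      endTS = end,
      totalMs = end - start
    )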
@@ -2,15 +2,13 @@ package eu.dnetlib.dhp.application.dedup.log
 
 import org.apache.spark.sql.{SaveMode, SparkSession}
 
-class DedupLogWriter (path:String) {
+class DedupLogWriter(path: String) {
 
-  def appendLog(dedupLogModel: DedupLogModel, spark:SparkSession): Unit = {
+  def appendLog(dedupLogModel: DedupLogModel, spark: SparkSession): Unit = {
     import spark.implicits._
     val df = spark.createDataset[DedupLogModel](data = List(dedupLogModel))
     df.write.mode(SaveMode.Append).save(path)
-
-
   }
 
 }
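A minimal usage sketch for the writer, assuming a local SparkSession and a hypothetical output path. Since save(path) is called without an explicit format, Spark's default data source is used (Parquet, unless spark.sql.sources.default is overridden), so the accumulated log can be read back with spark.read.load.

    // illustrative only: append one record, then read the accumulated log back
    import org.apache.spark.sql.SparkSession
    import eu.dnetlib.dhp.application.dedup.log.{DedupLogModel, DedupLogWriter}

    val spark = SparkSession.builder().master("local[*]").appName("dedup-log-demo").getOrCreate()
    import spark.implicits._

    val logPath = "/tmp/dedup/time-log" // hypothetical path
    val record = DedupLogModel("test-run", "{ ... }", "publication", 0L, 42L, 42L)

    new DedupLogWriter(logPath).appendLog(record, spark)

    // each appendLog call appends one row to the dataset at logPath
    spark.read.load(logPath).as[DedupLogModel].show(truncate = false)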
@@ -4,8 +4,6 @@ package eu.dnetlib.dhp.oa.dedup;
 import java.io.IOException;
 import java.util.Optional;
 
-import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
-import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
@@ -22,6 +20,8 @@ import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
+import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
 import eu.dnetlib.dhp.oa.dedup.model.Block;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
@@ -74,11 +74,9 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 		log.info("actionSetId: '{}'", actionSetId);
 		log.info("workingPath: '{}'", workingPath);
 
-
-
 		final String dfLogPath = parser.get("dataframeLog");
 		final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN");
 
 		// for each dedup configuration
 		for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
@@ -93,7 +91,6 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 
 			JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
-
 			JavaPairRDD<String, MapDocument> mapDocuments = sc
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 				.repartition(numPartitions)
@@ -120,7 +117,8 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 			saveParquet(simRels, outputPath, SaveMode.Overwrite);
 			final long end = System.currentTimeMillis();
 			if (StringUtils.isNotBlank(dfLogPath)) {
-				final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(),entity, start, end, end-start);
+				final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(), entity, start, end,
+					end - start);
 				new DedupLogWriter(dfLogPath).appendLog(model, spark);
 
 			}
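The block above takes the end timestamp right after saveParquet, builds one DedupLogModel row per dedup configuration (with end - start as totalMs), and appends it only when a dataframeLog path is supplied. As a hedged follow-up, not part of this commit, the accumulated log could later be summarized per run tag and entity, assuming dfLogPath points at the dataset written by DedupLogWriter.

    // illustrative only: summarize the time log written by DedupLogWriter
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{avg, sum}

    val spark = SparkSession.builder().getOrCreate()
    val dfLogPath = "/tmp/dedup/time-log" // hypothetical: the path handed to the job as dataframeLog

    spark.read
      .load(dfLogPath)                    // DedupLogWriter writes with Spark's default (Parquet) source
      .groupBy("tag", "entity")
      .agg(sum("totalMs").as("totalMs"), avg("totalMs").as("avgMs"))
      .orderBy("tag", "entity")
      .show(truncate = false)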
@@ -134,7 +134,8 @@
                 <arg>--workingPath</arg><arg>${workingPath}</arg>
                 <arg>--numPartitions</arg><arg>15000</arg>
             </spark>
-            <ok to="WhitelistSimRels"/>
+            <!-- <ok to="WhitelistSimRels"/>-->
+            <ok to="End"/>
             <error to="Kill"/>
         </action>
 