Master branch updates from beta September 2023 #337
|
@ -1,3 +1,10 @@
|
|||
package eu.dnetlib.dhp.application.dedup.log
|
||||
|
||||
case class DedupLogModel(tag:String, configuration:String, entity:String, startTS:Long, endTS:Long, totalMs:Long ) {}
|
||||
case class DedupLogModel(
|
||||
tag: String,
|
||||
configuration: String,
|
||||
entity: String,
|
||||
startTS: Long,
|
||||
endTS: Long,
|
||||
totalMs: Long
|
||||
) {}
|
||||
|
|
|
@ -2,15 +2,13 @@ package eu.dnetlib.dhp.application.dedup.log
|
|||
|
||||
import org.apache.spark.sql.{SaveMode, SparkSession}
|
||||
|
||||
class DedupLogWriter (path:String) {
|
||||
class DedupLogWriter(path: String) {
|
||||
|
||||
|
||||
def appendLog(dedupLogModel: DedupLogModel, spark:SparkSession): Unit = {
|
||||
def appendLog(dedupLogModel: DedupLogModel, spark: SparkSession): Unit = {
|
||||
import spark.implicits._
|
||||
val df = spark.createDataset[DedupLogModel](data = List(dedupLogModel))
|
||||
df.write.mode(SaveMode.Append).save(path)
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,8 +4,6 @@ package eu.dnetlib.dhp.oa.dedup;
|
|||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
|
||||
import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -22,6 +20,8 @@ import org.slf4j.LoggerFactory;
|
|||
import org.xml.sax.SAXException;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
|
||||
import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
|
@ -74,11 +74,9 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
log.info("actionSetId: '{}'", actionSetId);
|
||||
log.info("workingPath: '{}'", workingPath);
|
||||
|
||||
|
||||
final String dfLogPath = parser.get("dataframeLog");
|
||||
final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN");
|
||||
|
||||
|
||||
// for each dedup configuration
|
||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||
|
||||
|
@ -93,7 +91,6 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
|
||||
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||
.repartition(numPartitions)
|
||||
|
@ -120,7 +117,8 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
saveParquet(simRels, outputPath, SaveMode.Overwrite);
|
||||
final long end = System.currentTimeMillis();
|
||||
if (StringUtils.isNotBlank(dfLogPath)) {
|
||||
final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(),entity, start, end, end-start);
|
||||
final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(), entity, start, end,
|
||||
end - start);
|
||||
new DedupLogWriter(dfLogPath).appendLog(model, spark);
|
||||
|
||||
}
|
||||
|
|
|
@ -134,7 +134,8 @@
|
|||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--numPartitions</arg><arg>15000</arg>
|
||||
</spark>
|
||||
<ok to="WhitelistSimRels"/>
|
||||
<!-- <ok to="WhitelistSimRels"/>-->
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
|
Loading…
Reference in New Issue