From bd17c3edc807753886cfbb3a6448265d3407ea12 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Wed, 28 Jun 2023 11:20:58 +0200
Subject: [PATCH] added to CreateSimRel the feature to write a time log

---
 .../dhp/oa/dedup/SparkCreateSimRels.java        | 20 +++++++++++++++++--
 .../oa/dedup/createSimRels_parameters.json      | 12 +++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
index 3aa8f241d..c015078c5 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
@@ -4,7 +4,10 @@ package eu.dnetlib.dhp.oa.dedup;
 import java.io.IOException;
 import java.util.Optional;
 
+import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
+import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -20,8 +23,6 @@ import org.xml.sax.SAXException;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.dedup.model.Block;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -73,9 +74,16 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 		log.info("actionSetId:   '{}'", actionSetId);
 		log.info("workingPath:   '{}'", workingPath);
 
+
+		final String dfLogPath = parser.get("dataframeLog");
+		final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN");
+
+
 		// for each dedup configuration
 		for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
 
+			final long start = System.currentTimeMillis();
+
 			final String entity = dedupConf.getWf().getEntityType();
 			final String subEntity = dedupConf.getWf().getSubEntityValue();
 			log.info("Creating simrels for: '{}'", subEntity);
@@ -85,6 +93,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
 
 			JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
+
 			JavaPairRDD<String, MapDocument> mapDocuments = sc
 				.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 				.repartition(numPartitions)
@@ -109,6 +118,13 @@
 					Encoders.bean(Relation.class));
 
 			saveParquet(simRels, outputPath, SaveMode.Overwrite);
+			final long end = System.currentTimeMillis();
+			if (StringUtils.isNotBlank(dfLogPath)) {
+				final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(), entity, start, end, end - start);
+				new DedupLogWriter(dfLogPath).appendLog(model, spark);
+
+			}
+
 		}
 	}
 
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
index 09f4365d3..c97de3482 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
@@ -28,5 +28,17 @@
"paramLongName": "numPartitions", "paramDescription": "number of partitions for the similarity relations intermediate phases", "paramRequired": false + }, + { + "paramName": "dl", + "paramLongName": "dataframeLog", + "paramDescription": "the path of the dataframe Log", + "paramRequired": false + }, + { + "paramName": "rt", + "paramLongName": "runTAG", + "paramDescription": "the label of the current running", + "paramRequired": false } ] \ No newline at end of file