Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
4 changed files with 17 additions and 13 deletions
Showing only changes of commit 9910ce06ae - Show all commits

View File

@ -1,3 +1,10 @@
package eu.dnetlib.dhp.application.dedup.log
case class DedupLogModel(tag:String, configuration:String, entity:String, startTS:Long, endTS:Long, totalMs:Long ) {}
/** One record of the dedup timing log.
  *
  * Field roles below are those used by the visible caller in `SparkCreateSimRels`.
  *
  * @param tag           tag identifying the run (the caller falls back to "UNKNOWN" when none is supplied)
  * @param configuration string form of the dedup configuration that was executed
  * @param entity        entity the dedup step was run for
  * @param startTS       start timestamp; presumably epoch milliseconds like `endTS` (confirm against caller)
  * @param endTS         end timestamp; the caller takes it from `System.currentTimeMillis()`
  * @param totalMs       elapsed time; the caller passes `endTS - startTS`
  */
case class DedupLogModel(
  tag: String,
  configuration: String,
  entity: String,
  startTS: Long,
  endTS: Long,
  totalMs: Long
)

View File

@ -2,15 +2,13 @@ package eu.dnetlib.dhp.application.dedup.log
import org.apache.spark.sql.{SaveMode, SparkSession}
/** Appends [[DedupLogModel]] records to the dataset stored at `path`.
  *
  * @param path target location passed to Spark's `DataFrameWriter.save`
  */
class DedupLogWriter (path:String) {
class DedupLogWriter(path: String) {
/** Writes a single log record, adding it to whatever is already stored at `path`.
  *
  * @param dedupLogModel the record to persist
  * @param spark         session used to build the one-row dataset and to supply its encoder
  */
def appendLog(dedupLogModel: DedupLogModel, spark:SparkSession): Unit = {
def appendLog(dedupLogModel: DedupLogModel, spark: SparkSession): Unit = {
// Session implicits provide the Encoder that createDataset needs for the case class.
import spark.implicits._
// Wrap the single record in a one-element Dataset so it can go through the DataFrameWriter.
val df = spark.createDataset[DedupLogModel](data = List(dedupLogModel))
// Append mode keeps existing rows; no format is set, so Spark's configured default data source is used.
df.write.mode(SaveMode.Append).save(path)
}
}

View File

@ -4,8 +4,6 @@ package eu.dnetlib.dhp.oa.dedup;
import java.io.IOException;
import java.util.Optional;
import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
@ -22,6 +20,8 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.application.dedup.log.DedupLogModel;
import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter;
import eu.dnetlib.dhp.oa.dedup.model.Block;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
@ -74,11 +74,9 @@ public class SparkCreateSimRels extends AbstractSparkAction {
log.info("actionSetId: '{}'", actionSetId);
log.info("workingPath: '{}'", workingPath);
final String dfLogPath = parser.get("dataframeLog");
final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN");
// for each dedup configuration
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
@ -93,7 +91,6 @@ public class SparkCreateSimRels extends AbstractSparkAction {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<String, MapDocument> mapDocuments = sc
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
.repartition(numPartitions)
@ -120,7 +117,8 @@ public class SparkCreateSimRels extends AbstractSparkAction {
saveParquet(simRels, outputPath, SaveMode.Overwrite);
final long end = System.currentTimeMillis();
if (StringUtils.isNotBlank(dfLogPath)) {
final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(),entity, start, end, end-start);
final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(), entity, start, end,
end - start);
new DedupLogWriter(dfLogPath).appendLog(model, spark);
}

View File

@ -134,7 +134,8 @@
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>15000</arg>
</spark>
<ok to="WhitelistSimRels"/>
<!-- <ok to="WhitelistSimRels"/>-->
<ok to="End"/>
<error to="Kill"/>
</action>