
Fixed the property mapping used to create the RelatedEntity transient objects; Spark cores & memory adjustments; code formatting.

This commit is contained in:
Claudio Atzori 2024-05-07 16:25:17 +02:00
parent 711048ceed
commit b4e3389432
3 changed files with 54 additions and 37 deletions

View File

@@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
result
.getTitle()
.stream()
.filter(t -> StringUtils.isNotBlank(t.getValue()))
.findFirst()
.map(StructuredProperty::getValue)
.ifPresent(
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
title -> {
re.setTitle(title);
re
.getTitle()
.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
});
}
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
result
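
For clarity, the change in this hunk: the old lambda mapped the title down to a plain String and then called re.getTitle().setValue(...) on a title that had never been assigned to the freshly built RelatedEntity transient object; the new code keeps the StructuredProperty, stores it on the RelatedEntity first, and only then truncates its value. Below is a minimal sketch of the new behaviour, assuming the imports and model classes already present in CreateRelatedEntitiesJob_phase1; the helper name mapTitle and the surrounding null/empty guard are illustrative (the guard mirrors the getDescription check visible above), not a claim about the project's exact structure.

// Sketch only, not the project code as-is.
private static void mapTitle(final Result result, final RelatedEntity re) {
    if (Objects.nonNull(result.getTitle()) && !result.getTitle().isEmpty()) {
        result
            .getTitle()
            .stream()
            .filter(t -> StringUtils.isNotBlank(t.getValue()))
            .findFirst()
            .ifPresent(title -> {
                // keep the whole StructuredProperty on the transient object ...
                re.setTitle(title);
                // ... then cap its value to the model hard limit
                re.getTitle().setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
            });
    }
}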

View File

@@ -1,14 +1,15 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.col;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
@@ -20,12 +21,15 @@ import org.apache.spark.sql.functions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Sets;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.col;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
@@ -119,27 +123,33 @@ public class PrepareRelationsJob {
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
WindowSpec source_w = Window
.partitionBy("source", "subRelType")
.orderBy(col("target").desc_nulls_last());
.partitionBy("source", "subRelType")
.orderBy(col("target").desc_nulls_last());
WindowSpec target_w = Window
.partitionBy("target", "subRelType")
.orderBy(col("source").desc_nulls_last());
.partitionBy("target", "subRelType")
.orderBy(col("source").desc_nulls_last());
spark.read().schema(Encoders.bean(Relation.class).schema()).json(inputRelationsPath)
.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
.where("datainfo.deletedbyinference != true")
.where(relationFilter.isEmpty() ? "" : "lower(relClass) NOT IN ("+ Joiner.on(',').join(relationFilter) +")")
.withColumn("source_w_pos", functions.row_number().over(source_w))
.where("source_w_pos < " + sourceMaxRelations )
.drop("source_w_pos")
.withColumn("target_w_pos", functions.row_number().over(target_w))
.where("target_w_pos < " + targetMaxRelations)
.drop( "target_w_pos")
.coalesce(relPartitions)
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
spark
.read()
.schema(Encoders.bean(Relation.class).schema())
.json(inputRelationsPath)
.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
.where("datainfo.deletedbyinference != true")
.where(
relationFilter.isEmpty() ? ""
: "lower(relClass) NOT IN ("
+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
.withColumn("source_w_pos", functions.row_number().over(source_w))
.where("source_w_pos < " + sourceMaxRelations)
.drop("source_w_pos")
.withColumn("target_w_pos", functions.row_number().over(target_w))
.where("target_w_pos < " + targetMaxRelations)
.drop("target_w_pos")
.coalesce(relPartitions)
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
private static void removeOutputDir(SparkSession spark, String path) {
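
Besides the re-indentation, the functional change in this method is the relClass blacklist: the values are now injected into the SQL predicate as quoted string literals, whereas the previous Joiner.on(',') produced bare tokens that Spark SQL would try to resolve as column references. The row_number() pruning over the (source, subRelType) and (target, subRelType) windows is unchanged. A small, self-contained sketch of the quoting follows; the relation type values are made up for illustration.

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.stream.Collectors;

public class RelationFilterClauseExample {
    public static void main(String[] args) {
        // hypothetical blacklist values, lower-cased as the predicate expects
        Set<String> relationFilter = new LinkedHashSet<>(Arrays.asList("ispartof", "merges"));

        // old: "lower(relClass) NOT IN (" + Joiner.on(',').join(relationFilter) + ")"
        //      -> lower(relClass) NOT IN (ispartof,merges)      (bare identifiers)
        // new: every value wrapped in single quotes
        String clause = relationFilter.isEmpty()
            ? ""
            : "lower(relClass) NOT IN ("
                + relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(","))
                + ")";

        System.out.println(clause); // lower(relClass) NOT IN ('ispartof','merges')
    }
}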

View File

@@ -144,21 +144,23 @@
<class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--executor-cores=4
--executor-memory=6G
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=6G
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=15000
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
<arg>--sourceMaxRelations</arg><arg>${sourceMaxRelations}</arg>
<arg>--targetMaxRelations</arg><arg>${targetMaxRelations}</arg>
<arg>--relationFilter</arg><arg>${relationFilter}</arg>
<arg>--relPartitions</arg><arg>5000</arg>
<arg>--relPartitions</arg><arg>15000</arg>
</spark>
<ok to="fork_join_related_entities"/>
<error to="Kill"/>
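
On the resource side, this action now pins its executors to 4 cores and 6G of heap plus an explicit 6G of spark.executor.memoryOverhead, instead of reusing the shared ${sparkExecutorCoresForJoining}/${sparkExecutorMemoryForJoining} parameters, and the parallelism grows from 3840 shuffle partitions / 5000 relPartitions to 15000 for both. Purely as an illustration of the same values in Spark's programmatic API, not how the oozie action actually sets them:

import org.apache.spark.SparkConf;

// Illustration only: equivalent of the --executor-* and --conf options above.
SparkConf conf = new SparkConf()
    .set("spark.executor.cores", "4")
    .set("spark.executor.memory", "6G")
    .set("spark.executor.memoryOverhead", "6G")    // explicit off-heap headroom for the shuffle-heavy window operations
    .set("spark.sql.shuffle.partitions", "15000"); // raised from 3840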