forked from D-Net/dnet-hadoop

fixed property mapping creating the RelatedEntity transient objects. spark cores & memory adjustments. Code formatting

This commit is contained in:
parent 711048ceed
commit b4e3389432
@@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
 				result
 					.getTitle()
 					.stream()
+					.filter(t -> StringUtils.isNotBlank(t.getValue()))
 					.findFirst()
-					.map(StructuredProperty::getValue)
 					.ifPresent(
-						title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
+						title -> {
+							re.setTitle(title);
+							re
+								.getTitle()
+								.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
+						});
 			}
 			if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
 				result
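Note on the hunk above: the previous chain mapped each title to its raw String value and then called setValue on re.getTitle(), a title that, judging by the fix, had not yet been assigned to the RelatedEntity, so the mapping could not populate it. The new chain keeps the whole StructuredProperty, skips blank values, assigns the object to the entity first, and only then truncates its value to ModelHardLimits.MAX_TITLE_LENGTH. Below is a minimal stand-alone sketch of the same pattern; Title, Entity and MAX_TITLE_LENGTH are simplified stand-ins, not the project's classes.

    import java.util.List;

    class TitleMappingSketch {

        static final int MAX_TITLE_LENGTH = 255; // assumed limit, in place of ModelHardLimits.MAX_TITLE_LENGTH

        static class Title {
            String value;
            Title(String value) { this.value = value; }
        }

        static class Entity {
            Title title;
        }

        static void mapTitle(List<Title> titles, Entity re) {
            titles
                .stream()
                .filter(t -> t.value != null && !t.value.trim().isEmpty()) // skip blank titles, as in the patch
                .findFirst()
                .ifPresent(title -> {
                    re.title = title;                                      // assign the object first...
                    re.title.value = title.value                           // ...then cap its length
                        .substring(0, Math.min(title.value.length(), MAX_TITLE_LENGTH));
                });
        }

        public static void main(String[] args) {
            Entity re = new Entity();
            mapTitle(List.of(new Title("  "), new Title("A usable title")), re);
            System.out.println(re.title.value); // prints "A usable title"
        }
    }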
@@ -1,14 +1,15 @@

 package eu.dnetlib.dhp.oa.provision;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Sets;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static org.apache.spark.sql.functions.col;
+
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Encoders;
@@ -20,12 +21,15 @@ import org.apache.spark.sql.functions;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.HashSet;
-import java.util.Optional;
-import java.util.Set;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Relation;

 /**
  * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
@@ -126,16 +130,22 @@ public class PrepareRelationsJob {
 			.partitionBy("target", "subRelType")
 			.orderBy(col("source").desc_nulls_last());

-		spark.read().schema(Encoders.bean(Relation.class).schema()).json(inputRelationsPath)
+		spark
+			.read()
+			.schema(Encoders.bean(Relation.class).schema())
+			.json(inputRelationsPath)
 			.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
 			.where("datainfo.deletedbyinference != true")
-			.where(relationFilter.isEmpty() ? "" : "lower(relClass) NOT IN ("+ Joiner.on(',').join(relationFilter) +")")
+			.where(
+				relationFilter.isEmpty() ? ""
+					: "lower(relClass) NOT IN ("
+						+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
 			.withColumn("source_w_pos", functions.row_number().over(source_w))
-			.where("source_w_pos < " + sourceMaxRelations )
+			.where("source_w_pos < " + sourceMaxRelations)
 			.drop("source_w_pos")
 			.withColumn("target_w_pos", functions.row_number().over(target_w))
 			.where("target_w_pos < " + targetMaxRelations)
-			.drop( "target_w_pos")
+			.drop("target_w_pos")
 			.coalesce(relPartitions)
 			.write()
 			.mode(SaveMode.Overwrite)
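The relClass filter is the substantive change in this hunk: the old expression joined the raw filter values with Joiner.on(','), producing a predicate whose values are bare identifiers rather than SQL string literals, while the new expression wraps each value in single quotes before joining. A rough stand-alone sketch of the two constructions; the sample values are illustrative and String.join stands in for Guava's Joiner.

    import java.util.List;
    import java.util.stream.Collectors;

    class RelationFilterSketch {
        public static void main(String[] args) {
            // Illustrative filter values; the real ones arrive via the --relationFilter argument.
            List<String> relationFilter = List.of("merges", "ismergedin");

            // Old construction: values are joined as-is, so Spark SQL sees bare identifiers.
            String unquoted = "lower(relClass) NOT IN (" + String.join(",", relationFilter) + ")";

            // New construction: each value becomes a quoted string literal.
            String quoted = "lower(relClass) NOT IN ("
                + relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")";

            System.out.println(unquoted); // lower(relClass) NOT IN (merges,ismergedin)
            System.out.println(quoted);   // lower(relClass) NOT IN ('merges','ismergedin')
        }
    }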
@@ -144,21 +144,23 @@
			<class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class>
			<jar>dhp-graph-provision-${projectVersion}.jar</jar>
			<spark-opts>
-				--executor-cores=${sparkExecutorCoresForJoining}
-				--executor-memory=${sparkExecutorMemoryForJoining}
+				--executor-cores=4
+				--executor-memory=6G
				--driver-memory=${sparkDriverMemoryForJoining}
+				--conf spark.executor.memoryOverhead=6G
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.sql.shuffle.partitions=3840
+				--conf spark.sql.shuffle.partitions=15000
+				--conf spark.network.timeout=${sparkNetworkTimeout}
			</spark-opts>
			<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
			<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
			<arg>--sourceMaxRelations</arg><arg>${sourceMaxRelations}</arg>
			<arg>--targetMaxRelations</arg><arg>${targetMaxRelations}</arg>
			<arg>--relationFilter</arg><arg>${relationFilter}</arg>
-			<arg>--relPartitions</arg><arg>5000</arg>
+			<arg>--relPartitions</arg><arg>15000</arg>
			</spark>
			<ok to="fork_join_related_entities"/>
			<error to="Kill"/>
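The workflow hunk (the Oozie action that launches PrepareRelationsJob) pins the executor resources to 4 cores and 6G of memory instead of the workflow-level joining parameters, adds a 6G spark.executor.memoryOverhead, raises spark.sql.shuffle.partitions from 3840 to 15000 in line with the new --relPartitions value, and passes spark.network.timeout through from configuration. For illustration only, here is a sketch of the same settings expressed on a SparkConf; the workflow actually supplies them via <spark-opts>, and the timeout value below is a placeholder for ${sparkNetworkTimeout}.

    import org.apache.spark.SparkConf;

    class PrepareRelationsConfSketch {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf()
                .setAppName("PrepareRelations")                  // illustrative app name
                .set("spark.executor.cores", "4")
                .set("spark.executor.memory", "6g")
                .set("spark.executor.memoryOverhead", "6g")      // off-heap headroom per executor container
                .set("spark.sql.shuffle.partitions", "15000")    // aligned with --relPartitions 15000
                .set("spark.network.timeout", "600s");           // placeholder for ${sparkNetworkTimeout}

            // Each executor container on YARN now requests roughly 6g heap + 6g overhead.
            for (scala.Tuple2<String, String> e : conf.getAll()) {
                System.out.println(e._1() + "=" + e._2());
            }
        }
    }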