forked from D-Net/dnet-hadoop

fixed property mapping creating the RelatedEntity transient objects. spark cores & memory adjustments. Code formatting

This commit is contained in:
parent 711048ceed
commit b4e3389432
@@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
 				result
 					.getTitle()
 					.stream()
+					.filter(t -> StringUtils.isNotBlank(t.getValue()))
 					.findFirst()
-					.map(StructuredProperty::getValue)
 					.ifPresent(
-						title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
+						title -> {
+							re.setTitle(title);
+							re
+								.getTitle()
+								.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
+						});
 			}
 			if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
 				result
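Note on the hunk above: the previous chain mapped each title to its raw String value and then called setValue on re.getTitle(), a title that, judging by the fix, had not yet been assigned to the RelatedEntity, so the mapping could not populate it. The new chain keeps the whole StructuredProperty, skips blank values, assigns the object to the entity first, and only then truncates its value to ModelHardLimits.MAX_TITLE_LENGTH. Below is a minimal stand-alone sketch of the same pattern; Title, Entity and MAX_TITLE_LENGTH are simplified stand-ins, not the project's classes.

    import java.util.List;

    class TitleMappingSketch {

        static final int MAX_TITLE_LENGTH = 255; // assumed limit, in place of ModelHardLimits.MAX_TITLE_LENGTH

        static class Title {
            String value;
            Title(String value) { this.value = value; }
        }

        static class Entity {
            Title title;
        }

        static void mapTitle(List<Title> titles, Entity re) {
            titles
                .stream()
                .filter(t -> t.value != null && !t.value.trim().isEmpty()) // skip blank titles, as in the patch
                .findFirst()
                .ifPresent(title -> {
                    re.title = title;                                      // assign the object first...
                    re.title.value = title.value                           // ...then cap its length
                        .substring(0, Math.min(title.value.length(), MAX_TITLE_LENGTH));
                });
        }

        public static void main(String[] args) {
            Entity re = new Entity();
            mapTitle(List.of(new Title("  "), new Title("A usable title")), re);
            System.out.println(re.title.value); // prints "A usable title"
        }
    }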
@@ -1,14 +1,15 @@

 package eu.dnetlib.dhp.oa.provision;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Sets;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static org.apache.spark.sql.functions.col;
+
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Encoders;
@@ -20,12 +21,15 @@ import org.apache.spark.sql.functions;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.HashSet;
-import java.util.Optional;
-import java.util.Set;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Relation;

 /**
  * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
@@ -126,16 +130,22 @@ public class PrepareRelationsJob {
 			.partitionBy("target", "subRelType")
 			.orderBy(col("source").desc_nulls_last());

-		spark.read().schema(Encoders.bean(Relation.class).schema()).json(inputRelationsPath)
+		spark
+			.read()
+			.schema(Encoders.bean(Relation.class).schema())
+			.json(inputRelationsPath)
 			.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
 			.where("datainfo.deletedbyinference != true")
-			.where(relationFilter.isEmpty() ? "" : "lower(relClass) NOT IN ("+ Joiner.on(',').join(relationFilter) +")")
+			.where(
+				relationFilter.isEmpty() ? ""
+					: "lower(relClass) NOT IN ("
+						+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
 			.withColumn("source_w_pos", functions.row_number().over(source_w))
-			.where("source_w_pos < " + sourceMaxRelations )
+			.where("source_w_pos < " + sourceMaxRelations)
 			.drop("source_w_pos")
 			.withColumn("target_w_pos", functions.row_number().over(target_w))
 			.where("target_w_pos < " + targetMaxRelations)
-			.drop( "target_w_pos")
+			.drop("target_w_pos")
 			.coalesce(relPartitions)
 			.write()
 			.mode(SaveMode.Overwrite)
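The relClass filter is the substantive change in this hunk: the old expression joined the raw filter values with Joiner.on(','), producing a predicate whose values are bare identifiers rather than SQL string literals, while the new expression wraps each value in single quotes before joining. A rough stand-alone sketch of the two constructions; the sample values are illustrative and String.join stands in for Guava's Joiner.

    import java.util.List;
    import java.util.stream.Collectors;

    class RelationFilterSketch {
        public static void main(String[] args) {
            // Illustrative filter values; the real ones arrive via the --relationFilter argument.
            List<String> relationFilter = List.of("merges", "ismergedin");

            // Old construction: values are joined as-is, so Spark SQL sees bare identifiers.
            String unquoted = "lower(relClass) NOT IN (" + String.join(",", relationFilter) + ")";

            // New construction: each value becomes a quoted string literal.
            String quoted = "lower(relClass) NOT IN ("
                + relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")";

            System.out.println(unquoted); // lower(relClass) NOT IN (merges,ismergedin)
            System.out.println(quoted);   // lower(relClass) NOT IN ('merges','ismergedin')
        }
    }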
@@ -144,21 +144,23 @@
			<class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class>
			<jar>dhp-graph-provision-${projectVersion}.jar</jar>
			<spark-opts>
-				--executor-cores=${sparkExecutorCoresForJoining}
-				--executor-memory=${sparkExecutorMemoryForJoining}
+				--executor-cores=4
+				--executor-memory=6G
				--driver-memory=${sparkDriverMemoryForJoining}
+				--conf spark.executor.memoryOverhead=6G
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-				--conf spark.sql.shuffle.partitions=3840
+				--conf spark.sql.shuffle.partitions=15000
+				--conf spark.network.timeout=${sparkNetworkTimeout}
			</spark-opts>
			<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
			<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
			<arg>--sourceMaxRelations</arg><arg>${sourceMaxRelations}</arg>
			<arg>--targetMaxRelations</arg><arg>${targetMaxRelations}</arg>
			<arg>--relationFilter</arg><arg>${relationFilter}</arg>
-			<arg>--relPartitions</arg><arg>5000</arg>
+			<arg>--relPartitions</arg><arg>15000</arg>
			</spark>
			<ok to="fork_join_related_entities"/>
			<error to="Kill"/>
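The workflow hunk (the Oozie action that launches PrepareRelationsJob) pins the executor resources to 4 cores and 6G of memory instead of the workflow-level joining parameters, adds a 6G spark.executor.memoryOverhead, raises spark.sql.shuffle.partitions from 3840 to 15000 in line with the new --relPartitions value, and passes spark.network.timeout through from configuration. For illustration only, here is a sketch of the same settings expressed on a SparkConf; the workflow actually supplies them via <spark-opts>, and the timeout value below is a placeholder for ${sparkNetworkTimeout}.

    import org.apache.spark.SparkConf;

    class PrepareRelationsConfSketch {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf()
                .setAppName("PrepareRelations")                  // illustrative app name
                .set("spark.executor.cores", "4")
                .set("spark.executor.memory", "6g")
                .set("spark.executor.memoryOverhead", "6g")      // off-heap headroom per executor container
                .set("spark.sql.shuffle.partitions", "15000")    // aligned with --relPartitions 15000
                .set("spark.network.timeout", "600s");           // placeholder for ${sparkNetworkTimeout}

            // Each executor container on YARN now requests roughly 6g heap + 6g overhead.
            for (scala.Tuple2<String, String> e : conf.getAll()) {
                System.out.println(e._1() + "=" + e._2());
            }
        }
    }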