forked from D-Net/dnet-hadoop
Repartition the join_entities output into 24k files (--numPartitions raised from 12000 to 24000)
This commit is contained in:
parent
2f1a623d09
commit
cfd753217c
|
@ -362,7 +362,7 @@
|
||||||
<arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
|
<arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
|
||||||
<arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
|
<arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/join_entities</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/join_entities</arg>
|
||||||
<arg>--numPartitions</arg><arg>12000</arg>
|
<arg>--numPartitions</arg><arg>24000</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="adjancency_lists"/>
|
<ok to="adjancency_lists"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -386,7 +386,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
--conf spark.network.timeout=${sparkNetworkTimeout}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg> <arg>${workingDir}/join_entities</arg>
|
<arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/joined</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/joined</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="convert_to_xml"/>
|
<ok to="convert_to_xml"/>
|
||||||
|
|
Loading…
Reference in New Issue