Skip to content

Commit 6b88825

Browse files
committed
[SPARK-5604][MLLIB] remove checkpointDir from trees
This is the second part of SPARK-5604, which removes checkpointDir from tree strategies. Note that this is a breaking change; I will mention it in the migration guide. Author: Xiangrui Meng <[email protected]> Closes apache#4407 from mengxr/SPARK-5604-1 and squashes the following commits: 13a276d [Xiangrui Meng] remove checkpointDir from trees
1 parent 7dc4965 commit 6b88825

File tree

4 files changed

+6
-20
lines changed

4 files changed

+6
-20
lines changed

examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,8 @@ object DecisionTreeRunner {
272272
case Variance => impurity.Variance
273273
}
274274

275+
params.checkpointDir.foreach(sc.setCheckpointDir)
276+
275277
val strategy
276278
= new Strategy(
277279
algo = params.algo,
@@ -282,7 +284,6 @@ object DecisionTreeRunner {
282284
minInstancesPerNode = params.minInstancesPerNode,
283285
minInfoGain = params.minInfoGain,
284286
useNodeIdCache = params.useNodeIdCache,
285-
checkpointDir = params.checkpointDir,
286287
checkpointInterval = params.checkpointInterval)
287288
if (params.numTrees == 1) {
288289
val startTime = System.nanoTime()

mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,6 @@ private class RandomForest (
204204
Some(NodeIdCache.init(
205205
data = baggedInput,
206206
numTrees = numTrees,
207-
checkpointDir = strategy.checkpointDir,
208207
checkpointInterval = strategy.checkpointInterval,
209208
initVal = 1))
210209
} else {

mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,10 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
6262
* @param subsamplingRate Fraction of the training data used for learning decision tree.
6363
* @param useNodeIdCache If this is true, instead of passing trees to executors, the algorithm will
6464
* maintain a separate RDD of node Id cache for each row.
65-
* @param checkpointDir If the node Id cache is used, it will help to checkpoint
66-
* the node Id cache periodically. This is the checkpoint directory
67-
* to be used for the node Id cache.
6865
* @param checkpointInterval How often to checkpoint when the node Id cache gets updated.
69-
* E.g. 10 means that the cache will get checkpointed every 10 updates.
66+
* E.g. 10 means that the cache will get checkpointed every 10 updates. If
67+
* the checkpoint directory is not set in
68+
* [[org.apache.spark.SparkContext]], this setting is ignored.
7069
*/
7170
@Experimental
7271
class Strategy (
@@ -82,7 +81,6 @@ class Strategy (
8281
@BeanProperty var maxMemoryInMB: Int = 256,
8382
@BeanProperty var subsamplingRate: Double = 1,
8483
@BeanProperty var useNodeIdCache: Boolean = false,
85-
@BeanProperty var checkpointDir: Option[String] = None,
8684
@BeanProperty var checkpointInterval: Int = 10) extends Serializable {
8785

8886
def isMulticlassClassification =
@@ -165,7 +163,7 @@ class Strategy (
165163
def copy: Strategy = {
166164
new Strategy(algo, impurity, maxDepth, numClasses, maxBins,
167165
quantileCalculationStrategy, categoricalFeaturesInfo, minInstancesPerNode, minInfoGain,
168-
maxMemoryInMB, subsamplingRate, useNodeIdCache, checkpointDir, checkpointInterval)
166+
maxMemoryInMB, subsamplingRate, useNodeIdCache, checkpointInterval)
169167
}
170168
}
171169

mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,12 @@ private[tree] case class NodeIndexUpdater(
7171
* The nodeIdsForInstances RDD needs to be updated at each iteration.
7272
* @param nodeIdsForInstances The initial values in the cache
7373
* (should be an Array of all 1's (meaning the root nodes)).
74-
* @param checkpointDir The checkpoint directory where
75-
* the checkpointed files will be stored.
7674
* @param checkpointInterval The checkpointing interval
7775
* (how often should the cache be checkpointed.).
7876
*/
7977
@DeveloperApi
8078
private[tree] class NodeIdCache(
8179
var nodeIdsForInstances: RDD[Array[Int]],
82-
val checkpointDir: Option[String],
8380
val checkpointInterval: Int) {
8481

8582
// Keep a reference to a previous node Ids for instances.
@@ -91,12 +88,6 @@ private[tree] class NodeIdCache(
9188
private val checkpointQueue = mutable.Queue[RDD[Array[Int]]]()
9289
private var rddUpdateCount = 0
9390

94-
// If a checkpoint directory is given, and there's no prior checkpoint directory,
95-
// then set the checkpoint directory with the given one.
96-
if (checkpointDir.nonEmpty && nodeIdsForInstances.sparkContext.getCheckpointDir.isEmpty) {
97-
nodeIdsForInstances.sparkContext.setCheckpointDir(checkpointDir.get)
98-
}
99-
10091
/**
10192
* Update the node index values in the cache.
10293
* This updates the RDD and its lineage.
@@ -184,7 +175,6 @@ private[tree] object NodeIdCache {
184175
* Initialize the node Id cache with initial node Id values.
185176
* @param data The RDD of training rows.
186177
* @param numTrees The number of trees that we want to create cache for.
187-
* @param checkpointDir The checkpoint directory where the checkpointed files will be stored.
188178
* @param checkpointInterval The checkpointing interval
189179
* (how often should the cache be checkpointed.).
190180
* @param initVal The initial values in the cache.
@@ -193,12 +183,10 @@ private[tree] object NodeIdCache {
193183
def init(
194184
data: RDD[BaggedPoint[TreePoint]],
195185
numTrees: Int,
196-
checkpointDir: Option[String],
197186
checkpointInterval: Int,
198187
initVal: Int = 1): NodeIdCache = {
199188
new NodeIdCache(
200189
data.map(_ => Array.fill[Int](numTrees)(initVal)),
201-
checkpointDir,
202190
checkpointInterval)
203191
}
204192
}

0 commit comments

Comments (0)