Skip to content

Commit bde9cc1

Browse files
mengxr authored and pwendell committed
[SPARK-1357] [MLLIB] Annotate developer and experimental APIs
Annotate developer and experimental APIs in MLlib. Author: Xiangrui Meng <[email protected]> Closes #298 from mengxr/api and squashes the following commits: 13390e8 [Xiangrui Meng] Merge branch 'master' into api dc4cbb3 [Xiangrui Meng] mark distributed matrices experimental 6b9f8e2 [Xiangrui Meng] add Experimental annotation 8773d0d [Xiangrui Meng] add DeveloperApi annotation da31733 [Xiangrui Meng] update developer and experimental tags 555e0fe [Xiangrui Meng] Merge branch 'master' into api ef1a717 [Xiangrui Meng] mark some constructors private add default parameters to JavaDoc 00ffbcc [Xiangrui Meng] update tree API annotation 0b674fa [Xiangrui Meng] mark decision tree APIs 86b9e34 [Xiangrui Meng] one pass over APIs of GLMs, NaiveBayes, and ALS f21d862 [Xiangrui Meng] Merge branch 'master' into api 2b133d6 [Xiangrui Meng] initial annotation of developer and experimental apis
1 parent 87bd1f9 commit bde9cc1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+355
-122
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.api.python
1919

2020
import java.nio.{ByteBuffer, ByteOrder}
2121

22+
import org.apache.spark.annotation.DeveloperApi
2223
import org.apache.spark.api.java.JavaRDD
2324
import org.apache.spark.mllib.classification._
2425
import org.apache.spark.mllib.clustering._
@@ -28,8 +29,11 @@ import org.apache.spark.mllib.regression._
2829
import org.apache.spark.rdd.RDD
2930

3031
/**
32+
* :: DeveloperApi ::
33+
*
3134
* The Java stubs necessary for the Python mllib bindings.
3235
*/
36+
@DeveloperApi
3337
class PythonMLLibAPI extends Serializable {
3438
private def deserializeDoubleVector(bytes: Array[Byte]): Array[Double] = {
3539
val packetLength = bytes.length

mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ class LogisticRegressionModel(
5555
this
5656
}
5757

58-
override def predictPoint(dataMatrix: Vector, weightMatrix: Vector,
58+
override protected def predictPoint(dataMatrix: Vector, weightMatrix: Vector,
5959
intercept: Double) = {
6060
val margin = weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept
6161
val score = 1.0/ (1.0 + math.exp(-margin))
@@ -71,27 +71,27 @@ class LogisticRegressionModel(
7171
* NOTE: Labels used in Logistic Regression should be {0, 1}
7272
*/
7373
class LogisticRegressionWithSGD private (
74-
var stepSize: Double,
75-
var numIterations: Int,
76-
var regParam: Double,
77-
var miniBatchFraction: Double)
74+
private var stepSize: Double,
75+
private var numIterations: Int,
76+
private var regParam: Double,
77+
private var miniBatchFraction: Double)
7878
extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {
7979

80-
val gradient = new LogisticGradient()
81-
val updater = new SimpleUpdater()
80+
private val gradient = new LogisticGradient()
81+
private val updater = new SimpleUpdater()
8282
override val optimizer = new GradientDescent(gradient, updater)
8383
.setStepSize(stepSize)
8484
.setNumIterations(numIterations)
8585
.setRegParam(regParam)
8686
.setMiniBatchFraction(miniBatchFraction)
87-
override val validators = List(DataValidators.classificationLabels)
87+
override protected val validators = List(DataValidators.binaryLabelValidator)
8888

8989
/**
9090
* Construct a LogisticRegression object with default parameters
9191
*/
9292
def this() = this(1.0, 100, 0.0, 1.0)
9393

94-
def createModel(weights: Vector, intercept: Double) = {
94+
override protected def createModel(weights: Vector, intercept: Double) = {
9595
new LogisticRegressionModel(weights, intercept)
9696
}
9797
}

mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala

Lines changed: 19 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.classification
1919

2020
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum}
2121

22+
import org.apache.spark.annotation.Experimental
2223
import org.apache.spark.{Logging, SparkContext}
2324
import org.apache.spark.SparkContext._
2425
import org.apache.spark.mllib.linalg.Vector
@@ -27,11 +28,16 @@ import org.apache.spark.mllib.util.MLUtils
2728
import org.apache.spark.rdd.RDD
2829

2930
/**
31+
* :: Experimental ::
32+
*
3033
* Model for Naive Bayes Classifiers.
3134
*
32-
* @param pi Log of class priors, whose dimension is C.
33-
* @param theta Log of class conditional probabilities, whose dimension is CxD.
35+
* @param labels list of labels
36+
* @param pi log of class priors, whose dimension is C, number of labels
37+
* @param theta log of class conditional probabilities, whose dimension is C-by-D,
38+
* where D is number of features
3439
*/
40+
@Experimental
3541
class NaiveBayesModel(
3642
val labels: Array[Double],
3743
val pi: Array[Double],
@@ -40,14 +46,17 @@ class NaiveBayesModel(
4046
private val brzPi = new BDV[Double](pi)
4147
private val brzTheta = new BDM[Double](theta.length, theta(0).length)
4248

43-
var i = 0
44-
while (i < theta.length) {
45-
var j = 0
46-
while (j < theta(i).length) {
47-
brzTheta(i, j) = theta(i)(j)
48-
j += 1
49+
{
50+
// Need to put an extra pair of braces to prevent Scala treating `i` as a member.
51+
var i = 0
52+
while (i < theta.length) {
53+
var j = 0
54+
while (j < theta(i).length) {
55+
brzTheta(i, j) = theta(i)(j)
56+
j += 1
57+
}
58+
i += 1
4959
}
50-
i += 1
5160
}
5261

5362
override def predict(testData: RDD[Vector]): RDD[Double] = testData.map(predict)
@@ -65,7 +74,7 @@ class NaiveBayesModel(
6574
* document classification. By making every vector a 0-1 vector, it can also be used as
6675
* Bernoulli NB ([[http://tinyurl.com/p7c96j6]]).
6776
*/
68-
class NaiveBayes private (var lambda: Double) extends Serializable with Logging {
77+
class NaiveBayes private (private var lambda: Double) extends Serializable with Logging {
6978

7079
def this() = this(1.0)
7180

mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,9 @@ class SVMModel(
5555
this
5656
}
5757

58-
override def predictPoint(dataMatrix: Vector, weightMatrix: Vector,
58+
override protected def predictPoint(
59+
dataMatrix: Vector,
60+
weightMatrix: Vector,
5961
intercept: Double) = {
6062
val margin = weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept
6163
threshold match {
@@ -70,28 +72,27 @@ class SVMModel(
7072
* NOTE: Labels used in SVM should be {0, 1}.
7173
*/
7274
class SVMWithSGD private (
73-
var stepSize: Double,
74-
var numIterations: Int,
75-
var regParam: Double,
76-
var miniBatchFraction: Double)
75+
private var stepSize: Double,
76+
private var numIterations: Int,
77+
private var regParam: Double,
78+
private var miniBatchFraction: Double)
7779
extends GeneralizedLinearAlgorithm[SVMModel] with Serializable {
7880

79-
val gradient = new HingeGradient()
80-
val updater = new SquaredL2Updater()
81+
private val gradient = new HingeGradient()
82+
private val updater = new SquaredL2Updater()
8183
override val optimizer = new GradientDescent(gradient, updater)
8284
.setStepSize(stepSize)
8385
.setNumIterations(numIterations)
8486
.setRegParam(regParam)
8587
.setMiniBatchFraction(miniBatchFraction)
86-
87-
override val validators = List(DataValidators.classificationLabels)
88+
override protected val validators = List(DataValidators.binaryLabelValidator)
8889

8990
/**
9091
* Construct a SVM object with default parameters
9192
*/
9293
def this() = this(1.0, 100, 1.0, 1.0)
9394

94-
def createModel(weights: Vector, intercept: Double) = {
95+
override protected def createModel(weights: Vector, intercept: Double) = {
9596
new SVMModel(weights, intercept)
9697
}
9798
}

mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala

Lines changed: 41 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ import scala.collection.mutable.ArrayBuffer
2121

2222
import breeze.linalg.{DenseVector => BDV, Vector => BV, norm => breezeNorm}
2323

24+
import org.apache.spark.annotation.Experimental
2425
import org.apache.spark.{Logging, SparkContext}
2526
import org.apache.spark.SparkContext._
2627
import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -37,12 +38,17 @@ import org.apache.spark.util.random.XORShiftRandom
3738
* to it should be cached by the user.
3839
*/
3940
class KMeans private (
40-
var k: Int,
41-
var maxIterations: Int,
42-
var runs: Int,
43-
var initializationMode: String,
44-
var initializationSteps: Int,
45-
var epsilon: Double) extends Serializable with Logging {
41+
private var k: Int,
42+
private var maxIterations: Int,
43+
private var runs: Int,
44+
private var initializationMode: String,
45+
private var initializationSteps: Int,
46+
private var epsilon: Double) extends Serializable with Logging {
47+
48+
/**
49+
* Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
50+
* initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4}.
51+
*/
4652
def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4)
4753

4854
/** Set the number of clusters to create (k). Default: 2. */
@@ -71,6 +77,8 @@ class KMeans private (
7177
}
7278

7379
/**
80+
* :: Experimental ::
81+
*
7482
* Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
7583
* this many times with random starting conditions (configured by the initialization mode), then
7684
* return the best clustering found over any run. Default: 1.
@@ -316,15 +324,36 @@ object KMeans {
316324
data: RDD[Vector],
317325
k: Int,
318326
maxIterations: Int,
319-
runs: Int = 1,
320-
initializationMode: String = K_MEANS_PARALLEL): KMeansModel = {
327+
runs: Int,
328+
initializationMode: String): KMeansModel = {
321329
new KMeans().setK(k)
322330
.setMaxIterations(maxIterations)
323331
.setRuns(runs)
324332
.setInitializationMode(initializationMode)
325333
.run(data)
326334
}
327335

336+
/**
337+
* Trains a k-means model using specified parameters and the default values for unspecified.
338+
*/
339+
def train(
340+
data: RDD[Vector],
341+
k: Int,
342+
maxIterations: Int): KMeansModel = {
343+
train(data, k, maxIterations, 1, K_MEANS_PARALLEL)
344+
}
345+
346+
/**
347+
* Trains a k-means model using specified parameters and the default values for unspecified.
348+
*/
349+
def train(
350+
data: RDD[Vector],
351+
k: Int,
352+
maxIterations: Int,
353+
runs: Int): KMeansModel = {
354+
train(data, k, maxIterations, runs, K_MEANS_PARALLEL)
355+
}
356+
328357
/**
329358
* Returns the index of the closest center to the given point, as well as the squared distance.
330359
*/
@@ -369,6 +398,10 @@ object KMeans {
369398
MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)
370399
}
371400

401+
/**
402+
* :: Experimental ::
403+
*/
404+
@Experimental
372405
def main(args: Array[String]) {
373406
if (args.length < 4) {
374407
println("Usage: KMeans <master> <input_file> <k> <max_iterations> [<runs>]")

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -64,11 +64,13 @@ trait Vector extends Serializable {
6464

6565
/**
6666
* Factory methods for [[org.apache.spark.mllib.linalg.Vector]].
67+
* We don't use the name `Vector` because Scala imports
68+
* [[scala.collection.immutable.Vector]] by default.
6769
*/
6870
object Vectors {
6971

7072
/**
71-
* Creates a dense vector.
73+
* Creates a dense vector from its values.
7274
*/
7375
@varargs
7476
def dense(firstValue: Double, otherValues: Double*): Vector =
@@ -158,20 +160,21 @@ class DenseVector(val values: Array[Double]) extends Vector {
158160
/**
159161
* A sparse vector represented by an index array and a value array.
160162
*
161-
* @param n size of the vector.
163+
* @param size size of the vector.
162164
* @param indices index array, assumed to be strictly increasing.
163165
* @param values value array, must have the same length as the index array.
164166
*/
165-
class SparseVector(val n: Int, val indices: Array[Int], val values: Array[Double]) extends Vector {
166-
167-
override def size: Int = n
167+
class SparseVector(
168+
override val size: Int,
169+
val indices: Array[Int],
170+
val values: Array[Double]) extends Vector {
168171

169172
override def toString: String = {
170-
"(" + n + "," + indices.zip(values).mkString("[", "," ,"]") + ")"
173+
"(" + size + "," + indices.zip(values).mkString("[", "," ,"]") + ")"
171174
}
172175

173176
override def toArray: Array[Double] = {
174-
val data = new Array[Double](n)
177+
val data = new Array[Double](size)
175178
var i = 0
176179
val nnz = indices.length
177180
while (i < nnz) {
@@ -181,5 +184,5 @@ class SparseVector(val n: Int, val indices: Array[Int], val values: Array[Double
181184
data
182185
}
183186

184-
private[mllib] override def toBreeze: BV[Double] = new BSV[Double](indices, values, n)
187+
private[mllib] override def toBreeze: BV[Double] = new BSV[Double](indices, values, size)
185188
}

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.linalg.distributed
1919

2020
import breeze.linalg.{DenseMatrix => BDM}
2121

22+
import org.apache.spark.annotation.Experimental
2223
import org.apache.spark.rdd.RDD
2324
import org.apache.spark.SparkContext._
2425
import org.apache.spark.mllib.linalg.Vectors
@@ -32,6 +33,8 @@ import org.apache.spark.mllib.linalg.Vectors
3233
case class MatrixEntry(i: Long, j: Long, value: Double)
3334

3435
/**
36+
* :: Experimental ::
37+
*
3538
* Represents a matrix in coordinate format.
3639
*
3740
* @param entries matrix entries
@@ -40,6 +43,7 @@ case class MatrixEntry(i: Long, j: Long, value: Double)
4043
* @param nCols number of columns. A non-positive value means unknown, and then the number of
4144
* columns will be determined by the max column index plus one.
4245
*/
46+
@Experimental
4347
class CoordinateMatrix(
4448
val entries: RDD[MatrixEntry],
4549
private var nRows: Long,

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/DistributedMatrix.scala

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,6 @@ package org.apache.spark.mllib.linalg.distributed
1919

2020
import breeze.linalg.{DenseMatrix => BDM}
2121

22-
import org.apache.spark.mllib.linalg.Matrix
23-
2422
/**
2523
* Represents a distributively stored matrix backed by one or more RDDs.
2624
*/

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,14 +19,22 @@ package org.apache.spark.mllib.linalg.distributed
1919

2020
import breeze.linalg.{DenseMatrix => BDM}
2121

22+
import org.apache.spark.annotation.Experimental
2223
import org.apache.spark.rdd.RDD
2324
import org.apache.spark.mllib.linalg._
2425
import org.apache.spark.mllib.linalg.SingularValueDecomposition
2526

26-
/** Represents a row of [[org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix]]. */
27+
/**
28+
* :: Experimental ::
29+
*
30+
* Represents a row of [[org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix]].
31+
*/
32+
@Experimental
2733
case class IndexedRow(index: Long, vector: Vector)
2834

2935
/**
36+
* :: Experimental ::
37+
*
3038
* Represents a row-oriented [[org.apache.spark.mllib.linalg.distributed.DistributedMatrix]] with
3139
* indexed rows.
3240
*
@@ -36,6 +44,7 @@ case class IndexedRow(index: Long, vector: Vector)
3644
* @param nCols number of columns. A non-positive value means unknown, and then the number of
3745
* columns will be determined by the size of the first row.
3846
*/
47+
@Experimental
3948
class IndexedRowMatrix(
4049
val rows: RDD[IndexedRow],
4150
private var nRows: Long,

0 commit comments

Comments
 (0)