From e74831f355bc21d61ad80c1b701359773b2a4edf Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Tue, 5 Jan 2016 11:08:23 +0100 Subject: [PATCH 1/5] [SPARK-12630][DOC] Update param descriptions Makes the `param` descriptions consistent. See [SPARK-11219] for more details. --- python/pyspark/mllib/classification.py | 255 ++++++++++++++----------- 1 file changed, 142 insertions(+), 113 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 9e6f17ef6e942..b32a062fda8ab 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -94,16 +94,18 @@ class LogisticRegressionModel(LinearClassificationModel): Classification model trained using Multinomial/Binary Logistic Regression. - :param weights: Weights computed for every feature. - :param intercept: Intercept computed for this model. (Only used - in Binary Logistic Regression. In Multinomial Logistic - Regression, the intercepts will not be a single value, - so the intercepts will be part of the weights.) - :param numFeatures: the dimension of the features. - :param numClasses: the number of possible outcomes for k classes - classification problem in Multinomial Logistic Regression. - By default, it is binary logistic regression so numClasses - will be set to 2. + :param weights: + Weights computed for every feature. + :param intercept: + Intercept computed for this model. (Only used in Binary Logistic + Regression. In Multinomial Logistic Regression, the intercepts will not + be a single value, so the intercepts will be part of the weights.) + :param numFeatures: + the dimension of the features. + :param numClasses: + the number of possible outcomes for k classes classification problem in + Multinomial Logistic Regression. By default, it is binary logistic + regression so numClasses will be set to 2. >>> data = [ ... 
LabeledPoint(0.0, [0.0, 1.0]), @@ -272,37 +274,42 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, """ Train a logistic regression model on the given data. - :param data: The training data, an RDD of - LabeledPoint. - :param iterations: The number of iterations - (default: 100). - :param step: The step parameter used in SGD - (default: 1.0). - :param miniBatchFraction: Fraction of data to be used for each - SGD iteration (default: 1.0). - :param initialWeights: The initial weights (default: None). - :param regParam: The regularizer parameter - (default: 0.01). - :param regType: The type of regularizer used for - training our model. - - :Allowed values: - - "l1" for using L1 regularization - - "l2" for using L2 regularization - - None for no regularization - - (default: "l2") - - :param intercept: Boolean parameter which indicates the - use or not of the augmented representation - for training data (i.e. whether bias - features are activated or not, - default: False). - :param validateData: Boolean parameter which indicates if - the algorithm should validate data - before training. (default: True) - :param convergenceTol: A condition which decides iteration termination. - (default: 0.001) + :param data: + The training data, an RDD of LabeledPoint. + :param iterations: + The number of iterations. + (default: 100) + :param step: + The step parameter used in SGD. + (default: 1.0) + :param miniBatchFraction: + Fraction of data to be used for each SGD iteration. + (default: 1.0) + :param initialWeights: + The initial weights. + (default: None) + :param regParam: + The regularizer parameter. + (default: 0.01) + :param regType: + The type of regularizer used for training our model. + :Allowed values: + - "l1" for using L1 regularization + - "l2" for using L2 regularization + - None for no regularization + (default: "l2") + :param intercept: + Boolean parameter which indicates the use or not of the augmented + representation for training data (i.e. 
whether bias features are + activated or not). + (default: False) + :param validateData: + Boolean parameter which indicates if the algorithm should validate + data before training. + (default: True) + :param convergenceTol: + A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations), @@ -323,38 +330,45 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType """ Train a logistic regression model on the given data. - :param data: The training data, an RDD of - LabeledPoint. - :param iterations: The number of iterations - (default: 100). - :param initialWeights: The initial weights (default: None). - :param regParam: The regularizer parameter - (default: 0.01). - :param regType: The type of regularizer used for - training our model. - - :Allowed values: - - "l1" for using L1 regularization - - "l2" for using L2 regularization - - None for no regularization - - (default: "l2") - - :param intercept: Boolean parameter which indicates the - use or not of the augmented representation - for training data (i.e. whether bias - features are activated or not, - default: False). - :param corrections: The number of corrections used in the - LBFGS update (default: 10). - :param tolerance: The convergence tolerance of iterations - for L-BFGS (default: 1e-4). - :param validateData: Boolean parameter which indicates if the - algorithm should validate data before - training. (default: True) - :param numClasses: The number of classes (i.e., outcomes) a - label can take in Multinomial Logistic - Regression (default: 2). + :param data: + The training data, an RDD of LabeledPoint. + :param iterations: + The number of iterations + (default: 100). + :param initialWeights: + The initial weights + (default: None). + :param regParam: + The regularizer parameter + (default: 0.01). + :param regType: + The type of regularizer used for training our model. 
+ + :Allowed values: + - "l1" for using L1 regularization + - "l2" for using L2 regularization + - None for no regularization + (default: "l2") + + :param intercept: + Boolean parameter which indicates the use or not of the augmented + representation for training data (i.e. whether bias features are + activated or not) + (default: False) + :param corrections: + The number of corrections used in the LBFGS update. + (default: 10) + :param tolerance: + The convergence tolerance of iterations for L-BFGS. + (default: 1e-4) + :param validateData: + Boolean parameter which indicates if the algorithm should validate + data before training. + (default: True) + :param numClasses: + The number of classes (i.e., outcomes) a label can take in Multinomial + Logistic Regression + (default: 2). >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), @@ -387,8 +401,10 @@ class SVMModel(LinearClassificationModel): """ Model for Support Vector Machines (SVMs). - :param weights: Weights computed for every feature. - :param intercept: Intercept computed for this model. + :param weights: + Weights computed for every feature. + :param intercept: + Intercept computed for this model. >>> data = [ ... LabeledPoint(0.0, [0.0]), @@ -490,37 +506,45 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, """ Train a support vector machine on the given data. - :param data: The training data, an RDD of - LabeledPoint. - :param iterations: The number of iterations - (default: 100). - :param step: The step parameter used in SGD - (default: 1.0). - :param regParam: The regularizer parameter - (default: 0.01). - :param miniBatchFraction: Fraction of data to be used for each - SGD iteration (default: 1.0). - :param initialWeights: The initial weights (default: None). - :param regType: The type of regularizer used for - training our model. 
- - :Allowed values: - - "l1" for using L1 regularization - - "l2" for using L2 regularization - - None for no regularization - - (default: "l2") - - :param intercept: Boolean parameter which indicates the - use or not of the augmented representation - for training data (i.e. whether bias - features are activated or not, - default: False). - :param validateData: Boolean parameter which indicates if - the algorithm should validate data - before training. (default: True) - :param convergenceTol: A condition which decides iteration termination. - (default: 0.001) + :param data: + The training data, an RDD of LabeledPoint. + :param iterations: + The number of iterations. + (default: 100) + :param step: + The step parameter used in SGD. + (default: 1.0) + :param regParam: + The regularizer parameter. + (default: 0.01) + :param miniBatchFraction: + Fraction of data to be used for each SGD iteration. + (default: 1.0) + :param initialWeights: + The initial weights. + (default: None) + :param regType: + The type of regularizer used for training our model. + + :Allowed values: + - "l1" for using L1 regularization + - "l2" for using L2 regularization + - None for no regularization + + (default: "l2") + + :param intercept: + Boolean parameter which indicates the use or not of the augmented + representation for training data (i.e. whether bias features are + activated or not). + (default: False) + :param validateData: + Boolean parameter which indicates if the algorithm should validate + data before training. + (default: True) + :param convergenceTol: + A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step), @@ -536,11 +560,13 @@ class NaiveBayesModel(Saveable, Loader): """ Model for Naive Bayes classifiers. - :param labels: list of labels. - :param pi: log of class priors, whose dimension is C, - number of labels. 
- :param theta: log of class conditional probabilities, whose - dimension is C-by-D, where D is number of features. + :param labels: + list of labels. + :param pi: + log of class priors, whose dimension is C, number of labels. + :param theta: + log of class conditional probabilities, whose dimension is C-by-D, + where D is number of features. >>> data = [ ... LabeledPoint(0.0, [0.0, 0.0]), @@ -639,8 +665,11 @@ def train(cls, data, lambda_=1.0): it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). The input feature values must be nonnegative. - :param data: RDD of LabeledPoint. - :param lambda_: The smoothing parameter (default: 1.0). + :param data: + RDD of LabeledPoint. + :param lambda_: + The smoothing parameter. + (default: 1.0) """ first = data.first() if not isinstance(first, LabeledPoint): From 6cb46ca02e9a9384ba8ca357a0f2259750309106 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Wed, 6 Jan 2016 10:25:43 +0100 Subject: [PATCH 2/5] Style fixes - Style fixes based on review comments by @BryanCutler. - Changed fill-column to 100 instead of 80. --- python/pyspark/mllib/classification.py | 73 ++++++++++---------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index b32a062fda8ab..52d0afac82c13 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -91,21 +91,19 @@ def predict(self, test): class LogisticRegressionModel(LinearClassificationModel): """ - Classification model trained using Multinomial/Binary Logistic - Regression. + Classification model trained using Multinomial/Binary Logistic Regression. :param weights: Weights computed for every feature. :param intercept: - Intercept computed for this model. (Only used in Binary Logistic - Regression. In Multinomial Logistic Regression, the intercepts will not - be a single value, so the intercepts will be part of the weights.) 
+ Intercept computed for this model. (Only used in Binary Logistic Regression. In Multinomial + Logistic Regression, the intercepts will not be a single value, so the intercepts will be part + of the weights.) :param numFeatures: - the dimension of the features. + The dimension of the features. :param numClasses: - the number of possible outcomes for k classes classification problem in - Multinomial Logistic Regression. By default, it is binary logistic - regression so numClasses will be set to 2. + The number of possible outcomes for k classes classification problem in Multinomial Logistic + Regression. By default, it is binary logistic regression so numClasses will be set to 2. >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), @@ -299,13 +297,11 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - None for no regularization (default: "l2") :param intercept: - Boolean parameter which indicates the use or not of the augmented - representation for training data (i.e. whether bias features are - activated or not). + Boolean parameter which indicates the use or not of the augmented representation for + training data (i.e., whether bias features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate - data before training. + Boolean parameter which indicates if the algorithm should validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -333,42 +329,37 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType :param data: The training data, an RDD of LabeledPoint. :param iterations: - The number of iterations - (default: 100). + The number of iterations. + (default: 100) :param initialWeights: - The initial weights - (default: None). + The initial weights. + (default: None) :param regParam: - The regularizer parameter - (default: 0.01). + The regularizer parameter. 
+ (default: 0.01) :param regType: The type of regularizer used for training our model. - :Allowed values: - "l1" for using L1 regularization - "l2" for using L2 regularization - None for no regularization (default: "l2") - :param intercept: - Boolean parameter which indicates the use or not of the augmented - representation for training data (i.e. whether bias features are - activated or not) - (default: False) + Boolean parameter which indicates the use or not of the augmented representation for + training data (i.e., whether bias features are activated or not). + (default: False) :param corrections: - The number of corrections used in the LBFGS update. + The number of corrections used in the LBFGS update. (default: 10) :param tolerance: The convergence tolerance of iterations for L-BFGS. (default: 1e-4) :param validateData: - Boolean parameter which indicates if the algorithm should validate - data before training. + Boolean parameter which indicates if the algorithm should validate data before training. (default: True) :param numClasses: - The number of classes (i.e., outcomes) a label can take in Multinomial - Logistic Regression - (default: 2). + The number of classes (i.e., outcomes) a label can take in Multinomial Logistic Regression. + (default: 2) >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), @@ -525,22 +516,17 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, (default: None) :param regType: The type of regularizer used for training our model. - :Allowed values: - "l1" for using L1 regularization - "l2" for using L2 regularization - None for no regularization - (default: "l2") - :param intercept: - Boolean parameter which indicates the use or not of the augmented - representation for training data (i.e. whether bias features are - activated or not). + Boolean parameter which indicates the use or not of the augmented representation for + training data (i.e. whether bias features are activated or not). 
 (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate - data before training. + Boolean parameter which indicates if the algorithm should validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -561,12 +547,11 @@ class NaiveBayesModel(Saveable, Loader): Model for Naive Bayes classifiers. :param labels: - list of labels. + List of labels. :param pi: - log of class priors, whose dimension is C, number of labels. + Log of class priors, whose dimension is C, number of labels. :param theta: - log of class conditional probabilities, whose dimension is C-by-D, - where D is number of features. + Log of class conditional probabilities, whose dimension is C-by-D, where D is number of features. >>> data = [ ... LabeledPoint(0.0, [0.0, 0.0]), From cbd9d08ed87869ba637531f501897b5c24a4c39c Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Fri, 22 Jan 2016 15:07:57 +0100 Subject: [PATCH 3/5] Limit parameter descriptions to 74 columns --- python/pyspark/mllib/classification.py | 48 ++++++++++++++++---------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 52d0afac82c13..a46b97122b8b0 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -96,14 +96,16 @@ class LogisticRegressionModel(LinearClassificationModel): :param weights: Weights computed for every feature. :param intercept: - Intercept computed for this model. (Only used in Binary Logistic Regression. In Multinomial - Logistic Regression, the intercepts will not be a single value, so the intercepts will be part - of the weights.) + Intercept computed for this model. (Only used in Binary Logistic + Regression. In Multinomial Logistic Regression, the intercepts will + not be a single value, so the intercepts will be part of the + weights.) 
:param numFeatures: The dimension of the features. :param numClasses: - The number of possible outcomes for k classes classification problem in Multinomial Logistic - Regression. By default, it is binary logistic regression so numClasses will be set to 2. + The number of possible outcomes for k classes classification problem + in Multinomial Logistic Regression. By default, it is binary + logistic regression so numClasses will be set to 2. >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), @@ -297,11 +299,13 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - None for no regularization (default: "l2") :param intercept: - Boolean parameter which indicates the use or not of the augmented representation for - training data (i.e., whether bias features are activated or not). + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate data before training. + Boolean parameter which indicates if the algorithm should + validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -345,8 +349,9 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType - None for no regularization (default: "l2") :param intercept: - Boolean parameter which indicates the use or not of the augmented representation for - training data (i.e., whether bias features are activated or not). + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). (default: False) :param corrections: The number of corrections used in the LBFGS update. @@ -355,10 +360,12 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType The convergence tolerance of iterations for L-BFGS. 
(default: 1e-4) :param validateData: - Boolean parameter which indicates if the algorithm should validate data before training. + Boolean parameter which indicates if the algorithm should + validate data before training. (default: True) :param numClasses: - The number of classes (i.e., outcomes) a label can take in Multinomial Logistic Regression. + The number of classes (i.e., outcomes) a label can take in + Multinomial Logistic Regression. (default: 2) >>> data = [ @@ -522,11 +529,13 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, - None for no regularization (default: "l2") :param intercept: - Boolean parameter which indicates the use or not of the augmented representation for - training data (i.e. whether bias features are activated or not). + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate data before training. + Boolean parameter which indicates if the algorithm should + validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -551,7 +560,8 @@ class NaiveBayesModel(Saveable, Loader): :param pi: Log of class priors, whose dimension is C, number of labels. :param theta: - Log of class conditional probabilities, whose dimension is C-by-D, where D is number of features. + Log of class conditional probabilities, whose dimension is C-by-D, + where D is number of features. >>> data = [ ... LabeledPoint(0.0, [0.0, 0.0]), @@ -666,9 +676,9 @@ def train(cls, data, lambda_=1.0): @inherit_doc class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): """ - Train or predict a logistic regression model on streaming data. Training uses - Stochastic Gradient Descent to update the model based on each new batch of - incoming data from a DStream. 
+ Train or predict a logistic regression model on streaming data. + Training uses Stochastic Gradient Descent to update the model based on + each new batch of incoming data from a DStream. Each batch of data is assumed to be an RDD of LabeledPoints. The number of data points per batch can vary, but the number From bf8f8a0b82b49b9144449cd00872345dba3bf061 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Sat, 23 Jan 2016 08:15:19 +0100 Subject: [PATCH 4/5] A couple of more 74 column fixes --- python/pyspark/mllib/classification.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index a46b97122b8b0..64c016213f1df 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -91,7 +91,8 @@ def predict(self, test): class LogisticRegressionModel(LinearClassificationModel): """ - Classification model trained using Multinomial/Binary Logistic Regression. + Classification model trained using Multinomial/Binary Logistic + Regression. :param weights: Weights computed for every feature. @@ -191,8 +192,8 @@ def numFeatures(self): @since('1.4.0') def numClasses(self): """ - Number of possible outcomes for k classes classification problem in Multinomial - Logistic Regression. + Number of possible outcomes for k classes classification problem + in Multinomial Logistic Regression. 
""" return self._numClasses From 91dc609703bc012c32b636595b86e60eb56424a4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Fri, 12 Feb 2016 10:00:37 -0800 Subject: [PATCH 5/5] [SPARK-12630] Fixed 'Allowed values' formatting of reg param to get rid of Sphinx warnings --- python/pyspark/mllib/classification.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 64c016213f1df..b24592c3798e6 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -294,11 +294,11 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, (default: 0.01) :param regType: The type of regularizer used for training our model. - :Allowed values: + Allowed values: + - "l1" for using L1 regularization - - "l2" for using L2 regularization + - "l2" for using L2 regularization (default) - None for no regularization - (default: "l2") :param intercept: Boolean parameter which indicates the use or not of the augmented representation for training data (i.e., whether bias @@ -344,11 +344,11 @@ def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType (default: 0.01) :param regType: The type of regularizer used for training our model. - :Allowed values: + Allowed values: + - "l1" for using L1 regularization - - "l2" for using L2 regularization + - "l2" for using L2 regularization (default) - None for no regularization - (default: "l2") :param intercept: Boolean parameter which indicates the use or not of the augmented representation for training data (i.e., whether bias @@ -524,11 +524,11 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, (default: None) :param regType: The type of regularizer used for training our model. 
- :Allowed values: + Allowed values: + - "l1" for using L1 regularization - - "l2" for using L2 regularization + - "l2" for using L2 regularization (default) - None for no regularization - (default: "l2") :param intercept: Boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias