From 146c2a8b340f2269612a11469205aa10414cbb13 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Tue, 5 Jan 2016 12:08:34 +0100 Subject: [PATCH 1/4] [SPARK-12633][DOC] Update param descriptions Updates the param descriptions to be consistent. See [SPARK-11219] for more details. --- python/pyspark/mllib/regression.py | 209 +++++++++++++++++------------ 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 13b3397501c0b..26342f5abfc07 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -37,10 +37,11 @@ class LabeledPoint(object): """ Class that represents the features and labels of a data point. - :param label: Label for this data point. - :param features: Vector of features for this point (NumPy array, - list, pyspark.mllib.linalg.SparseVector, or scipy.sparse - column matrix) + :param label: + Label for this data point. + :param features: + Vector of features for this point (NumPy array, list, + pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix) Note: 'label' and 'features' are accessible as class attributes. @@ -66,8 +67,10 @@ class LinearModel(object): """ A linear model that has a vector of coefficients and an intercept. - :param weights: Weights computed for every feature. - :param intercept: Intercept computed for this model. + :param weights: + Weights computed for every feature. + :param intercept: + Intercept computed for this model. .. versionadded:: 0.9.0 """ @@ -245,37 +248,45 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, set of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: The training data, an RDD of - LabeledPoint. - :param iterations: The number of iterations - (default: 100). - :param step: The step parameter used in SGD - (default: 1.0). - :param miniBatchFraction: Fraction of data to be used for each - SGD iteration (default: 1.0). - :param initialWeights: The initial weights (default: None). - :param regParam: The regularizer parameter - (default: 0.0). - :param regType: The type of regularizer used for - training our model. - - :Allowed values: - - "l1" for using L1 regularization (lasso), - - "l2" for using L2 regularization (ridge), - - None for no regularization - - (default: None) - - :param intercept: Boolean parameter which indicates the - use or not of the augmented representation - for training data (i.e. whether bias - features are activated or not, - default: False). - :param validateData: Boolean parameter which indicates if - the algorithm should validate data - before training. (default: True) - :param convergenceTol: A condition which decides iteration termination. - (default: 0.001) + :param data: + The training data, an RDD of LabeledPoint. + :param iterations: + The number of iterations. + (default: 100) + :param step: + The step parameter used in SGD. + (default: 1.0) + :param miniBatchFraction: + Fraction of data to be used for each SGD iteration. + (default: 1.0) + :param initialWeights: + The initial weights. + (default: None) + :param regParam: + The regularizer parameter. + (default: 0.0) + :param regType: + The type of regularizer used for training our model. + + :Allowed values: + - "l1" for using L1 regularization (lasso), + - "l2" for using L2 regularization (ridge), + - None for no regularization + + (default: None) + + :param intercept: + Boolean parameter which indicates the use or not of the augmented + representation for training data (i.e., whether bias features are + activated or not). + (default: False) + :param validateData: + Boolean parameter which indicates if the algorithm should validate data + before training. + (default: True) + :param convergenceTol: + A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), @@ -393,27 +404,35 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, set of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: The training data, an RDD of - LabeledPoint. - :param iterations: The number of iterations - (default: 100). - :param step: The step parameter used in SGD - (default: 1.0). - :param regParam: The regularizer parameter - (default: 0.01). - :param miniBatchFraction: Fraction of data to be used for each - SGD iteration (default: 1.0). - :param initialWeights: The initial weights (default: None). - :param intercept: Boolean parameter which indicates the - use or not of the augmented representation - for training data (i.e. whether bias - features are activated or not, - default: False). - :param validateData: Boolean parameter which indicates if - the algorithm should validate data - before training. (default: True) - :param convergenceTol: A condition which decides iteration termination. - (default: 0.001) + :param data: + The training data, an RDD of LabeledPoint. + :param iterations: + The number of iterations. + (default: 100) + :param step: + The step parameter used in SGD. + (default: 1.0) + :param regParam: + The regularizer parameter. + (default: 0.01) + :param miniBatchFraction: + Fraction of data to be used for each SGD iteration. + (default: 1.0) + :param initialWeights: + The initial weights. + (default: None) + :param intercept: + Boolean parameter which indicates the use or not of the augmented + representation for training data (i.e. whether bias features are + activated or not). + (default: False) + :param validateData: + Boolean parameter which indicates if the algorithm should validate + data before training. + (default: True) + :param convergenceTol: + A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step), @@ -531,27 +550,35 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, set of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: The training data, an RDD of - LabeledPoint. - :param iterations: The number of iterations - (default: 100). - :param step: The step parameter used in SGD - (default: 1.0). - :param regParam: The regularizer parameter - (default: 0.01). - :param miniBatchFraction: Fraction of data to be used for each - SGD iteration (default: 1.0). - :param initialWeights: The initial weights (default: None). - :param intercept: Boolean parameter which indicates the - use or not of the augmented representation - for training data (i.e. whether bias - features are activated or not, - default: False). - :param validateData: Boolean parameter which indicates if - the algorithm should validate data - before training. (default: True) - :param convergenceTol: A condition which decides iteration termination. - (default: 0.001) + :param data: + The training data, an RDD of LabeledPoint. + :param iterations: + The number of iterations. + (default: 100) + :param step: + The step parameter used in SGD. + (default: 1.0) + :param regParam: + The regularizer parameter. + (default: 0.01) + :param miniBatchFraction: + Fraction of data to be used for each SGD iteration. + (default: 1.0) + :param initialWeights: + The initial weights. + (default: None) + :param intercept: + Boolean parameter which indicates the use or not of the augmented + representation for training data (i.e. whether bias features are + activated or not). + (default: False) + :param validateData: + Boolean parameter which indicates if the algorithm should validate + data before training. + (default: True) + :param convergenceTol: + A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step), @@ -566,12 +593,14 @@ class IsotonicRegressionModel(Saveable, Loader): """ Regression model for isotonic regression. - :param boundaries: Array of boundaries for which predictions are - known. Boundaries must be sorted in increasing order. - :param predictions: Array of predictions associated to the - boundaries at the same index. Results of isotonic - regression and therefore monotone. - :param isotonic: indicates whether this is isotonic or antitonic. + :param boundaries: + Array of boundaries for which predictions are known. Boundaries must be + sorted in increasing order. + :param predictions: + Array of predictions associated to the boundaries at the same index. + Results of isotonic regression and therefore monotone. + :param isotonic: + Indicates whether this is isotonic or antitonic. >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)] >>> irm = IsotonicRegression.train(sc.parallelize(data)) @@ -622,7 +651,8 @@ def predict(self, x): values with the same boundary then the same rules as in 2) are used. - :param x: Feature or RDD of Features to be labeled. + :param x: + Feature or RDD of Features to be labeled. """ if isinstance(x, RDD): return x.map(lambda v: self.predict(v)) @@ -676,8 +706,11 @@ def train(cls, data, isotonic=True): """ Train a isotonic regression model on the given data. - :param data: RDD of (label, feature, weight) tuples. - :param isotonic: Whether this is isotonic or antitonic. + :param data: + RDD of (label, feature, weight) tuples. + :param isotonic: + Whether this is isotonic (which is default) or antitonic. + (default: True) """ boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel", data.map(_convert_to_vector), bool(isotonic)) From d361d70806a9e758a9ee2986c144a89f6a0c7b63 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Wed, 6 Jan 2016 11:30:18 +0100 Subject: [PATCH 2/4] Style Fixes Change fill-column to 100. --- python/pyspark/mllib/regression.py | 39 ++++++++++++------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 26342f5abfc07..c6b52c9fbd64b 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -40,8 +40,8 @@ class LabeledPoint(object): :param label: Label for this data point. :param features: - Vector of features for this point (NumPy array, list, - pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix) + Vector of features for this point (NumPy array, list, pyspark.mllib.linalg.SparseVector, or + scipy.sparse column matrix) Note: 'label' and 'features' are accessible as class attributes. @@ -267,22 +267,17 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, (default: 0.0) :param regType: The type of regularizer used for training our model. - :Allowed values: - "l1" for using L1 regularization (lasso), - "l2" for using L2 regularization (ridge), - None for no regularization - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the augmented - representation for training data (i.e., whether bias features are - activated or not). + Boolean parameter which indicates the use or not of the augmented representation for + training data (i.e., whether bias features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate data - before training. + Boolean parameter which indicates if the algorithm should validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -422,13 +417,11 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, The initial weights. (default: None) :param intercept: - Boolean parameter which indicates the use or not of the augmented - representation for training data (i.e. whether bias features are - activated or not). + Boolean parameter which indicates the use or not of the augmented representation for + training data (i.e. whether bias features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate - data before training. + Boolean parameter which indicates if the algorithm should validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -568,13 +561,11 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, The initial weights. (default: None) :param intercept: - Boolean parameter which indicates the use or not of the augmented - representation for training data (i.e. whether bias features are - activated or not). + Boolean parameter which indicates the use or not of the augmented representation for + training data (i.e. whether bias features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate - data before training. + Boolean parameter which indicates if the algorithm should validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -594,11 +585,11 @@ class IsotonicRegressionModel(Saveable, Loader): Regression model for isotonic regression. :param boundaries: - Array of boundaries for which predictions are known. Boundaries must be - sorted in increasing order. + Array of boundaries for which predictions are known. Boundaries must be sorted in increasing + order. :param predictions: - Array of predictions associated to the boundaries at the same index. - Results of isotonic regression and therefore monotone. + Array of predictions associated to the boundaries at the same index. Results of isotonic + regression and therefore monotone. :param isotonic: Indicates whether this is isotonic or antitonic. From 45bec55b2f6bb165a0491e71bff6f2341a58b744 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Fri, 22 Jan 2016 15:21:51 +0100 Subject: [PATCH 3/4] Limit parameter descriptions to col 74 --- python/pyspark/mllib/regression.py | 36 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index c6b52c9fbd64b..de97ba515fb7b 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -40,8 +40,8 @@ class LabeledPoint(object): :param label: Label for this data point. :param features: - Vector of features for this point (NumPy array, list, pyspark.mllib.linalg.SparseVector, or - scipy.sparse column matrix) + Vector of features for this point (NumPy array, list, + pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix) Note: 'label' and 'features' are accessible as class attributes. @@ -273,11 +273,13 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - None for no regularization (default: None) :param intercept: - Boolean parameter which indicates the use or not of the augmented representation for - training data (i.e., whether bias features are activated or not). + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate data before training. + Boolean parameter which indicates if the algorithm should + validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -417,11 +419,13 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, The initial weights. (default: None) :param intercept: - Boolean parameter which indicates the use or not of the augmented representation for - training data (i.e. whether bias features are activated or not). + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate data before training. + Boolean parameter which indicates if the algorithm should + validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -561,11 +565,13 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, The initial weights. (default: None) :param intercept: - Boolean parameter which indicates the use or not of the augmented representation for - training data (i.e. whether bias features are activated or not). + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). (default: False) :param validateData: - Boolean parameter which indicates if the algorithm should validate data before training. + Boolean parameter which indicates if the algorithm should + validate data before training. (default: True) :param convergenceTol: A condition which decides iteration termination. @@ -585,11 +591,11 @@ class IsotonicRegressionModel(Saveable, Loader): Regression model for isotonic regression. :param boundaries: - Array of boundaries for which predictions are known. Boundaries must be sorted in increasing - order. + Array of boundaries for which predictions are known. Boundaries must + be sorted in increasing order. :param predictions: - Array of predictions associated to the boundaries at the same index. Results of isotonic - regression and therefore monotone. + Array of predictions associated to the boundaries at the same index. + Results of isotonic regression and therefore monotone. :param isotonic: Indicates whether this is isotonic or antitonic. From 5feecbad219895696709d804facfb8c575d1d5b4 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Sat, 23 Jan 2016 08:13:15 +0100 Subject: [PATCH 4/4] Fix indentation --- python/pyspark/mllib/regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index de97ba515fb7b..d5e3adb5ac9e2 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -41,7 +41,7 @@ class LabeledPoint(object): Label for this data point. :param features: Vector of features for this point (NumPy array, list, - pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix) + pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). Note: 'label' and 'features' are accessible as class attributes. @@ -422,7 +422,7 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, Boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias features are activated or not). - (default: False) + (default: False) :param validateData: Boolean parameter which indicates if the algorithm should validate data before training.