From 32c4a22a54a8ca88fdd766c3fab2672d219c0f06 Mon Sep 17 00:00:00 2001
From: Senja Filipi
Date: Fri, 22 Jun 2018 10:22:52 -0700
Subject: [PATCH 1/6] Adding XML-style documentation for the lbfgs, sdca and
 averagedPerceptron trainers, to improve what's currently on docs.microsoft.com

---
 .../Standard/LinearClassificationTrainer.cs   | 18 ++++
 .../LogisticRegression/LbfgsPredictorBase.cs  | 27 ++++++
 .../LogisticRegression/LogisticRegression.cs  |  2 +-
 .../MulticlassLogisticRegression.cs           |  2 +-
 .../Standard/Online/AveragedPerceptron.cs     | 14 ++-
 .../Standard/SdcaMultiClass.cs                |  2 +-
 .../Standard/SdcaRegression.cs                |  2 +-
 src/Microsoft.ML/CSharpApi.cs                 | 87 +++++++++++++++++--
 .../Common/EntryPoints/core_ep-list.tsv       | 10 +--
 .../Common/EntryPoints/core_manifest.json     | 10 +--
 10 files changed, 153 insertions(+), 21 deletions(-)

diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
index 56a3663054..6d72d866cf 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
@@ -222,6 +222,24 @@ internal virtual void Check(IHostEnvironment env)
             }
         }
 
+        internal const string SDCADetailedSummary = @"This trainer is based on the Stochastic Dual Coordinate
+Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions.
+The algorithm can be scaled to large out-of-memory data sets thanks to a semi-asynchronous implementation
+that supports multi-threading.
+Convergence is ensured by periodically enforcing synchronization between primal and dual updates in a separate thread.
+Several choices of loss functions are also provided.
+The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
+For more information on SDCA, see:
+Scaling Up Stochastic Dual Coordinate Ascent.
+Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.
+
+Note that SDCA is a stochastic and streaming optimization algorithm.
+The results depend on the order of the training data. For reproducible results, it is recommended that one set `shuffle` to
+`False` and `NumThreads` to `1`.
+
+Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence.
+In general, the larger the l2_weight, the faster SDCA converges.";
+
         // The order of these matter, since they are used as indices into arrays.
         protected enum MetricKind
         {
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
index 89f4866228..4a39d3ac92 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
@@ -94,6 +94,33 @@ public abstract class ArgumentsBase : LearnerInputBaseWithWeight
            public bool EnforceNonNegativity = false;
        }
 
+        internal const string DetailedSummary = @"Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution.
+If the dependent variable has only two possible values (success/failure), then the logistic regression is binary.
+If the dependent variable has more than two possible values (for example, blood type given diagnostic test results), then the logistic regression is multinomial.
+
+The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method.
+Both the L-BFGS and regular BFGS algorithms use quasi-Newton methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps.
+But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables.
+The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
+
+This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations.
+Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values.
+This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis.
+An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
+
+l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated with features that are relatively unimportant towards 0.
+l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero.
+
+Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
+The default values of x and y are both 1.
+An aggressive regularization can harm predictive capacity by excluding important variables from the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
+
+Wikipedia: L-BFGS.
+Wikipedia: Logistic regression.
+Scalable Training of L1-Regularized Log-Linear Models.
+Test Run - L1 and L2 Regularization for Machine Learning.
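+
+As an illustrative sketch only (the exact scaling constants used by the implementation may differ), the training objective being minimized has the form:
+loss(w) + l1_weight * sum_i |w_i| + l2_weight * sum_i (w_i)^2
+where loss(w) is the negative log-likelihood of the training data under the current weights w. For example, setting l1_weight = 0 and l2_weight = 1 reduces the penalty to pure ridge (L2) regularization, while l1_weight = 1 and l2_weight = 0 gives pure lasso (L1) regularization.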
+"; + protected int NumFeatures; protected VBuffer CurrentWeights; protected long NumGoodRows; diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 5abc062bf7..6f4a1d9617 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -386,7 +386,7 @@ public override ParameterMixingCalibratedPredictor CreatePredictor() new PlattCalibrator(Host, -1, 0)); } - [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifier", Desc = "Train a logistic regression binary model", UserName = UserNameValue, ShortName = ShortName)] + [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifier", Desc = DetailedSummary, UserName = UserNameValue, ShortName = ShortName)] public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs index 51decafea5..8e9b03b831 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs @@ -961,7 +961,7 @@ public IRow GetStatsIRowOrNull(RoleMappedSchema schema) /// public partial class LogisticRegression { - [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionClassifier", Desc = "Train a logistic regression multi class model", UserName = MulticlassLogisticRegression.UserNameValue, ShortName = MulticlassLogisticRegression.ShortName)] + [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionClassifier", Desc = DetailedSummary, UserName = MulticlassLogisticRegression.UserNameValue, ShortName = MulticlassLogisticRegression.ShortName)] public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs index 1861821d1c..b6b9df9894 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs @@ -37,6 +37,18 @@ public sealed class AveragedPerceptronTrainer : internal const string UserNameValue = "Averaged Perceptron"; internal const string ShortName = "ap"; internal const string Summary = "Perceptron is a binary classification algorithm that makes its predictions based on a linear function."; + internal const string DetailedSummary = @"Perceptron is a classification algorithm that makes its predictions based on a linear function. +I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm. + +Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time. +The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. 
+If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs, +the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, +multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, +and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero). + +In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). +The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors."; public class Arguments : AveragedLinearArguments { @@ -91,7 +103,7 @@ public override LinearBinaryPredictor CreatePredictor() return new LinearBinaryPredictor(Host, ref weights, bias); } - [TlcModule.EntryPoint(Name = "Trainers.AveragedPerceptronBinaryClassifier", Desc = "Train a Average perceptron.", UserName = UserNameValue, ShortName = ShortName)] + [TlcModule.EntryPoint(Name = "Trainers.AveragedPerceptronBinaryClassifier", Desc = DetailedSummary, UserName = UserNameValue, ShortName = ShortName)] public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs index f8d7db7998..b00bc1a4c5 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs @@ -386,7 +386,7 @@ protected override Float GetInstanceWeight(FloatLabelCursor cursor) /// public static partial class Sdca { - [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentClassifier", Desc = "Train an SDCA multi class model", UserName = SdcaMultiClassTrainer.UserNameValue, ShortName = SdcaMultiClassTrainer.ShortName)] + [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentClassifier", Desc = SdcaMultiClassTrainer.SDCADetailedSummary, UserName = SdcaMultiClassTrainer.UserNameValue, ShortName = SdcaMultiClassTrainer.ShortName)] public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs index e8f5aeb04d..516c2c7fcb 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs @@ -131,7 +131,7 @@ protected override Float TuneDefaultL2(IChannel ch, int maxIterations, long rowC /// public static partial class Sdca { - [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentRegressor", Desc = "Train an SDCA regression model", UserName = SdcaRegressionTrainer.UserNameValue, ShortName = SdcaRegressionTrainer.ShortName)] + [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentRegressor", Desc = SdcaRegressionTrainer.SDCADetailedSummary, 
UserName = SdcaRegressionTrainer.UserNameValue, ShortName = SdcaRegressionTrainer.ShortName)] public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index fd365acfbe..3533f74a1f 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -76,7 +76,7 @@ public Microsoft.ML.Data.TextLoader.Output Add(Microsoft.ML.Data.TextLoader inpu Add(input, output); return output; } - + public void Add(Microsoft.ML.Data.TextLoader input, Microsoft.ML.Data.TextLoader.Output output) { _jsonNodes.Add(Serialize("Data.TextLoader", input, output)); @@ -3642,7 +3642,16 @@ namespace Trainers { /// - /// Train a Average perceptron. + /// Perceptron is a classification algorithm that makes its predictions based on a linear function. + /// I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm. + /// Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time. + /// The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. + /// If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs, + /// the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, + /// multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, + /// and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero). + /// In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). + /// The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. /// public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -6535,7 +6544,26 @@ namespace Trainers { /// - /// Train a logistic regression binary model + /// Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. + /// If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. + /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. + /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). 
+ /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. + /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. + /// The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. + /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. + /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. + /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. + /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. + /// l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. + /// l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. + /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. + /// The default values of x and y are both 1. + /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. + /// Wikipedia: L-BFGS. + /// Wikipedia: Logistic regression. + /// Scalable Training of L1-Regularized Log-Linear Models. + /// Test Run - L1 and L2 Regularization for Machine Learning. /// public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -6685,7 +6713,26 @@ namespace Trainers { /// - /// Train a logistic regression multi class model + /// Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. + /// If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. + /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. + /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). 
+ /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. + /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. + /// The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. + /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. + /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. + /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. + /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. + /// l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. + /// l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. + /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. + /// The default values of x and y are both 1. + /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. + /// Wikipedia: L-BFGS. + /// Wikipedia: Logistic regression. + /// Scalable Training of L1-Regularized Log-Linear Models. + /// Test Run - L1 and L2 Regularization for Machine Learning. /// public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -7445,7 +7492,21 @@ namespace Trainers { /// - /// Train an SDCA multi class model + /// This classifier is a trainer based on the Stochastic DualCoordinate + /// Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions. + /// The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation + /// that supports multi-threading. + /// Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread. + /// Several choices of loss functions are also provided. + /// The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms. 
+ /// For more information on SDCA, see: + /// Scaling Up Stochastic Dual Coordinate Ascent. + /// Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. + /// Note that SDCA is a stochastic and streaming optimization algorithm. + /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to + /// `False` and `NumThreads` to `1`. + /// Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. + /// In general, the larger the l2_weight, the faster SDCA converges. /// public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -7570,7 +7631,21 @@ namespace Trainers { /// - /// Train an SDCA regression model + /// This classifier is a trainer based on the Stochastic DualCoordinate + /// Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions. + /// The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation + /// that supports multi-threading. + /// Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread. + /// Several choices of loss functions are also provided. + /// The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms. + /// For more information on SDCA, see: + /// Scaling Up Stochastic Dual Coordinate Ascent. + /// Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. + /// Note that SDCA is a stochastic and streaming optimization algorithm. + /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to + /// `False` and `NumThreads` to `1`. + /// Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. + /// In general, the larger the l2_weight, the faster SDCA converges. /// public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index a6d1f50668..72e1aeaef1 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -29,7 +29,7 @@ Models.Summarizer Summarize a linear regression predictor. Microsoft.ML.Runtime. Models.SweepResultExtractor Extracts the sweep result. 
Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro ExtractSweepResult Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+ResultInput Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+Output Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output] -Trainers.AveragedPerceptronBinaryClassifier Train a Average perceptron. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.AveragedPerceptronBinaryClassifier Perceptron is a classification algorithm that makes its predictions based on a linear function. \n I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.\n \n Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.\n The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. \n If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,\n the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, \n multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, \n and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).\n \n In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). \n The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestBinaryClassifier Uses a random forest learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastForest TrainBinary Microsoft.ML.Runtime.FastTree.FastForestClassification+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestRegressor Trains a random forest to fit target values using least-squares. 
Microsoft.ML.Runtime.FastTree.FastForest TrainRegression Microsoft.ML.Runtime.FastTree.FastForestRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.FastTreeBinaryClassifier Uses a logit-boost boosted tree learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastTree TrainBinary Microsoft.ML.Runtime.FastTree.FastTreeBinaryClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput @@ -40,15 +40,15 @@ Trainers.GeneralizedAdditiveModelBinaryClassifier Trains a gradient boosted stum Trainers.GeneralizedAdditiveModelRegressor Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. Microsoft.ML.Runtime.FastTree.Gam TrainRegression Microsoft.ML.Runtime.FastTree.RegressionGamTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.KMeansPlusPlusClusterer K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers. Microsoft.ML.Runtime.KMeans.KMeansPlusPlusTrainer TrainKMeans Microsoft.ML.Runtime.KMeans.KMeansPlusPlusTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+ClusteringOutput Trainers.LinearSvmBinaryClassifier Train a linear SVM. Microsoft.ML.Runtime.Learners.LinearSvm TrainLinearSvm Microsoft.ML.Runtime.Learners.LinearSvm+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.LogisticRegressionBinaryClassifier Train a logistic regression binary model Microsoft.ML.Runtime.Learners.LogisticRegression TrainBinary Microsoft.ML.Runtime.Learners.LogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.LogisticRegressionClassifier Train a logistic regression multi class model Microsoft.ML.Runtime.Learners.LogisticRegression TrainMultiClass Microsoft.ML.Runtime.Learners.MulticlassLogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainers.LogisticRegressionBinaryClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \n If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \n If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\n \n The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \n Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \n But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. 
\n The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\n \n This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \n Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \n This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \n An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\n \n l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \n l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \n \n Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \n The default values of x and y are both 1. \n An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\n \n Wikipedia: L-BFGS.\n Wikipedia: Logistic regression.\n Scalable Training of L1-Regularized Log-Linear Models.\n Test Run - L1 and L2 Regularization for Machine Learning.\n Microsoft.ML.Runtime.Learners.LogisticRegression TrainBinary Microsoft.ML.Runtime.Learners.LogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.LogisticRegressionClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \n If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \n If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\n \n The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \n Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \n But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \n The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\n \n This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. 
\n Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \n This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \n An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\n \n l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \n l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \n \n Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \n The default values of x and y are both 1. \n An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\n \n Wikipedia: L-BFGS.\n Wikipedia: Logistic regression.\n Scalable Training of L1-Regularized Log-Linear Models.\n Test Run - L1 and L2 Regularization for Machine Learning.\n Microsoft.ML.Runtime.Learners.LogisticRegression TrainMultiClass Microsoft.ML.Runtime.Learners.MulticlassLogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. Microsoft.ML.Runtime.Learners.MultiClassNaiveBayesTrainer TrainMultiClassNaiveBayesTrainer Microsoft.ML.Runtime.Learners.MultiClassNaiveBayesTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+AnomalyDetectionOutput Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Runtime.Learners.PoissonRegression TrainRegression Microsoft.ML.Runtime.Learners.PoissonRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. 
Microsoft.ML.Runtime.Learners.Sdca TrainBinary Microsoft.ML.Runtime.Learners.LinearClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.StochasticDualCoordinateAscentClassifier Train an SDCA multi class model Microsoft.ML.Runtime.Learners.Sdca TrainMultiClass Microsoft.ML.Runtime.Learners.SdcaMultiClassTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput -Trainers.StochasticDualCoordinateAscentRegressor Train an SDCA regression model Microsoft.ML.Runtime.Learners.Sdca TrainRegression Microsoft.ML.Runtime.Learners.SdcaRegressionTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput +Trainers.StochasticDualCoordinateAscentClassifier This classifier is a trainer based on the Stochastic DualCoordinate \n Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\n The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \n that supports multi-threading.\n Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\n Several choices of loss functions are also provided.\n The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\n For more information on SDCA, see:\n Scaling Up Stochastic Dual Coordinate Ascent.\n Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\n \n Note that SDCA is a stochastic and streaming optimization algorithm. \n The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\n `False` and `NumThreads` to `1`.\n \n Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \n In general, the larger the l2_weight, the faster SDCA converges. Microsoft.ML.Runtime.Learners.Sdca TrainMultiClass Microsoft.ML.Runtime.Learners.SdcaMultiClassTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainers.StochasticDualCoordinateAscentRegressor This classifier is a trainer based on the Stochastic DualCoordinate \n Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\n The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \n that supports multi-threading.\n Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\n Several choices of loss functions are also provided.\n The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\n For more information on SDCA, see:\n Scaling Up Stochastic Dual Coordinate Ascent.\n Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\n \n Note that SDCA is a stochastic and streaming optimization algorithm. \n The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\n `False` and `NumThreads` to `1`.\n \n Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \n In general, the larger the l2_weight, the faster SDCA converges. 
Microsoft.ML.Runtime.Learners.Sdca TrainRegression Microsoft.ML.Runtime.Learners.SdcaRegressionTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticGradientDescentBinaryClassifier Train an Hogwild SGD binary model. Microsoft.ML.Runtime.Learners.StochasticGradientDescentClassificationTrainer TrainBinary Microsoft.ML.Runtime.Learners.StochasticGradientDescentClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft.ML.Runtime.Data.BootstrapSample GetSample Microsoft.ML.Runtime.Data.BootstrapSampleTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.BinaryPredictionScoreColumnsRenamer For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. Microsoft.ML.Runtime.EntryPoints.ScoreModel RenameBinaryPredictionScoreColumns Microsoft.ML.Runtime.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index a5cb656da9..5799b00f20 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -3349,7 +3349,7 @@ }, { "Name": "Trainers.AveragedPerceptronBinaryClassifier", - "Desc": "Train a Average perceptron.", + "Desc": "Perceptron is a classification algorithm that makes its predictions based on a linear function. \r\nI.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.\r\n\r\nPerceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.\r\nThe weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. \r\nIf this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,\r\nthe weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, \r\nmultiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, \r\nand by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).\r\n\r\nIn Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). 
\r\nThe prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.", "FriendlyName": "Averaged Perceptron", "ShortName": "ap", "Inputs": [ @@ -9727,7 +9727,7 @@ }, { "Name": "Trainers.LogisticRegressionBinaryClassifier", - "Desc": "Train a logistic regression binary model", + "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \r\nIf the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \r\nIf the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\r\n\r\nThe optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \r\nBoth the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \r\nBut the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \r\nThe memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\r\n\r\nThis learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \r\nRegularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \r\nThis can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \r\nAn accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\r\n\r\nl1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \r\nl2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \r\n\r\nAdding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \r\nThe default values of x and y are both 1. \r\nAn agressive regularization can harm predictive capacity by excluding important variables out of the model. 
So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\r\n\r\nWikipedia: L-BFGS.\r\nWikipedia: Logistic regression.\r\nScalable Training of L1-Regularized Log-Linear Models.\r\nTest Run - L1 and L2 Regularization for Machine Learning.\r\n", "FriendlyName": "Logistic Regression", "ShortName": "lr", "Inputs": [ @@ -10039,7 +10039,7 @@ }, { "Name": "Trainers.LogisticRegressionClassifier", - "Desc": "Train a logistic regression multi class model", + "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \r\nIf the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \r\nIf the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\r\n\r\nThe optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \r\nBoth the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \r\nBut the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \r\nThe memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\r\n\r\nThis learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \r\nRegularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \r\nThis can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \r\nAn accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\r\n\r\nl1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \r\nl2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \r\n\r\nAdding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \r\nThe default values of x and y are both 1. \r\nAn agressive regularization can harm predictive capacity by excluding important variables out of the model. 
So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\r\n\r\nWikipedia: L-BFGS.\r\nWikipedia: Logistic regression.\r\nScalable Training of L1-Regularized Log-Linear Models.\r\nTest Run - L1 and L2 Regularization for Machine Learning.\r\n", "FriendlyName": "Multi-class Logistic Regression", "ShortName": "mlr", "Inputs": [ @@ -11545,7 +11545,7 @@ }, { "Name": "Trainers.StochasticDualCoordinateAscentClassifier", - "Desc": "Train an SDCA multi class model", + "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate \r\nAscent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\r\nThe algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \r\nthat supports multi-threading.\r\nConvergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\r\nSeveral choices of loss functions are also provided.\r\nThe SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\r\nFor more information on SDCA, see:\r\nScaling Up Stochastic Dual Coordinate Ascent.\r\nStochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\r\n\r\nNote that SDCA is a stochastic and streaming optimization algorithm. \r\nThe results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\r\n`False` and `NumThreads` to `1`.\r\n\r\nElastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \r\nIn general, the larger the l2_weight, the faster SDCA converges.", "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)", "ShortName": "sasdcamc", "Inputs": [ @@ -11815,7 +11815,7 @@ }, { "Name": "Trainers.StochasticDualCoordinateAscentRegressor", - "Desc": "Train an SDCA regression model", + "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate \r\nAscent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\r\nThe algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \r\nthat supports multi-threading.\r\nConvergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\r\nSeveral choices of loss functions are also provided.\r\nThe SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\r\nFor more information on SDCA, see:\r\nScaling Up Stochastic Dual Coordinate Ascent.\r\nStochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\r\n\r\nNote that SDCA is a stochastic and streaming optimization algorithm. \r\nThe results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\r\n`False` and `NumThreads` to `1`.\r\n\r\nElastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. 
\r\nIn general, the larger the l2_weight, the faster SDCA converges.", "FriendlyName": "Fast Linear Regression (SA-SDCA)", "ShortName": "sasdcar", "Inputs": [ From ca8d4037ae59bc0d83410bd3f4899437548aa4b3 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 22 Jun 2018 13:02:40 -0700 Subject: [PATCH 2/6] regenerating the C#Api file --- src/Microsoft.ML/CSharpApi.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 3533f74a1f..dcd05a010d 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -76,7 +76,7 @@ public Microsoft.ML.Data.TextLoader.Output Add(Microsoft.ML.Data.TextLoader inpu Add(input, output); return output; } - + public void Add(Microsoft.ML.Data.TextLoader input, Microsoft.ML.Data.TextLoader.Output output) { _jsonNodes.Add(Serialize("Data.TextLoader", input, output)); From 7ca7e8c98837673edb723813600f7e274d88291b Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 25 Jun 2018 12:21:55 -0700 Subject: [PATCH 3/6] Removing the control characters from the description when generating the ep_list.tsv, so that they have one line per entry point. Formatting. --- .../Standard/Online/AveragedPerceptron.cs | 7 +++--- src/Microsoft.ML/CSharpApi.cs | 23 ++++++++++++++++--- .../Common/EntryPoints/core_ep-list.tsv | 10 ++++---- .../Common/EntryPoints/core_manifest.json | 2 +- .../UnitTests/TestEntryPoints.cs | 9 +++++++- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs index b6b9df9894..57a6de3e71 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs @@ -37,17 +37,18 @@ public sealed class AveragedPerceptronTrainer : internal const string UserNameValue = "Averaged Perceptron"; internal const string ShortName = "ap"; internal const string Summary = "Perceptron is a binary classification algorithm that makes its predictions based on a linear function."; - internal const string DetailedSummary = @"Perceptron is a classification algorithm that makes its predictions based on a linear function. + internal const string DetailedSummary = @"Perceptron is a classification algorithm that makes its predictions based on a linear function. I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm. Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time. The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs, -the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, +the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, multiplied by a factor 0 < a <= 1, called the learning rate. 
In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero). -In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). +In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, +together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors."; public class Arguments : AveragedLinearArguments diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index dcd05a010d..d8e54b3629 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3642,15 +3642,18 @@ namespace Trainers { /// - /// Perceptron is a classification algorithm that makes its predictions based on a linear function. + /// Perceptron is a classification algorithm that makes its predictions based on a linear function. /// I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm. + /// /// Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time. /// The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. /// If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs, - /// the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, + /// the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, /// multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, /// and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero). - /// In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). + /// + /// In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, + /// together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). /// The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. 
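As a rough sketch of the update and averaging scheme the summary above describes (class and parameter names here are made up for illustration; the actual trainer handles sparse vectors, bias terms and learning-rate decay that this toy version omits):

    using System;

    public static class AveragedPerceptronSketch
    {
        // Trains on examples (x, y) with y in {-1, +1}, following the summary:
        // mistake-driven updates of w, plus a running sum of w after every example.
        public static float[] Train((float[] x, int y)[] data, int epochs, float learningRate)
        {
            int d = data[0].x.Length;
            var w = new float[d];      // current weight vector
            var wSum = new float[d];   // accumulated weight vectors (the "averaging")
            long steps = 0;

            for (int e = 0; e < epochs; e++)
            {
                foreach (var (x, y) in data)
                {
                    float score = 0;
                    for (int i = 0; i < d; i++)
                        score += w[i] * x[i];

                    // Update only when the sign of the score disagrees with the label.
                    if (Math.Sign(score) != Math.Sign(y))
                        for (int i = 0; i < d; i++)
                            w[i] += learningRate * y * x[i];

                    // Summing w after every example is equivalent to weighting each
                    // weight vector by the number of iterations it survived.
                    for (int i = 0; i < d; i++)
                        wSum[i] += w[i];
                    steps++;
                }
            }

            for (int i = 0; i < d; i++)
                wSum[i] /= steps;      // averaged weights used at prediction time
            return wSum;
        }

        // Prediction is the sign of the dot product with the averaged weights.
        public static int Predict(float[] wAvg, float[] x)
        {
            float score = 0;
            for (int i = 0; i < x.Length; i++)
                score += wAvg[i] * x[i];
            return score >= 0 ? 1 : -1;
        }
    }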
/// public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem @@ -6547,19 +6550,24 @@ namespace Trainers /// Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. /// If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. + /// /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. /// The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. + /// /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. + /// /// l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. /// l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. + /// /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. /// The default values of x and y are both 1. /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. + /// /// Wikipedia: L-BFGS. /// Wikipedia: Logistic regression. /// Scalable Training of L1-Regularized Log-Linear Models. @@ -6716,19 +6724,24 @@ namespace Trainers /// Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. 
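A minimal sketch of the binary prediction this summary is describing (an illustrative helper, not the library's implementation): the linear score w.x + b is passed through the logistic (sigmoid) function to give the class probability.

    using System;

    public static class LogisticSketch
    {
        // Probability of the positive class for one example: sigmoid of the linear score.
        public static double PredictProbability(double[] w, double bias, double[] x)
        {
            double score = bias;
            for (int i = 0; i < w.Length; i++)
                score += w[i] * x[i];
            return 1.0 / (1.0 + Math.Exp(-score));
        }

        // Binary decision using the usual 0.5 threshold on the probability.
        public static bool PredictLabel(double[] w, double bias, double[] x)
            => PredictProbability(w, bias, x) >= 0.5;
    }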
/// If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. + /// /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. /// The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. + /// /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. + /// /// l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. /// l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. + /// /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. /// The default values of x and y are both 1. /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. + /// /// Wikipedia: L-BFGS. /// Wikipedia: Logistic regression. /// Scalable Training of L1-Regularized Log-Linear Models. @@ -7502,9 +7515,11 @@ namespace Trainers /// For more information on SDCA, see: /// Scaling Up Stochastic Dual Coordinate Ascent. /// Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. + /// /// Note that SDCA is a stochastic and streaming optimization algorithm. /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to /// `False` and `NumThreads` to `1`. + /// /// Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. /// In general, the larger the l2_weight, the faster SDCA converges. 
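As a hedged illustration of how the l1_weight and l2_weight parameters mentioned above enter the objective (the function below is made up for the example; the exact scaling used inside the trainers may differ):

    using System;
    using System.Linq;

    public static class ElasticNetSketch
    {
        // Elastic-net regularized objective: data loss plus an L2 (ridge) term, whose
        // weight also affects the convergence rate, and an L1 (lasso) term that pulls
        // small, unimportant weights towards zero.
        public static double Objective(double dataLoss, double[] w, double l1Weight, double l2Weight)
        {
            double l1 = w.Sum(wi => Math.Abs(wi));
            double l2 = w.Sum(wi => wi * wi);
            return dataLoss + 0.5 * l2Weight * l2 + l1Weight * l1;
        }
    }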
/// @@ -7641,9 +7656,11 @@ namespace Trainers /// For more information on SDCA, see: /// Scaling Up Stochastic Dual Coordinate Ascent. /// Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. + /// /// Note that SDCA is a stochastic and streaming optimization algorithm. /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to /// `False` and `NumThreads` to `1`. + /// /// Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. /// In general, the larger the l2_weight, the faster SDCA converges. /// diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 72e1aeaef1..bed6fb8d9c 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -29,7 +29,7 @@ Models.Summarizer Summarize a linear regression predictor. Microsoft.ML.Runtime. Models.SweepResultExtractor Extracts the sweep result. Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro ExtractSweepResult Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+ResultInput Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+Output Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output] -Trainers.AveragedPerceptronBinaryClassifier Perceptron is a classification algorithm that makes its predictions based on a linear function. \n I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.\n \n Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.\n The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. \n If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,\n the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, \n multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, \n and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).\n \n In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). 
\n The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.AveragedPerceptronBinaryClassifier Perceptron is a classification algorithm that makes its predictions based on a linear function.I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestBinaryClassifier Uses a random forest learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastForest TrainBinary Microsoft.ML.Runtime.FastTree.FastForestClassification+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestRegressor Trains a random forest to fit target values using least-squares. Microsoft.ML.Runtime.FastTree.FastForest TrainRegression Microsoft.ML.Runtime.FastTree.FastForestRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.FastTreeBinaryClassifier Uses a logit-boost boosted tree learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastTree TrainBinary Microsoft.ML.Runtime.FastTree.FastTreeBinaryClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput @@ -40,15 +40,15 @@ Trainers.GeneralizedAdditiveModelBinaryClassifier Trains a gradient boosted stum Trainers.GeneralizedAdditiveModelRegressor Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. 
Microsoft.ML.Runtime.FastTree.Gam TrainRegression Microsoft.ML.Runtime.FastTree.RegressionGamTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.KMeansPlusPlusClusterer K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers. Microsoft.ML.Runtime.KMeans.KMeansPlusPlusTrainer TrainKMeans Microsoft.ML.Runtime.KMeans.KMeansPlusPlusTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+ClusteringOutput Trainers.LinearSvmBinaryClassifier Train a linear SVM. Microsoft.ML.Runtime.Learners.LinearSvm TrainLinearSvm Microsoft.ML.Runtime.Learners.LinearSvm+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.LogisticRegressionBinaryClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \n If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \n If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\n \n The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \n Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \n But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \n The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\n \n This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \n Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \n This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \n An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\n \n l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \n l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \n \n Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. 
\n The default values of x and y are both 1. \n An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\n \n Wikipedia: L-BFGS.\n Wikipedia: Logistic regression.\n Scalable Training of L1-Regularized Log-Linear Models.\n Test Run - L1 and L2 Regularization for Machine Learning.\n Microsoft.ML.Runtime.Learners.LogisticRegression TrainBinary Microsoft.ML.Runtime.Learners.LogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.LogisticRegressionClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \n If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \n If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\n \n The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \n Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \n But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \n The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\n \n This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \n Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \n This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \n An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\n \n l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \n l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \n \n Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \n The default values of x and y are both 1. \n An agressive regularization can harm predictive capacity by excluding important variables out of the model. 
So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\n \n Wikipedia: L-BFGS.\n Wikipedia: Logistic regression.\n Scalable Training of L1-Regularized Log-Linear Models.\n Test Run - L1 and L2 Regularization for Machine Learning.\n Microsoft.ML.Runtime.Learners.LogisticRegression TrainMultiClass Microsoft.ML.Runtime.Learners.MulticlassLogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainers.LogisticRegressionBinaryClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning. 
Microsoft.ML.Runtime.Learners.LogisticRegression TrainBinary Microsoft.ML.Runtime.Learners.LogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.LogisticRegressionClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning. Microsoft.ML.Runtime.Learners.LogisticRegression TrainMultiClass Microsoft.ML.Runtime.Learners.MulticlassLogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. 
Microsoft.ML.Runtime.Learners.MultiClassNaiveBayesTrainer TrainMultiClassNaiveBayesTrainer Microsoft.ML.Runtime.Learners.MultiClassNaiveBayesTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+AnomalyDetectionOutput Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Runtime.Learners.PoissonRegression TrainRegression Microsoft.ML.Runtime.Learners.PoissonRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. Microsoft.ML.Runtime.Learners.Sdca TrainBinary Microsoft.ML.Runtime.Learners.LinearClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.StochasticDualCoordinateAscentClassifier This classifier is a trainer based on the Stochastic DualCoordinate \n Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\n The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \n that supports multi-threading.\n Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\n Several choices of loss functions are also provided.\n The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\n For more information on SDCA, see:\n Scaling Up Stochastic Dual Coordinate Ascent.\n Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\n \n Note that SDCA is a stochastic and streaming optimization algorithm. \n The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\n `False` and `NumThreads` to `1`.\n \n Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \n In general, the larger the l2_weight, the faster SDCA converges. 
Microsoft.ML.Runtime.Learners.Sdca TrainMultiClass Microsoft.ML.Runtime.Learners.SdcaMultiClassTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput -Trainers.StochasticDualCoordinateAscentRegressor This classifier is a trainer based on the Stochastic DualCoordinate \n Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\n The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \n that supports multi-threading.\n Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\n Several choices of loss functions are also provided.\n The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\n For more information on SDCA, see:\n Scaling Up Stochastic Dual Coordinate Ascent.\n Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\n \n Note that SDCA is a stochastic and streaming optimization algorithm. \n The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\n `False` and `NumThreads` to `1`.\n \n Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \n In general, the larger the l2_weight, the faster SDCA converges. Microsoft.ML.Runtime.Learners.Sdca TrainRegression Microsoft.ML.Runtime.Learners.SdcaRegressionTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput +Trainers.StochasticDualCoordinateAscentClassifier This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.For more information on SDCA, see:Scaling Up Stochastic Dual Coordinate Ascent.Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to`False` and `NumThreads` to `1`.Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges. 
Microsoft.ML.Runtime.Learners.Sdca TrainMultiClass Microsoft.ML.Runtime.Learners.SdcaMultiClassTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainers.StochasticDualCoordinateAscentRegressor This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.For more information on SDCA, see:Scaling Up Stochastic Dual Coordinate Ascent.Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to`False` and `NumThreads` to `1`.Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges. Microsoft.ML.Runtime.Learners.Sdca TrainRegression Microsoft.ML.Runtime.Learners.SdcaRegressionTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticGradientDescentBinaryClassifier Train an Hogwild SGD binary model. Microsoft.ML.Runtime.Learners.StochasticGradientDescentClassificationTrainer TrainBinary Microsoft.ML.Runtime.Learners.StochasticGradientDescentClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft.ML.Runtime.Data.BootstrapSample GetSample Microsoft.ML.Runtime.Data.BootstrapSampleTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.BinaryPredictionScoreColumnsRenamer For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. Microsoft.ML.Runtime.EntryPoints.ScoreModel RenameBinaryPredictionScoreColumns Microsoft.ML.Runtime.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 5799b00f20..39f9e61759 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -3349,7 +3349,7 @@ }, { "Name": "Trainers.AveragedPerceptronBinaryClassifier", - "Desc": "Perceptron is a classification algorithm that makes its predictions based on a linear function. \r\nI.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.\r\n\r\nPerceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.\r\nThe weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. 
\r\nIf this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,\r\nthe weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, \r\nmultiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, \r\nand by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).\r\n\r\nIn Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). \r\nThe prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.", + "Desc": "Perceptron is a classification algorithm that makes its predictions based on a linear function.\r\nI.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.\r\n\r\nPerceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.\r\nThe weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. \r\nIf this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,\r\nthe weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,\r\nmultiplied by a factor 0 < a <= 1, called the learning rate. 
In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, \r\nand by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).\r\n\r\nIn Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, \r\ntogether with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).\r\nThe prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.", "FriendlyName": "Averaged Perceptron", "ShortName": "ap", "Inputs": [ diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 60e79a943d..6419991876 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -388,7 +388,14 @@ public void EntryPointCatalog() var catalog = ModuleCatalog.CreateInstance(Env); var path = DeleteOutputPath(entryPointsSubDir, epListFile); File.WriteAllLines(path, catalog.AllEntryPoints() - .Select(x => string.Join("\t", x.Name, x.Description, x.Method.DeclaringType, x.Method.Name, x.InputType, x.OutputType).Replace(Environment.NewLine, "\\n ")) + .Select(x => string.Join("\t", + x.Name, + new string(x.Description.Where(c => !char.IsControl(c)).ToArray()), + x.Method.DeclaringType, + x.Method.Name, + x.InputType, + x.OutputType) + .Replace(Environment.NewLine, "")) .OrderBy(x => x)); CheckEquality(entryPointsSubDir, epListFile); From c93c55f82f6241f84b4f2006f2160a2d3c37f0cd Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 25 Jun 2018 14:32:11 -0700 Subject: [PATCH 4/6] spaces --- src/Microsoft.ML/CSharpApi.cs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index d8e54b3629..27db1b3b6a 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3644,14 +3644,12 @@ namespace Trainers /// /// Perceptron is a classification algorithm that makes its predictions based on a linear function. /// I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm. - /// /// Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time. /// The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. /// If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs, /// the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example, /// multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, /// and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero). 
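The TestEntryPoints.cs change above filters each description through char.IsControl before writing the tsv baseline; a standalone sketch of the same idea (the class name and sample strings are hypothetical, only the Where filter mirrors the patch):

    using System;
    using System.Linq;

    public static class DescriptionFlattening
    {
        // Keep only non-control characters, so embedded \r\n sequences in the
        // multi-line summaries disappear and each description fits on one tsv line.
        public static string Flatten(string description)
            => new string(description.Where(c => !char.IsControl(c)).ToArray());

        public static void Main()
        {
            string multiLine = "Line one.\r\nLine two.";
            // Prints "Line one.Line two." with no separator inserted, matching the
            // flattened descriptions seen in core_ep-list.tsv.
            Console.WriteLine(Flatten(multiLine));
        }
    }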
- /// /// In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, /// together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not). /// The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. @@ -6550,24 +6548,19 @@ namespace Trainers /// Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. /// If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. - /// /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. /// The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. - /// /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. - /// /// l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. /// l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. - /// /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. /// The default values of x and y are both 1. /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. - /// /// Wikipedia: L-BFGS. /// Wikipedia: Logistic regression. /// Scalable Training of L1-Regularized Log-Linear Models. 
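Not part of the patch: a minimal sketch of the bounded history that the memory_size parameter described above controls. Only the last memory_size position and gradient differences are retained for the next step computation; the L-BFGS two-loop recursion itself is omitted and the names are illustrative.

    using System.Collections.Generic;

    public sealed class LbfgsHistorySketch
    {
        private readonly int _memorySize;
        // Each entry is (s, y): the change in position and in gradient between iterations.
        private readonly Queue<(double[] s, double[] y)> _history = new Queue<(double[] s, double[] y)>();

        public LbfgsHistorySketch(int memorySize)
        {
            _memorySize = memorySize;
        }

        public void Push(double[] positionDelta, double[] gradientDelta)
        {
            _history.Enqueue((positionDelta, gradientDelta));
            // Dropping the oldest pair once the limit is exceeded is what keeps memory
            // use bounded, unlike full BFGS, which maintains a dense Hessian approximation.
            if (_history.Count > _memorySize)
                _history.Dequeue();
        }

        public IReadOnlyCollection<(double[] s, double[] y)> History => _history;
    }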
@@ -6724,24 +6717,19 @@ namespace Trainers /// Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. /// If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. - /// /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. /// The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. - /// /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. - /// /// l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. /// l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. - /// /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. /// The default values of x and y are both 1. /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. - /// /// Wikipedia: L-BFGS. /// Wikipedia: Logistic regression. /// Scalable Training of L1-Regularized Log-Linear Models. @@ -7515,11 +7503,9 @@ namespace Trainers /// For more information on SDCA, see: /// Scaling Up Stochastic Dual Coordinate Ascent. /// Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. - /// /// Note that SDCA is a stochastic and streaming optimization algorithm. /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to /// `False` and `NumThreads` to `1`. 
- /// /// Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. /// In general, the larger the l2_weight, the faster SDCA converges. /// @@ -7656,11 +7642,9 @@ namespace Trainers /// For more information on SDCA, see: /// Scaling Up Stochastic Dual Coordinate Ascent. /// Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. - /// /// Note that SDCA is a stochastic and streaming optimization algorithm. /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to /// `False` and `NumThreads` to `1`. - /// /// Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. /// In general, the larger the l2_weight, the faster SDCA converges. /// From a8698bf18ac4e9e999d32c51e45b5f931b62fdee Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 26 Jun 2018 15:08:41 -0700 Subject: [PATCH 5/6] The epList.tsv file and the manifest should not have the platform specific new line characters. --- .../Standard/LinearClassificationTrainer.cs | 2 -- .../LogisticRegression/LbfgsPredictorBase.cs | 5 ----- .../Standard/Online/AveragedPerceptron.cs | 2 -- .../Common/EntryPoints/core_manifest.json | 10 +++++----- .../UnitTests/TestEntryPoints.cs | 18 +++++++++++++++--- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs index 6d72d866cf..6a2e18dbda 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs @@ -232,11 +232,9 @@ The SDCA method combines several of the best properties and capabilities of logi For more information on SDCA, see: Scaling Up Stochastic Dual Coordinate Ascent. Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. - Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to `False` and `NumThreads` to `1`. - Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges."; diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs index 4a39d3ac92..5fe70de2f0 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs @@ -97,24 +97,19 @@ public abstract class ArgumentsBase : LearnerInputBaseWithWeight internal const string DetailedSummary = @"Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. 
If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial. - The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step. - This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects. - l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. - Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model. - Wikipedia: L-BFGS. Wikipedia: Logistic regression. Scalable Training of L1-Regularized Log-Linear Models. diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs index 57a6de3e71..1164cbb5ae 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs @@ -39,14 +39,12 @@ public sealed class AveragedPerceptronTrainer : internal const string Summary = "Perceptron is a binary classification algorithm that makes its predictions based on a linear function."; internal const string DetailedSummary = @"Perceptron is a classification algorithm that makes its predictions based on a linear function. I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm. 
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
index 57a6de3e71..1164cbb5ae 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
@@ -39,14 +39,12 @@ public sealed class AveragedPerceptronTrainer :
 internal const string Summary = "Perceptron is a binary classification algorithm that makes its predictions based on a linear function.";
 internal const string DetailedSummary = @"Perceptron is a classification algorithm that makes its predictions based on a linear function.
 I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.
-
 Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
 The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed.
 If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
 the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
 multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate,
 and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).
-
 In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored,
 together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
 The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.";
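The mistake-driven update and the averaging scheme described in the DetailedSummary above can be sketched as follows. This is a toy version of the textbook algorithm with labels in {-1, +1} and averaging implemented as a running sum of weight vectors, which is equivalent to weighting each weight vector by the number of iterations it survived; it is not the AveragedPerceptronTrainer code, and all names are invented for the example.

// Toy averaged perceptron (illustration only, not the ML.NET trainer).
// Prediction: sign(w . f). On a mistake, w += rate * label * f.
using System;

static class AveragedPerceptronSketch
{
    public static double[] Train(double[][] features, int[] labels /* -1 or +1 */, double rate, int epochs)
    {
        int d = features[0].Length;
        var w = new double[d];
        var wSum = new double[d];
        long steps = 0;

        for (int e = 0; e < epochs; e++)
        {
            for (int i = 0; i < features.Length; i++)
            {
                double score = 0;
                for (int j = 0; j < d; j++) score += w[j] * features[i][j];

                // Mistake-driven update: only when the predicted sign disagrees with the label.
                if (Math.Sign(score) != labels[i])
                    for (int j = 0; j < d; j++) w[j] += rate * labels[i] * features[i][j];

                // Accumulate the current weight vector for averaging.
                for (int j = 0; j < d; j++) wSum[j] += w[j];
                steps++;
            }
        }

        // Averaged weights used at prediction time.
        var avg = new double[d];
        for (int j = 0; j < d; j++) avg[j] = wSum[j] / steps;
        return avg;
    }
}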
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
index 39f9e61759..11669e464d 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json
+++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -3349,7 +3349,7 @@
     },
     {
       "Name": "Trainers.AveragedPerceptronBinaryClassifier",
-      "Desc": "Perceptron is a classification algorithm that makes its predictions based on a linear function.\r\nI.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.\r\n\r\nPerceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.\r\nThe weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. \r\nIf this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,\r\nthe weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,\r\nmultiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, \r\nand by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).\r\n\r\nIn Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, \r\ntogether with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).\r\nThe prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.",
+      "Desc": "Perceptron is a classification algorithm that makes its predictions based on a linear function.I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.",
       "FriendlyName": "Averaged Perceptron",
       "ShortName": "ap",
       "Inputs": [
@@ -9727,7 +9727,7 @@
     },
     {
       "Name": "Trainers.LogisticRegressionBinaryClassifier",
-      "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \r\nIf the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \r\nIf the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\r\n\r\nThe optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \r\nBoth the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \r\nBut the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \r\nThe memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\r\n\r\nThis learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \r\nRegularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \r\nThis can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \r\nAn accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\r\n\r\nl1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \r\nl2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \r\n\r\nAdding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \r\nThe default values of x and y are both 1. \r\nAn agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\r\n\r\nWikipedia: L-BFGS.\r\nWikipedia: Logistic regression.\r\nScalable Training of L1-Regularized Log-Linear Models.\r\nTest Run - L1 and L2 Regularization for Machine Learning.\r\n",
+      "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning.",
       "FriendlyName": "Logistic Regression",
       "ShortName": "lr",
       "Inputs": [
@@ -10039,7 +10039,7 @@
     },
     {
       "Name": "Trainers.LogisticRegressionClassifier",
-      "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. \r\nIf the dependent variable has only two possible values (success/failure), then the logistic regression is binary. \r\nIf the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.\r\n\r\nThe optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). \r\nBoth the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. \r\nBut the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. \r\nThe memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.\r\n\r\nThis learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. \r\nRegularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. \r\nThis can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. \r\nAn accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.\r\n\r\nl1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. \r\nl2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. \r\n\r\nAdding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. \r\nThe default values of x and y are both 1. \r\nAn agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.\r\n\r\nWikipedia: L-BFGS.\r\nWikipedia: Logistic regression.\r\nScalable Training of L1-Regularized Log-Linear Models.\r\nTest Run - L1 and L2 Regularization for Machine Learning.\r\n",
+      "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning.",
       "FriendlyName": "Multi-class Logistic Regression",
       "ShortName": "mlr",
       "Inputs": [
@@ -11545,7 +11545,7 @@
     },
     {
       "Name": "Trainers.StochasticDualCoordinateAscentClassifier",
-      "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate \r\nAscent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\r\nThe algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \r\nthat supports multi-threading.\r\nConvergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\r\nSeveral choices of loss functions are also provided.\r\nThe SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\r\nFor more information on SDCA, see:\r\nScaling Up Stochastic Dual Coordinate Ascent.\r\nStochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\r\n\r\nNote that SDCA is a stochastic and streaming optimization algorithm. \r\nThe results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\r\n`False` and `NumThreads` to `1`.\r\n\r\nElastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \r\nIn general, the larger the l2_weight, the faster SDCA converges.",
+      "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.For more information on SDCA, see:Scaling Up Stochastic Dual Coordinate Ascent.Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to`False` and `NumThreads` to `1`.Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges.",
       "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)",
       "ShortName": "sasdcamc",
       "Inputs": [
@@ -11815,7 +11815,7 @@
     },
     {
       "Name": "Trainers.StochasticDualCoordinateAscentRegressor",
-      "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate \r\nAscent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.\r\nThe algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation \r\nthat supports multi-threading.\r\nConvergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.\r\nSeveral choices of loss functions are also provided.\r\nThe SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.\r\nFor more information on SDCA, see:\r\nScaling Up Stochastic Dual Coordinate Ascent.\r\nStochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.\r\n\r\nNote that SDCA is a stochastic and streaming optimization algorithm. \r\nThe results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to\r\n`False` and `NumThreads` to `1`.\r\n\r\nElastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. \r\nIn general, the larger the l2_weight, the faster SDCA converges.",
+      "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.For more information on SDCA, see:Scaling Up Stochastic Dual Coordinate Ascent.Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to`False` and `NumThreads` to `1`.Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges.",
       "FriendlyName": "Fast Linear Regression (SA-SDCA)",
       "ShortName": "sasdcar",
       "Inputs": [
diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
index 6419991876..51f7dd1d42 100644
--- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
+++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -6,6 +6,7 @@
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
+using System.Text.RegularExpressions;
 using Microsoft.ML.Runtime.Api;
 using Microsoft.ML.Runtime.Core.Tests.UnitTests;
 using Microsoft.ML.Runtime.Data;
@@ -387,12 +388,14 @@ public void EntryPointCatalog()
             var entryPointsSubDir = Path.Combine("..", "Common", "EntryPoints");
             var catalog = ModuleCatalog.CreateInstance(Env);
             var path = DeleteOutputPath(entryPointsSubDir, epListFile);
+
+            var regex = new Regex(@"\r\n?|\n", RegexOptions.Compiled);
             File.WriteAllLines(path, catalog.AllEntryPoints()
-                .Select(x => string.Join("\t",
+                .Select(x => string.Join("\t",
                     x.Name,
-                    new string(x.Description.Where(c => !char.IsControl(c)).ToArray()),
+                    regex.Replace(x.Description, ""),
                     x.Method.DeclaringType,
-                    x.Method.Name,
+                    x.Method.Name,
                     x.InputType,
                     x.OutputType)
                 .Replace(Environment.NewLine, ""))
@@ -401,6 +404,15 @@ public void EntryPointCatalog()
             CheckEquality(entryPointsSubDir, epListFile);
 
             var jObj = JsonManifestUtils.BuildAllManifests(Env, catalog);
+
+            //clean up the description from the new line characters
+            if (jObj[FieldNames.TopEntryPoints] != null && jObj[FieldNames.TopEntryPoints] is JArray)
+            {
+                foreach (JToken entry in jObj[FieldNames.TopEntryPoints].Children())
+                    if (entry[FieldNames.Desc] != null)
+                        entry[FieldNames.Desc] = regex.Replace(entry[FieldNames.Desc].ToString(), "");
+            }
+
             var jPath = DeleteOutputPath(entryPointsSubDir, manifestFile);
             using (var file = File.OpenWrite(jPath))
             using (var writer = new StreamWriter(file))
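To make the effect of the regex added in the test above concrete: it removes every platform-specific line break from an entry-point description before the baseline files are written, which is why the updated Desc values in core_manifest.json and core_ep-list.tsv appear as single lines (with no space inserted where a break used to be). A stand-alone illustration, in which the sample description string is invented and only the regex comes from the change:

// Stand-alone illustration of the newline-stripping regex used in the test above.
using System;
using System.Text.RegularExpressions;

static class NewLineStripDemo
{
    public static void Main()
    {
        var regex = new Regex(@"\r\n?|\n", RegexOptions.Compiled);

        string desc = "Perceptron is an online algorithm.\r\nIt processes the instances one at a time.";
        string flattened = regex.Replace(desc, "");

        // Prints the description on a single line: the "\r\n" is removed outright,
        // so the two sentences are joined without an extra space.
        Console.WriteLine(flattened);
    }
}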
From bd86afaab532cf0c1c2f671c21d5ac0d74f4ee55 Mon Sep 17 00:00:00 2001
From: Senja Filipi
Date: Wed, 27 Jun 2018 08:48:29 -0700
Subject: [PATCH 6/6] merge fix

---
 test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
index c13daf2421..ee0eb3de15 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
+++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
@@ -36,7 +36,7 @@ Models.Summarizer Summarize a linear regression predictor. Microsoft.ML.Runtime.
 Models.SweepResultExtractor Extracts the sweep result. Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro ExtractSweepResult Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+ResultInput Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+Output
 Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output]
 Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output]
-Trainers.AveragedPerceptronBinaryClassifier Train a Average perceptron. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
+Trainers.AveragedPerceptronBinaryClassifier Perceptron is a classification algorithm that makes its predictions based on a linear function.I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
 Trainers.EnsembleBinaryClassifier Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
 Trainers.EnsembleClassification Train multiclass ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput
 Trainers.EnsembleRegression Train regression ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateRegressionEnsemble Microsoft.ML.Runtime.Ensemble.RegressionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput