diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FastTreeRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FastTreeRegression.cs index 546de0640c..214435f5d3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FastTreeRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/FastTreeRegression.cs @@ -30,22 +30,12 @@ public static void Example() // We will train a FastTreeRegression model with 1 tree on these two columns to predict Age. string outputColumnName = "Features"; var pipeline = ml.Transforms.Concatenate(outputColumnName, new[] { "Parity", "Induced" }) - .Append(ml.Regression.Trainers.FastTree(labelColumnName: "Age", featureColumnName: outputColumnName, numTrees: 1, numLeaves: 2, minDatapointsInLeaves: 1)); + .Append(ml.Regression.Trainers.FastTree(labelColumnName: "Age", featureColumnName: outputColumnName, numberOfTrees: 1, numberOfLeaves: 2, minimumExampleCountPerLeaf: 1)); var model = pipeline.Fit(trainData); // Get the trained model parameters. var modelParams = model.LastTransformer.Model; - - // Let's see where an example with Parity = 1 and Induced = 1 would end up in the single trained tree. - var testRow = new VBuffer(2, new[] { 1.0f, 1.0f }); - // Use the path object to pass to GetLeaf, which will populate path with the IDs of th nodes from root to leaf. - List path = default; - // Get the ID of the leaf this example ends up in tree 0. - var leafID = modelParams.GetLeaf(0, in testRow, ref path); - // Get the leaf value for this leaf ID in tree 0. - var leafValue = modelParams.GetLeafValue(0, leafID); - Console.WriteLine("The leaf value in tree 0 is: " + leafValue); } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs index e3edb0813c..0c75071dfa 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs @@ -28,7 +28,7 @@ public static void Example() .ToArray(); var pipeline = mlContext.Transforms.Concatenate("Features", featureNames) .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels( - labelColumnName: labelName, featureColumnName: "Features", maxBins: 16)); + labelColumnName: labelName, featureColumnName: "Features", maxBinCountPerFeature: 16)); var fitPipeline = pipeline.Fit(data); // Extract the model from the pipeline @@ -37,7 +37,7 @@ public static void Example() // Now investigate the properties of the Generalized Additive Model: The intercept and shape functions. // The intercept for the GAM models represent the average prediction for the training data - var intercept = gamModel.Intercept; + var intercept = gamModel.Bias; // Expected output: Average predicted cost: 22.53 Console.WriteLine($"Average predicted cost: {intercept:0.00}"); @@ -93,7 +93,7 @@ public static void Example() // Distillation." arXiv:1710.06169." 
Console.WriteLine(); Console.WriteLine("Student-Teacher Ratio"); - for (int i = 0; i < teacherRatioBinUpperBounds.Length; i++) + for (int i = 0; i < teacherRatioBinUpperBounds.Count; i++) { Console.WriteLine($"x < {teacherRatioBinUpperBounds[i]:0.00} => {teacherRatioBinEffects[i]:0.000}"); } diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs index 858c651e48..0480ec5015 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs @@ -78,9 +78,9 @@ public static void FastTreeBinaryClassification() Score: mlContext.BinaryClassification.Trainers.FastTree( row.Label, row.Features, - numTrees: 100, // try: (int) 20-2000 - numLeaves: 20, // try: (int) 2-128 - minDatapointsInLeaves: 10, // try: (int) 1-100 + numberOfTrees: 100, // try: (int) 20-2000 + numberOfLeaves: 20, // try: (int) 2-128 + minimumExampleCountPerLeaf: 10, // try: (int) 1-100 learningRate: 0.2))) // try: (float) 0.025-0.4 .Append(row => ( Label: row.Label, diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs index c7ca0eb905..8be77fae91 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs @@ -38,9 +38,9 @@ public static void FastTreeRegression() .Append(r => (r.label, score: mlContext.Regression.Trainers.FastTree( r.label, r.features, - numTrees: 100, // try: (int) 20-2000 - numLeaves: 20, // try: (int) 2-128 - minDatapointsInLeaves: 10, // try: (int) 1-100 + numberOfTrees: 100, // try: (int) 20-2000 + numberOfLeaves: 20, // try: (int) 2-128 + minimumExampleCountPerLeaf: 10, // try: (int) 1-100 learningRate: 0.2, // try: (float) 0.025-0.4 onFit: p => pred = p) ) diff --git a/src/Microsoft.ML.FastTree/BoostingFastTree.cs b/src/Microsoft.ML.FastTree/BoostingFastTree.cs index 0281b8ab88..e587dcb4c3 100644 --- a/src/Microsoft.ML.FastTree/BoostingFastTree.cs +++ b/src/Microsoft.ML.FastTree/BoostingFastTree.cs @@ -18,16 +18,16 @@ private protected BoostingFastTreeTrainerBase(IHostEnvironment env, TOptions opt private protected BoostingFastTreeTrainerBase(IHostEnvironment env, SchemaShape.Column label, - string featureColumn, - string weightColumn, - string groupIdColumn, - int numLeaves, - int numTrees, - int minDatapointsInLeaves, + string featureColumnName, + string exampleWeightColumnName, + string rowGroupColumnName, + int numberOfLeaves, + int numberOfTrees, + int minimumExampleCountPerLeaf, double learningRate) - : base(env, label, featureColumn, weightColumn, groupIdColumn, numLeaves, numTrees, minDatapointsInLeaves) + : base(env, label, featureColumnName, exampleWeightColumnName, rowGroupColumnName, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf) { - FastTreeTrainerOptions.LearningRates = learningRate; + FastTreeTrainerOptions.LearningRate = learningRate; } private protected override void CheckOptions(IChannel ch) @@ -40,10 +40,10 @@ private protected override void CheckOptions(IChannel ch) if (FastTreeTrainerOptions.CompressEnsemble && FastTreeTrainerOptions.WriteLastEnsemble) throw ch.Except("Ensemble compression cannot be done when forcing to write last ensemble (hl)"); - if (FastTreeTrainerOptions.NumLeaves > 2 && FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumLeaves - 1) + if 
(FastTreeTrainerOptions.NumberOfLeaves > 2 && FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumberOfLeaves - 1) throw ch.Except("Histogram pool size (ps) must be at least 2."); - if (FastTreeTrainerOptions.NumLeaves > 2 && FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumLeaves - 1) + if (FastTreeTrainerOptions.NumberOfLeaves > 2 && FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumberOfLeaves - 1) throw ch.Except("Histogram pool size (ps) must be at most numLeaves - 1."); if (FastTreeTrainerOptions.EnablePruning && !HasValidSet) @@ -61,12 +61,12 @@ private protected override void CheckOptions(IChannel ch) private protected override TreeLearner ConstructTreeLearner(IChannel ch) { return new LeastSquaresRegressionTreeLearner( - TrainSet, FastTreeTrainerOptions.NumLeaves, FastTreeTrainerOptions.MinDocumentsInLeafs, FastTreeTrainerOptions.EntropyCoefficient, + TrainSet, FastTreeTrainerOptions.NumberOfLeaves, FastTreeTrainerOptions.MinimumExampleCountPerLeaf, FastTreeTrainerOptions.EntropyCoefficient, FastTreeTrainerOptions.FeatureFirstUsePenalty, FastTreeTrainerOptions.FeatureReusePenalty, FastTreeTrainerOptions.SoftmaxTemperature, - FastTreeTrainerOptions.HistogramPoolSize, FastTreeTrainerOptions.RngSeed, FastTreeTrainerOptions.SplitFraction, FastTreeTrainerOptions.FilterZeroLambdas, - FastTreeTrainerOptions.AllowEmptyTrees, FastTreeTrainerOptions.GainConfidenceLevel, FastTreeTrainerOptions.MaxCategoricalGroupsPerNode, - FastTreeTrainerOptions.MaxCategoricalSplitPoints, BsrMaxTreeOutput(), ParallelTraining, - FastTreeTrainerOptions.MinDocsPercentageForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinDocsForCategoricalSplit, FastTreeTrainerOptions.Bias); + FastTreeTrainerOptions.HistogramPoolSize, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.FeatureFractionPerSplit, FastTreeTrainerOptions.FilterZeroLambdas, + FastTreeTrainerOptions.AllowEmptyTrees, FastTreeTrainerOptions.GainConfidenceLevel, FastTreeTrainerOptions.MaximumCategoricalGroupCountPerNode, + FastTreeTrainerOptions.MaximumCategoricalSplitPointCount, BsrMaxTreeOutput(), ParallelTraining, + FastTreeTrainerOptions.MinimumExampleFractionForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinimumExamplesForCategoricalSplit, FastTreeTrainerOptions.Bias); } private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm(IChannel ch) @@ -94,7 +94,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm( optimizationAlgorithm.ObjectiveFunction = ConstructObjFunc(ch); optimizationAlgorithm.Smoothing = FastTreeTrainerOptions.Smoothing; optimizationAlgorithm.DropoutRate = FastTreeTrainerOptions.DropoutRate; - optimizationAlgorithm.DropoutRng = new Random(FastTreeTrainerOptions.RngSeed); + optimizationAlgorithm.DropoutRng = new Random(FastTreeTrainerOptions.Seed); optimizationAlgorithm.PreScoreUpdateEvent += PrintTestGraph; return optimizationAlgorithm; @@ -162,7 +162,7 @@ private protected override int GetBestIteration(IChannel ch) private protected double BsrMaxTreeOutput() { if (FastTreeTrainerOptions.BestStepRankingRegressionTrees) - return FastTreeTrainerOptions.MaxTreeOutput; + return FastTreeTrainerOptions.MaximumTreeOutput; else return -1; } diff --git a/src/Microsoft.ML.FastTree/FastTree.cs b/src/Microsoft.ML.FastTree/FastTree.cs index f27a46eaa5..2814ca859a 100644 --- a/src/Microsoft.ML.FastTree/FastTree.cs +++ b/src/Microsoft.ML.FastTree/FastTree.cs @@ -101,26 +101,26 @@ 
public abstract class FastTreeTrainerBase : /// private protected FastTreeTrainerBase(IHostEnvironment env, SchemaShape.Column label, - string featureColumn, - string weightColumn, - string groupIdColumn, - int numLeaves, - int numTrees, - int minDatapointsInLeaves) - : base(Contracts.CheckRef(env, nameof(env)).Register(RegisterName), TrainerUtils.MakeR4VecFeature(featureColumn), label, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn), TrainerUtils.MakeU4ScalarColumn(groupIdColumn)) + string featureColumnName, + string exampleWeightColumnName, + string rowGroupColumnName, + int numberOfLeaves, + int numberOfTrees, + int minimumExampleCountPerLeaf) + : base(Contracts.CheckRef(env, nameof(env)).Register(RegisterName), TrainerUtils.MakeR4VecFeature(featureColumnName), label, TrainerUtils.MakeR4ScalarWeightColumn(exampleWeightColumnName), TrainerUtils.MakeU4ScalarColumn(rowGroupColumnName)) { FastTreeTrainerOptions = new TOptions(); // set up the directly provided values // override with the directly provided values. - FastTreeTrainerOptions.NumLeaves = numLeaves; - FastTreeTrainerOptions.NumTrees = numTrees; - FastTreeTrainerOptions.MinDocumentsInLeafs = minDatapointsInLeaves; + FastTreeTrainerOptions.NumberOfLeaves = numberOfLeaves; + FastTreeTrainerOptions.NumberOfTrees = numberOfTrees; + FastTreeTrainerOptions.MinimumExampleCountPerLeaf = minimumExampleCountPerLeaf; FastTreeTrainerOptions.LabelColumnName = label.Name; - FastTreeTrainerOptions.FeatureColumnName = featureColumn; - FastTreeTrainerOptions.ExampleWeightColumnName = weightColumn; - FastTreeTrainerOptions.RowGroupColumnName = groupIdColumn; + FastTreeTrainerOptions.FeatureColumnName = featureColumnName; + FastTreeTrainerOptions.ExampleWeightColumnName = exampleWeightColumnName; + FastTreeTrainerOptions.RowGroupColumnName = rowGroupColumnName; // The discretization step renders this trainer non-parametric, and therefore it does not need normalization. // Also since it builds its own internal discretized columnar structures, it cannot benefit from caching. @@ -176,7 +176,7 @@ private protected virtual float GetMaxLabel() private void Initialize(IHostEnvironment env) { - int numThreads = FastTreeTrainerOptions.NumThreads ?? Environment.ProcessorCount; + int numThreads = FastTreeTrainerOptions.NumberOfThreads ?? 
Environment.ProcessorCount; if (Host.ConcurrencyFactor > 0 && numThreads > Host.ConcurrencyFactor) { using (var ch = Host.Start("FastTreeTrainerBase")) @@ -198,7 +198,7 @@ private protected void ConvertData(RoleMappedData trainData) { AnnotationUtils.TryGetCategoricalFeatureIndices(trainData.Schema.Schema, trainData.Schema.Feature.Value.Index, out CategoricalFeatures); var useTranspose = UseTranspose(FastTreeTrainerOptions.DiskTranspose, trainData) && (ValidData == null || UseTranspose(FastTreeTrainerOptions.DiskTranspose, ValidData)); - var instanceConverter = new ExamplesToFastTreeBins(Host, FastTreeTrainerOptions.MaxBins, useTranspose, !FastTreeTrainerOptions.FeatureFlocks, FastTreeTrainerOptions.MinDocumentsInLeafs, GetMaxLabel()); + var instanceConverter = new ExamplesToFastTreeBins(Host, FastTreeTrainerOptions.MaximumBinCountPerFeature, useTranspose, !FastTreeTrainerOptions.FeatureFlocks, FastTreeTrainerOptions.MinimumExampleCountPerLeaf, GetMaxLabel()); TrainSet = instanceConverter.FindBinsAndReturnDataset(trainData, PredictionKind, ParallelTraining, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit); FeatureMap = instanceConverter.FeatureMap; @@ -236,8 +236,8 @@ private protected void TrainCore(IChannel ch) } using (Timer.Time(TimerEvent.TotalTrain)) Train(ch); - if (FastTreeTrainerOptions.ExecutionTimes) - PrintExecutionTimes(ch); + if (FastTreeTrainerOptions.ExecutionTime) + PrintExecutionTime(ch); TrainedEnsemble = Ensemble; if (FeatureMap != null) TrainedEnsemble.RemapFeatures(FeatureMap); @@ -257,7 +257,7 @@ private protected virtual void InitializeThreads(int numThreads) ThreadTaskManager.Initialize(numThreads); } - private protected virtual void PrintExecutionTimes(IChannel ch) + private protected virtual void PrintExecutionTime(IChannel ch) { ch.Info("Execution time breakdown:\n{0}", Timer.GetString()); } @@ -270,14 +270,14 @@ private protected virtual void CheckOptions(IChannel ch) // change arguments if (FastTreeTrainerOptions.HistogramPoolSize < 2) - FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumLeaves * 2 / 3; - if (FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumLeaves - 1) - FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumLeaves - 1; + FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumberOfLeaves * 2 / 3; + if (FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumberOfLeaves - 1) + FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumberOfLeaves - 1; if (FastTreeTrainerOptions.BaggingSize > 0) { - int bagCount = FastTreeTrainerOptions.NumTrees / FastTreeTrainerOptions.BaggingSize; - if (bagCount * FastTreeTrainerOptions.BaggingSize != FastTreeTrainerOptions.NumTrees) + int bagCount = FastTreeTrainerOptions.NumberOfTrees / FastTreeTrainerOptions.BaggingSize; + if (bagCount * FastTreeTrainerOptions.BaggingSize != FastTreeTrainerOptions.NumberOfTrees) throw ch.Except("Number of trees should be a multiple of number bag size"); } @@ -423,7 +423,7 @@ private protected bool[] GetActiveFeatures() if (FastTreeTrainerOptions.FeatureFraction < 1.0) { if (_featureSelectionRandom == null) - _featureSelectionRandom = new Random(FastTreeTrainerOptions.FeatureSelectSeed); + _featureSelectionRandom = new Random(FastTreeTrainerOptions.FeatureSelectionSeed); for (int i = 0; i < TrainSet.NumFeatures; ++i) { @@ -593,7 +593,7 @@ private void GenerateActiveFeatureLists(int numberOfItems) private protected virtual BaggingProvider CreateBaggingProvider() { 
Contracts.Assert(FastTreeTrainerOptions.BaggingSize > 0); - return new BaggingProvider(TrainSet, FastTreeTrainerOptions.NumLeaves, FastTreeTrainerOptions.RngSeed, FastTreeTrainerOptions.BaggingTrainFraction); + return new BaggingProvider(TrainSet, FastTreeTrainerOptions.NumberOfLeaves, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.BaggingExampleFraction); } private protected virtual bool ShouldRandomStartOptimizer() @@ -604,7 +604,7 @@ private protected virtual bool ShouldRandomStartOptimizer() private protected virtual void Train(IChannel ch) { Contracts.AssertValue(ch); - int numTotalTrees = FastTreeTrainerOptions.NumTrees; + int numTotalTrees = FastTreeTrainerOptions.NumberOfTrees; ch.Info( "Reserved memory for tree learner: {0} bytes", @@ -624,7 +624,7 @@ private protected virtual void Train(IChannel ch) if (Ensemble.NumTrees < numTotalTrees && ShouldRandomStartOptimizer()) { ch.Info("Randomizing start point"); - OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.RngSeed, false); + OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.Seed, false); revertRandomStart = true; } @@ -711,7 +711,7 @@ private protected virtual void Train(IChannel ch) { revertRandomStart = false; ch.Info("Reverting random score assignment"); - OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.RngSeed, true); + OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.Seed, true); } #if !NO_STORE @@ -796,7 +796,7 @@ private protected virtual void PrintIterationMessage(IChannel ch, IProgressChann private protected virtual void PrintTestResults(IChannel ch) { - if (FastTreeTrainerOptions.TestFrequency != int.MaxValue && (Ensemble.NumTrees % FastTreeTrainerOptions.TestFrequency == 0 || Ensemble.NumTrees == FastTreeTrainerOptions.NumTrees)) + if (FastTreeTrainerOptions.TestFrequency != int.MaxValue && (Ensemble.NumTrees % FastTreeTrainerOptions.TestFrequency == 0 || Ensemble.NumTrees == FastTreeTrainerOptions.NumberOfTrees)) { var sb = new StringBuilder(); using (var sw = new StringWriter(sb)) @@ -902,7 +902,7 @@ internal abstract class DataConverter /// in this array are initialized to non-null values but it must happen at least no later /// than immediately after we return from . 
/// - public readonly Double[][] BinUpperBounds; + public readonly double[][] BinUpperBounds; /// /// In the event that any features are filtered, this will contain the feature map, where @@ -927,7 +927,7 @@ private protected bool UsingMaxLabel get { return MaxLabel != float.PositiveInfinity; } } - private DataConverter(RoleMappedData data, IHost host, Double[][] binUpperBounds, float maxLabel, + private DataConverter(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit) { Contracts.AssertValue(host, "host"); @@ -946,7 +946,7 @@ private DataConverter(RoleMappedData data, IHost host, Double[][] binUpperBounds BinUpperBounds = binUpperBounds; } else - BinUpperBounds = new Double[NumFeatures][]; + BinUpperBounds = new double[NumFeatures][]; MaxLabel = maxLabel; PredictionKind = kind; CategoricalSplit = categoricalSplit; @@ -972,7 +972,7 @@ public static DataConverter Create(RoleMappedData data, IHost host, int maxBins, return conv; } - public static DataConverter Create(RoleMappedData data, IHost host, Double[][] binUpperBounds, + public static DataConverter Create(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel, bool diskTranspose, bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit) { Contracts.AssertValue(host, "host"); @@ -1025,7 +1025,7 @@ private protected static bool CalculateBins(BinFinder binFinder, in VBuffer> NonZeroBinnedValuesForSparse(ReadOnlySpan values, ReadOnlySpan indices, Double[] binUpperBounds) + private static IEnumerable> NonZeroBinnedValuesForSparse(ReadOnlySpan values, ReadOnlySpan indices, double[] binUpperBounds) { Contracts.Assert(values.Length == indices.Length); Contracts.Assert(Algorithms.FindFirstGE(binUpperBounds, 0) == 0); @@ -1093,12 +1093,12 @@ private FeatureFlockBase CreateOneHotFlock(IChannel ch, ch.Assert(min <= fi && fi < lim); int subfeature = f2sf[fi - min]; ch.Assert(subfeature >= 0); - Double val = ind[subfeature, i]; + double val = ind[subfeature, i]; #if false // Same note, too slow even for debug builds. // Assert that all the other features really would be cold for this position. Contracts.Assert(Enumerable.Range(min, fi - min).Concat(Enumerable.Range(fi + 1, lim - (fi + 1))).All(f => ind[f, i] < BinUpperBounds[f][0])); #endif - Double[] bub = BinUpperBounds[fi]; + double[] bub = BinUpperBounds[fi]; ch.Assert(bub.Length > 1); int bin = Algorithms.FindFirstGE(bub, val); ch.Assert(0 < bin && bin < bub.Length); // If 0, should not have been considered "on", so what the heck? @@ -1172,7 +1172,7 @@ private FeatureFlockBase CreateOneHotFlockCategorical(IChannel ch, // Assert that all the other features really would be cold for this position. Contracts.Assert(Enumerable.Range(min, fi - min).Concat(Enumerable.Range(fi + 1, lim - (fi + 1))).All(f => ind[f, i] < BinUpperBounds[f][0])); #endif - Double[] bub = BinUpperBounds[fi]; + double[] bub = BinUpperBounds[fi]; ch.Assert(bub.Length == 2); //REVIEW: leaving out check for the value to reduced memory consuption and going with //leap of faith based on what the user told. @@ -1207,7 +1207,7 @@ private FeatureFlockBase CreateOneHotFlockCategorical(IChannel ch, /// The upper bounds of the binning of this feature. /// A derived binned derived feature vector. 
private protected static SingletonFeatureFlock CreateSingletonFlock(IChannel ch, in VBuffer values, int[] binnedValues, - Double[] binUpperBounds) + double[] binUpperBounds) { Contracts.AssertValue(ch); ch.Assert(Utils.Size(binUpperBounds) > 0); @@ -1419,7 +1419,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB // Perhaps we should change the binning to just work over singles. VBuffer doubleTemp = default(VBuffer); - var copier = GetCopier(NumberDataViewType.Single, NumberDataViewType.Double); + var copier = GetCopier(NumberDataViewType.Single, NumberDataViewType.Double); int iFeature = 0; pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, features.Length)); while (cursor.MoveNext()) @@ -1488,7 +1488,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB VBuffer doubleTemp = default(VBuffer); int[] binnedValues = new int[numExamples]; - var copier = GetCopier(NumberDataViewType.Single, NumberDataViewType.Double); + var copier = GetCopier(NumberDataViewType.Single, NumberDataViewType.Double); int iFeature = 0; if (CategoricalSplit && CategoricalFeatureIndices != null) { @@ -1508,7 +1508,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB iFeatureLocal <= CategoricalFeatureIndices[catRangeIndex + 1]; ++iFeatureLocal) { - Double[] bup = BinUpperBounds[iFeatureLocal]; + double[] bup = BinUpperBounds[iFeatureLocal]; if (bup.Length == 1) { // This is a trivial feature. Skip it. @@ -1516,7 +1516,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB } Contracts.Assert(Utils.Size(bup) > 0); - Double firstBin = bup[0]; + double firstBin = bup[0]; GetFeatureValues(catCursor, iFeatureLocal, catGetter, ref temp, ref doubleTemp, copier); bool add = false; var doubleTempValues = doubleTemp.GetValues(); @@ -1594,7 +1594,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB // Construct the labels. short[] ratings = new short[numExamples]; - Double[] actualLabels = new Double[numExamples]; + double[] actualLabels = new double[numExamples]; if (labelIdx >= 0) { @@ -1610,7 +1610,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB if (UsingMaxLabel && !(0 <= label && label <= MaxLabel)) throw Host.Except("Found invalid label {0}. Value should be between 0 and {1}, inclusive.", label, MaxLabel); ratings[ii] = (short)label; - actualLabels[ii] = (Double)label; + actualLabels[ii] = (double)label; } } @@ -1818,7 +1818,7 @@ private void MakeBoundariesAndCheckLabels(out long missingInstances, out long to { long featureValues = 0; // Warn at about 2 GB usage. - const long featureValuesWarnThreshold = (2L << 30) / sizeof(Double); + const long featureValuesWarnThreshold = (2L << 30) / sizeof(double); bool featureValuesWarned = false; const string featureValuesWarning = "We seem to be processing a lot of data. Consider using the FastTree diskTranspose+ (or dt+) option, for slower but more memory efficient transposition."; const int queryChunkSize = 100; @@ -1827,7 +1827,7 @@ private void MakeBoundariesAndCheckLabels(out long missingInstances, out long to ch.Info("Changing data from row-wise to column-wise"); long pos = 0; - double rowCountDbl = (double?)_data.Data.GetRowCount() ?? Double.NaN; + double rowCountDbl = (double?)_data.Data.GetRowCount() ?? 
double.NaN; pch.SetHeader(new ProgressHeader("examples"), e => e.SetProgress(0, pos, rowCountDbl)); // REVIEW: Should we ignore rows with bad label, weight, or group? The previous code seemed to let @@ -2122,7 +2122,7 @@ private IEnumerable CreateFlocksCore(IChannel ch, IProgressCha iFeatureLocal <= CategoricalFeatureIndices[catRangeIndex + 1]; ++iFeatureLocal) { - Double[] bup = BinUpperBounds[iFeatureLocal]; + double[] bup = BinUpperBounds[iFeatureLocal]; if (bup.Length == 1) { // This is a trivial feature. Skip it. @@ -2130,7 +2130,7 @@ private IEnumerable CreateFlocksCore(IChannel ch, IProgressCha } Contracts.Assert(Utils.Size(bup) > 0); - Double firstBin = bup[0]; + double firstBin = bup[0]; using (IEnumerator hotEnumerator = _instanceList[iFeatureLocal].AllIndicesGT(NumExamples, firstBin).GetEnumerator()) { while (hotEnumerator.MoveNext()) @@ -2240,7 +2240,7 @@ private IEnumerable CreateFlocksCore(IChannel ch, IProgressCha for (; iFeature < featureLim; ++iFeature) { - Double[] bup = BinUpperBounds[iFeature]; + double[] bup = BinUpperBounds[iFeature]; Contracts.Assert(Utils.Size(bup) > 0); if (bup.Length == 1) { @@ -2263,7 +2263,7 @@ private IEnumerable CreateFlocksCore(IChannel ch, IProgressCha yield return createFlock(); } countBins += bup.Length - 1; - Double firstBin = bup[0]; + double firstBin = bup[0]; int localHotRows = 0; // The number of bits we would use if we incorporated the current feature in to the // existing running flock. @@ -2378,24 +2378,23 @@ private Dataset.DatasetSkeleton CreateDatasetSkeleton() } } - // REVIEW: Change this, as well as the bin finding code and bin upper bounds, to be float instead of Double. - + // REVIEW: Change this, as well as the bin finding code and bin upper bounds, to be float instead of double. /// /// A mutable list of index,value that may be kept sparse or dense. /// private sealed class ValuesList { private bool _isSparse; - private List _dense; + private List _dense; private int _nonZeroElements; // when dense, is the number of non-zero elements (for determining when to sparsify) - private List> _sparse; + private List> _sparse; public ValuesList() { - _dense = new List(); + _dense = new List(); } - public void Add(int index, Double value) + public void Add(int index, double value) { if (!_isSparse) { @@ -2406,7 +2405,7 @@ public void Add(int index, Double value) { // Add zeros if needed. while (_dense.Count < index) - _dense.Add(default(Double)); + _dense.Add(default(double)); // Add the value. _dense.Add(value); if (value != 0) @@ -2417,7 +2416,7 @@ public void Add(int index, Double value) // Note this also may happen because we just sparsified. 
Contracts.Assert(_isSparse); if (value != 0) - _sparse.Add(new KeyValuePair(index, value)); + _sparse.Add(new KeyValuePair(index, value)); } private bool ShouldSparsify(int nonZeroElements, int totalElements) @@ -2428,11 +2427,11 @@ private bool ShouldSparsify(int nonZeroElements, int totalElements) private void Sparsify() { - _sparse = new List>(_nonZeroElements); + _sparse = new List>(_nonZeroElements); for (int i = 0; i < _dense.Count; i++) { if (_dense[i] != 0) - _sparse.Add(new KeyValuePair(i, _dense[i])); + _sparse.Add(new KeyValuePair(i, _dense[i])); } _isSparse = true; _dense = null; @@ -2446,7 +2445,7 @@ private void Sparsify() /// comparison is made /// The count of all indices in the range of 0 to /// exclusive whose values are greater than - public int CountIndicesGT(int length, Double gtValue) + public int CountIndicesGT(int length, double gtValue) { Contracts.Assert(0 <= length); if (_isSparse) @@ -2470,7 +2469,7 @@ public int CountIndicesGT(int length, Double gtValue) /// All indices in the range of 0 to exclusive /// whose values are greater than , in /// increasing order - public IEnumerable AllIndicesGT(int lim, Double gtValue) + public IEnumerable AllIndicesGT(int lim, double gtValue) { Contracts.Assert(0 <= lim); if (_isSparse) @@ -2520,7 +2519,7 @@ public IEnumerable AllIndicesGT(int lim, Double gtValue) } } - public void CopyTo(int length, ref VBuffer dst) + public void CopyTo(int length, ref VBuffer dst) { Contracts.Assert(0 <= length); VBufferEditor editor; @@ -2653,7 +2652,7 @@ public sealed class ForwardIndexer /// it is OK to access [1, 5], then [0, 5], but once this is done you cannot /// access the same feature at the same position. /// - public Double this[int featureIndex, int rowIndex] + public double this[int featureIndex, int rowIndex] { get { @@ -2821,10 +2820,12 @@ public abstract class TreeEnsembleModelParameters : private protected abstract uint VerCategoricalSplitSerialized { get; } + [BestFriend] internal readonly DataViewType InputType; DataViewType IValueMapper.InputType => InputType; - protected readonly DataViewType OutputType; + [BestFriend] + internal readonly DataViewType OutputType; DataViewType IValueMapper.OutputType => OutputType; bool ICanSavePfa.CanSavePfa => true; @@ -3142,26 +3143,26 @@ void ICanSaveSummary.SaveSummary(TextWriter writer, RoleMappedSchema schema) foreach (var pair in ((ICanGetSummaryInKeyValuePairs)this).GetSummaryInKeyValuePairs(schema)) { - Host.Assert(pair.Value is Double); - writer.WriteLine("\t{0}\t{1}", pair.Key, (Double)pair.Value); + Host.Assert(pair.Value is double); + writer.WriteLine("\t{0}\t{1}", pair.Key, (double)pair.Value); } } - private IEnumerable> GetSortedFeatureGains(RoleMappedSchema schema) + private IEnumerable> GetSortedFeatureGains(RoleMappedSchema schema) { var gainMap = new FeatureToGainMap(TrainedEnsemble.Trees.ToList(), normalize: true); var names = default(VBuffer>); AnnotationUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, NumFeatures, ref names); var ordered = gainMap.OrderByDescending(pair => pair.Value); - Double max = ordered.FirstOrDefault().Value; - Double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max)); + double max = ordered.FirstOrDefault().Value; + double normFactor = max == 0 ? 
1.0 : (1.0 / Math.Sqrt(max)); foreach (var pair in ordered) { var name = names.GetItemOrDefault(pair.Key).ToString(); if (string.IsNullOrEmpty(name)) name = $"f{pair.Key}"; - yield return new KeyValuePair<string, Double>(name, Math.Sqrt(pair.Value) * normFactor); + yield return new KeyValuePair<string, double>(name, Math.Sqrt(pair.Value) * normFactor); } } @@ -3230,6 +3231,11 @@ private void ToCSharp(InternalRegressionTree tree, TextWriter writer, int node, } } + /// <summary> + /// Copy the weights of all training features to <paramref name="weights"/>. + /// </summary> + /// <param name="weights">A <see cref="VBuffer{T}"/> where feature weights would be assigned to. + /// The i-th element in <paramref name="weights"/> stores the weight of the i-th feature.</param> public void GetFeatureWeights(ref VBuffer<float> weights) { var numFeatures = Math.Max(NumFeatures, MaxSplitFeatIdx + 1); @@ -3242,8 +3248,8 @@ public void GetFeatureWeights(ref VBuffer<float> weights) return; } - Double max = gainMap.Values.Max(); - Double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max)); + double max = gainMap.Values.Max(); + double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max)); var bldr = new BufferBuilder<float>(R4Adder.Instance); bldr.Reset(numFeatures, false); foreach (var pair in gainMap) @@ -3256,7 +3262,8 @@ ITree[] ITreeEnsemble.GetTrees() { return TrainedEnsemble.Trees.Select(k => new Tree(k)).ToArray(); } - public float GetLeafValue(int treeId, int leafId) + [BestFriend] + internal float GetLeafValue(int treeId, int leafId) { return (float)TrainedEnsemble.GetTreeAt(treeId).LeafValue(leafId); } @@ -3266,7 +3273,8 @@ public float GetLeafValue(int treeId, int leafId) /// internal nodes in the path from the root to that leaf. If 'path' is null a new list is initialized. All elements /// in 'path' are cleared before filling in the current path nodes. /// - public int GetLeaf(int treeId, in VBuffer<float> features, ref List<int> path) + [BestFriend] + internal int GetLeaf(int treeId, in VBuffer<float> features, ref List<int> path) { return TrainedEnsemble.GetTreeAt(treeId).GetLeaf(in features, ref path); } @@ -3286,10 +3294,7 @@ DataViewRow ICanGetSummaryAsIRow.GetSummaryIRowOrNull(RoleMappedSchema schema) return AnnotationUtils.AnnotationsAsRow(builder.ToAnnotations()); } - DataViewRow ICanGetSummaryAsIRow.GetStatsIRowOrNull(RoleMappedSchema schema) - { - return null; - } + DataViewRow ICanGetSummaryAsIRow.GetStatsIRowOrNull(RoleMappedSchema schema) => null; private sealed class Tree : ITree<VBuffer<float>> { @@ -3431,6 +3436,7 @@ private protected TreeEnsembleModelParametersBasedOnQuantileRegressionTree(IHost TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure(); } + [BestFriend] private protected TreeEnsembleModelParametersBasedOnQuantileRegressionTree(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver) : base(env, name, ctx, ver) { diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs index be8ca0f0cc..6d2f2efded 100644 --- a/src/Microsoft.ML.FastTree/FastTreeArguments.cs +++ b/src/Microsoft.ML.FastTree/FastTreeArguments.cs @@ -20,12 +20,43 @@ internal interface IFastTreeTrainerFactory : IComponentFactory<ITrainer> { } + /// + /// Stopping measurements for classification and regression. + /// + public enum EarlyStoppingMetric + { + /// + /// L1-norm of gradient. + /// + L1Norm = 1, + /// + /// L2-norm of gradient. + /// + L2Norm = 2 + }; + + /// + /// Stopping measurements for ranking.
+ /// + public enum EarlyStoppingRankingMetric + { + /// + /// NDCG@1 + /// + NdcgAt1 = 1, + /// + /// NDCG@3 + /// + NdcgAt3 = 3 + } + /// public sealed partial class FastTreeBinaryClassificationTrainer { [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)] public sealed class Options : BoostedTreeOptions, IFastTreeTrainerFactory { + /// /// Option for using derivatives optimized for unbalanced sets. /// @@ -33,6 +64,37 @@ public sealed class Options : BoostedTreeOptions, IFastTreeTrainerFactory [TGUI(Label = "Optimize for unbalanced")] public bool UnbalancedSets = false; + /// + /// Internal state of <see cref="EarlyStoppingMetric"/>. It should always be kept in sync with + /// <see cref="EarlyStoppingMetrics"/>. + /// + // Disable 649 because Visual Studio can't detect its assignment via property. + #pragma warning disable 649 + private EarlyStoppingMetric _earlyStoppingMetric; + #pragma warning restore 649 + + /// + /// Early stopping metrics. + /// + public EarlyStoppingMetric EarlyStoppingMetric + { + get { return _earlyStoppingMetric; } + + set + { + // Update the state of the user-facing stopping metric. + _earlyStoppingMetric = value; + // Set up internal property according to its public value. + EarlyStoppingMetrics = (int)_earlyStoppingMetric; + } + } + + public Options() + { + // Use L1 by default. + EarlyStoppingMetric = EarlyStoppingMetric.L1Norm; + } + ITrainer IComponentFactory<ITrainer>.CreateComponent(IHostEnvironment env) => new FastTreeBinaryClassificationTrainer(env, this); } } @@ -42,9 +104,31 @@ public sealed partial class FastTreeRegressionTrainer [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)] public sealed class Options : BoostedTreeOptions, IFastTreeTrainerFactory { + /// + /// Internal state of <see cref="EarlyStoppingMetric"/>. It should always be kept in sync with + /// <see cref="EarlyStoppingMetrics"/>. + /// + private EarlyStoppingMetric _earlyStoppingMetric; + + /// + /// Early stopping metrics. + /// + public EarlyStoppingMetric EarlyStoppingMetric + { + get { return _earlyStoppingMetric; } + + set + { + // Update the state of the user-facing stopping metric. + _earlyStoppingMetric = value; + // Set up internal property according to its public value. + EarlyStoppingMetrics = (int)_earlyStoppingMetric; + } + } + public Options() { - EarlyStoppingMetrics = 1; // Use L1 by default. + EarlyStoppingMetric = EarlyStoppingMetric.L1Norm; // Use L1 by default. } ITrainer IComponentFactory<ITrainer>.CreateComponent(IHostEnvironment env) => new FastTreeRegressionTrainer(env, this); @@ -64,6 +148,36 @@ public sealed class Options : BoostedTreeOptions, IFastTreeTrainerFactory "and intermediate values are compound Poisson loss.")] public Double Index = 1.5; + /// + /// Internal state of <see cref="EarlyStoppingMetric"/>. It should always be kept in sync with + /// <see cref="EarlyStoppingMetrics"/>. + /// + // Disable 649 because Visual Studio can't detect its assignment via property. + #pragma warning disable 649 + private EarlyStoppingMetric _earlyStoppingMetric; + #pragma warning restore 649 + + /// + /// Early stopping metrics. + /// + public EarlyStoppingMetric EarlyStoppingMetric + { + get { return _earlyStoppingMetric; } + + set + { + // Update the state of the user-facing stopping metric. + _earlyStoppingMetric = value; + // Set up internal property according to its public value. + EarlyStoppingMetrics = (int)_earlyStoppingMetric; + } + } + + public Options() + { + EarlyStoppingMetric = EarlyStoppingMetric.L1Norm; // Use L1 by default.
+ } + ITrainer IComponentFactory<ITrainer>.CreateComponent(IHostEnvironment env) => new FastTreeTweedieTrainer(env, this); } } @@ -75,42 +189,72 @@ public sealed class Options : BoostedTreeOptions, IFastTreeTrainerFactory { [Argument(ArgumentType.LastOccurenceWins, HelpText = "Comma-separated list of gains associated with each relevance label.", ShortName = "gains")] [TGUI(NoSweep = true)] - public string CustomGains = "0,3,7,15,31"; + public double[] CustomGains = new double[] { 0, 3, 7, 15, 31 }; [Argument(ArgumentType.LastOccurenceWins, HelpText = "Train DCG instead of NDCG", ShortName = "dcg")] - public bool TrainDcg; + public bool UseDcg; // REVIEW: Hiding sorting for now. Should be an enum or component factory. + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", ShortName = "sort", Hide = true)] [TGUI(NotGui = true)] - public string SortingAlgorithm = "DescendingStablePessimistic"; + internal string SortingAlgorithm = "DescendingStablePessimistic"; [Argument(ArgumentType.AtMostOnce, HelpText = "max-NDCG truncation to use in the Lambda Mart algorithm", ShortName = "n", Hide = true)] [TGUI(NotGui = true)] - public int LambdaMartMaxTruncation = 100; + public int NdcgTruncationLevel = 100; + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Use shifted NDCG", Hide = true)] [TGUI(NotGui = true)] - public bool ShiftedNdcg; + internal bool ShiftedNdcg; + [BestFriend] [Argument(ArgumentType.AtMostOnce, HelpText = "Cost function parameter (w/c)", ShortName = "cf", Hide = true)] [TGUI(NotGui = true)] - public char CostFunctionParam = 'w'; + internal char CostFunctionParam = 'w'; + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Distance weight 2 adjustment to cost", ShortName = "dw", Hide = true)] [TGUI(NotGui = true)] - public bool DistanceWeight2; + internal bool DistanceWeight2; + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Normalize query lambdas", ShortName = "nql", Hide = true)] [TGUI(NotGui = true)] - public bool NormalizeQueryLambdas; + internal bool NormalizeQueryLambdas; + + /// + /// Internal state of <see cref="EarlyStoppingMetric"/>. It should always be kept in sync with + /// <see cref="EarlyStoppingMetrics"/>. + /// + // Disable 649 because Visual Studio can't detect its assignment via property. + #pragma warning disable 649 + private EarlyStoppingRankingMetric _earlyStoppingMetric; + #pragma warning restore 649 + + /// + /// Early stopping metrics. + /// + public EarlyStoppingRankingMetric EarlyStoppingMetric + { + get { return _earlyStoppingMetric; } + + set + { + // Update the state of the user-facing stopping metric. + _earlyStoppingMetric = value; + // Set up internal property according to its public value. + EarlyStoppingMetrics = (int)_earlyStoppingMetric; + } + } public Options() { - EarlyStoppingMetrics = 1; + EarlyStoppingMetric = EarlyStoppingRankingMetric.NdcgAt1; // Use NDCG@1 by default.
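// Usage sketch (illustrative only; the surrounding setup is assumed and not part of this diff): a caller can now write
//     var options = new FastTreeRankingTrainer.Options { EarlyStoppingMetric = EarlyStoppingRankingMetric.NdcgAt3 };
// and the property setter above keeps the internal EarlyStoppingMetrics value (here, 3) in sync.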
} ITrainer IComponentFactory<ITrainer>.CreateComponent(IHostEnvironment env) => new FastTreeRankingTrainer(env, this); @@ -129,12 +273,12 @@ internal override void Check(IExceptionContext ectx) #if OLD_DATALOAD ectx.CheckUserArg(0 <= secondaryMetricShare && secondaryMetricShare <= 1, "secondaryMetricShare", "secondaryMetricShare must be between 0 and 1."); #endif - ectx.CheckUserArg(0 < LambdaMartMaxTruncation, nameof(LambdaMartMaxTruncation), "lambdaMartMaxTruncation must be positive."); + ectx.CheckUserArg(0 < NdcgTruncationLevel, nameof(NdcgTruncationLevel), "Must be positive."); } } } - public enum Bundle : Byte + public enum Bundle : byte { None = 0, AggregateLowPopulation = 1, @@ -144,10 +288,10 @@ public enum Bundle : Byte [BestFriend] internal static class Defaults { - public const int NumTrees = 100; - public const int NumLeaves = 20; - public const int MinDocumentsInLeaves = 10; - public const double LearningRates = 0.2; + public const int NumberOfTrees = 100; + public const int NumberOfLeaves = 20; + public const int MinimumExampleCountPerLeaf = 10; + public const double LearningRate = 0.2; } public abstract class TreeOptions : TrainerInputBaseWithGroupId @@ -162,10 +306,10 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId /// The number of threads to use. /// [Argument(ArgumentType.LastOccurenceWins, HelpText = "The number of threads to use", ShortName = "t", NullName = "")] - public int? NumThreads = null; + public int? NumberOfThreads = null; // this random seed is used for: - // 1. doc sampling for feature binning + // 1. example sampling for feature binning // 2. init Randomize Score // 3. grad Sampling Rate in Objective Function // 4. tree learner @@ -175,7 +319,7 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId /// The seed of the random number generator. /// [Argument(ArgumentType.LastOccurenceWins, HelpText = "The seed of the random number generator", ShortName = "r1")] - public int RngSeed = 123; + public int Seed = 123; // this random seed is only for active feature selection /// @@ -183,7 +327,7 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId /// [Argument(ArgumentType.LastOccurenceWins, HelpText = "The seed of the active feature selection", ShortName = "r3", Hide = true)] [TGUI(NotGui = true)] - public int FeatureSelectSeed = 123; + public int FeatureSelectionSeed = 123; /// /// The entropy (regularization) coefficient between 0 and 1. @@ -222,25 +366,25 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId [Argument(ArgumentType.LastOccurenceWins, HelpText = "Maximum categorical split groups to consider when splitting on a categorical feature. " + "Split groups are a collection of split points. This is used to reduce overfitting when " + "there are many categorical features.", ShortName = "mcg")] - public int MaxCategoricalGroupsPerNode = 64; + public int MaximumCategoricalGroupCountPerNode = 64; /// /// Maximum categorical split points to consider when splitting on a categorical feature. /// [Argument(ArgumentType.LastOccurenceWins, HelpText = "Maximum categorical split points to consider when splitting on a categorical feature.", ShortName = "maxcat")] - public int MaxCategoricalSplitPoints = 64; + public int MaximumCategoricalSplitPointCount = 64; /// - /// Minimum categorical docs percentage in a bin to consider for a split. + /// Minimum categorical example percentage in a bin to consider for a split. Default is 0.1% of all training examples.
/// - [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum categorical docs percentage in a bin to consider for a split.", ShortName = "mdop")] - public double MinDocsPercentageForCategoricalSplit = 0.001; + [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum categorical example percentage in a bin to consider for a split.", ShortName = "mdop")] + public double MinimumExampleFractionForCategoricalSplit = 0.001; /// - /// Minimum categorical doc count in a bin to consider for a split. + /// Minimum categorical example count in a bin to consider for a split. /// - [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum categorical doc count in a bin to consider for a split.", ShortName = "mdo")] - public int MinDocsForCategoricalSplit = 100; + [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum categorical example count in a bin to consider for a split.", ShortName = "mdo")] + public int MinimumExamplesForCategoricalSplit = 100; /// /// Bias for calculating gradient for each feature bin for a categorical feature. @@ -263,7 +407,7 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId /// Maximum number of distinct values (bins) per feature. /// [Argument(ArgumentType.LastOccurenceWins, HelpText = "Maximum number of distinct values (bins) per feature", ShortName = "mb")] - public int MaxBins = 255; // save one for undefs + public int MaximumBinCountPerFeature = 255; // save one for undefs /// /// Sparsity level needed to use sparse feature representation. @@ -298,10 +442,10 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId public Double SoftmaxTemperature; /// - /// Print execution time breakdown to stdout. + /// Print execution time breakdown to ML.NET channel. /// [Argument(ArgumentType.AtMostOnce, HelpText = "Print execution time breakdown to stdout", ShortName = "et")] - public bool ExecutionTimes; + public bool ExecutionTime; // REVIEW: Different from original FastRank arguments (shortname l vs. nl). Different default from TLC FR Wrapper (20 vs. 20). /// @@ -310,17 +454,17 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId [Argument(ArgumentType.LastOccurenceWins, HelpText = "The max number of leaves in each regression tree", ShortName = "nl", SortOrder = 2)] [TGUI(Description = "The maximum number of leaves per tree", SuggestedSweeps = "2-128;log;inc:4")] [TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, isLogScale: true, stepSize: 4)] - public int NumLeaves = Defaults.NumLeaves; + public int NumberOfLeaves = Defaults.NumberOfLeaves; /// - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. + /// The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data. /// // REVIEW: Arrays not supported in GUI // REVIEW: Different shortname than FastRank module. Same as the TLC FRWrapper. 
- [Argument(ArgumentType.LastOccurenceWins, HelpText = "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", ShortName = "mil", SortOrder = 3)] + [Argument(ArgumentType.LastOccurenceWins, HelpText = "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", ShortName = "mil", SortOrder = 3)] [TGUI(Description = "Minimum number of training instances required to form a leaf", SuggestedSweeps = "1,10,50")] [TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[] { 1, 10, 50 })] - public int MinDocumentsInLeafs = Defaults.MinDocumentsInLeaves; + public int MinimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf; /// /// Total number of decision trees to create in the ensemble. @@ -329,10 +473,10 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId [Argument(ArgumentType.LastOccurenceWins, HelpText = "Total number of decision trees to create in the ensemble", ShortName = "iter", SortOrder = 1)] [TGUI(Description = "Total number of trees constructed", SuggestedSweeps = "20,100,500")] [TlcModule.SweepableDiscreteParamAttribute("NumTrees", new object[] { 20, 100, 500 })] - public int NumTrees = Defaults.NumTrees; + public int NumberOfTrees = Defaults.NumberOfTrees; /// - /// The fraction of features (chosen randomly) to use on each iteration. + /// The fraction of features (chosen randomly) to use on each iteration. Use 0.9 if only 90% of the features are needed. /// [Argument(ArgumentType.AtMostOnce, HelpText = "The fraction of features (chosen randomly) to use on each iteration", ShortName = "ff")] public Double FeatureFraction = 1; @@ -344,23 +488,23 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId public int BaggingSize; /// - /// Percentage of training examples used in each bag. + /// Percentage of training examples used in each bag. Default is 0.7 (70%). /// [Argument(ArgumentType.AtMostOnce, HelpText = "Percentage of training examples used in each bag", ShortName = "bagfrac")] // REVIEW: sweeping bagfrac doesn't make sense unless 'baggingSize' is non-zero. The 'SuggestedSweeps' here // are used to denote 'sensible range', but the GUI will interpret this as 'you must sweep these values'. So, I'm keeping // the values there for the future, when we have an appropriate way to encode this information. // [TGUI(SuggestedSweeps = "0.5,0.7,0.9")] - public Double BaggingTrainFraction = 0.7; + public Double BaggingExampleFraction = 0.7; /// - /// The fraction of features (chosen randomly) to use on each split. + /// The fraction of features (chosen randomly) to use on each split. If its value is 0.9, 90% of all features would be used in expectation at each split. /// [Argument(ArgumentType.AtMostOnce, HelpText = "The fraction of features (chosen randomly) to use on each split", ShortName = "sf")] - public Double SplitFraction = 1; + public Double FeatureFractionPerSplit = 1; /// - /// Smoothing paramter for tree regularization. + /// Smoothing parameter for tree regularization. /// [Argument(ArgumentType.AtMostOnce, HelpText = "Smoothing parameter for tree regularization", ShortName = "s")] public Double Smoothing; @@ -375,9 +519,10 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId /// /// The level of feature compression to use.
/// + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "The level of feature compression to use", ShortName = "fcomp", Hide = true)] [TGUI(NotGui = true)] - public int FeatureCompressionLevel = 1; + internal int FeatureCompressionLevel = 1; /// /// Compress the tree Ensemble. @@ -386,28 +531,22 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId [TGUI(NotGui = true)] public bool CompressEnsemble; - /// - /// Maximum Number of trees after compression. - /// - // REVIEW: Not used. - [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum Number of trees after compression", ShortName = "cmpmax", Hide = true)] - [TGUI(NotGui = true)] - public int MaxTreesAfterCompression = -1; - /// /// Print metrics graph for the first test set. /// + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Print metrics graph for the first test set", ShortName = "graph", Hide = true)] [TGUI(NotGui = true)] - public bool PrintTestGraph; + internal bool PrintTestGraph; /// /// Print Train and Validation metrics in graph. /// //It is only enabled if printTestGraph is also set + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Print Train and Validation metrics in graph", ShortName = "graphtv", Hide = true)] [TGUI(NotGui = true)] - public bool PrintTrainValidGraph; + internal bool PrintTrainValidGraph; /// /// Calculate metric values for train/valid/test every k rounds. @@ -418,27 +557,27 @@ public abstract class TreeOptions : TrainerInputBaseWithGroupId internal virtual void Check(IExceptionContext ectx) { Contracts.AssertValue(ectx); - ectx.CheckUserArg(NumThreads == null || NumThreads > 0, nameof(NumThreads), "numThreads must be positive."); - ectx.CheckUserArg(NumLeaves >= 2, nameof(NumLeaves), "numLeaves must be at least 2."); - ectx.CheckUserArg(0 <= EntropyCoefficient && EntropyCoefficient <= 1, nameof(EntropyCoefficient), "entropyCoefficient must be between 0 and 1."); - ectx.CheckUserArg(0 <= GainConfidenceLevel && GainConfidenceLevel < 1, nameof(GainConfidenceLevel), "gainConfidenceLevel must be in [0, 1)."); - ectx.CheckUserArg(0 <= FeatureFraction && FeatureFraction <= 1, nameof(FeatureFraction), "featureFraction must be between 0 and 1."); - ectx.CheckUserArg(0 <= SplitFraction && SplitFraction <= 1, nameof(SplitFraction), "splitFraction must be between 0 and 1."); - ectx.CheckUserArg(0 <= SoftmaxTemperature, nameof(SoftmaxTemperature), "softmaxTemperature must be non-negative."); - ectx.CheckUserArg(0 < MaxBins, nameof(MaxBins), "maxBins must greater than 0."); - ectx.CheckUserArg(0 <= SparsifyThreshold && SparsifyThreshold <= 1, nameof(SparsifyThreshold), "specifyThreshold must be between 0 and 1."); - ectx.CheckUserArg(0 < NumTrees, nameof(NumTrees), "Number of trees must be positive."); - ectx.CheckUserArg(0 <= Smoothing && Smoothing <= 1, nameof(Smoothing), "smoothing must be between 0 and 1."); - ectx.CheckUserArg(0 <= BaggingSize, nameof(BaggingSize), "baggingSize must be non-negative."); - ectx.CheckUserArg(0 <= BaggingTrainFraction && BaggingTrainFraction <= 1, nameof(BaggingTrainFraction), "baggingTrainFraction must be between 0 and 1."); - ectx.CheckUserArg(0 <= FeatureFirstUsePenalty, nameof(FeatureFirstUsePenalty), "featureFirstUsePenalty must be non-negative."); - ectx.CheckUserArg(0 <= FeatureReusePenalty, nameof(FeatureReusePenalty), "featureReusePenalty must be non-negative."); - ectx.CheckUserArg(0 <= MaxCategoricalGroupsPerNode, nameof(MaxCategoricalGroupsPerNode), "maxCategoricalGroupsPerNode must be non-negative."); - 
ectx.CheckUserArg(0 <= MaxCategoricalSplitPoints, nameof(MaxCategoricalSplitPoints), "maxCategoricalSplitPoints must be non-negative."); - ectx.CheckUserArg(0 <= MinDocsPercentageForCategoricalSplit, nameof(MinDocsPercentageForCategoricalSplit), "minDocsPercentageForCategoricalSplit must be non-negative."); - ectx.CheckUserArg(0 <= MinDocsForCategoricalSplit, nameof(MinDocsForCategoricalSplit), "minDocsForCategoricalSplit must be non-negative."); - ectx.CheckUserArg(Bundle.None <= Bundling && Bundling <= Bundle.Adjacent, nameof(Bundling), "bundling must be between 0 and 2."); - ectx.CheckUserArg(Bias >= 0, nameof(Bias), "Bias must be greater than equal to zero."); + ectx.CheckUserArg(NumberOfThreads == null || NumberOfThreads > 0, nameof(NumberOfThreads), "Must be positive."); + ectx.CheckUserArg(NumberOfLeaves >= 2, nameof(NumberOfLeaves), "Must be at least 2."); + ectx.CheckUserArg(0 <= EntropyCoefficient && EntropyCoefficient <= 1, nameof(EntropyCoefficient), "Must be between 0 and 1."); + ectx.CheckUserArg(0 <= GainConfidenceLevel && GainConfidenceLevel < 1, nameof(GainConfidenceLevel), "Must be in [0, 1)."); + ectx.CheckUserArg(0 <= FeatureFraction && FeatureFraction <= 1, nameof(FeatureFraction), "Must be between 0 and 1."); + ectx.CheckUserArg(0 <= FeatureFractionPerSplit && FeatureFractionPerSplit <= 1, nameof(FeatureFractionPerSplit), "Must be between 0 and 1."); + ectx.CheckUserArg(0 <= SoftmaxTemperature, nameof(SoftmaxTemperature), "Must be non-negative."); + ectx.CheckUserArg(0 < MaximumBinCountPerFeature, nameof(MaximumBinCountPerFeature), "Must be greater than 0."); + ectx.CheckUserArg(0 <= SparsifyThreshold && SparsifyThreshold <= 1, nameof(SparsifyThreshold), "Must be between 0 and 1."); + ectx.CheckUserArg(0 < NumberOfTrees, nameof(NumberOfTrees), "Must be positive."); + ectx.CheckUserArg(0 <= Smoothing && Smoothing <= 1, nameof(Smoothing), "Must be between 0 and 1."); + ectx.CheckUserArg(0 <= BaggingSize, nameof(BaggingSize), "Must be non-negative."); + ectx.CheckUserArg(0 <= BaggingExampleFraction && BaggingExampleFraction <= 1, nameof(BaggingExampleFraction), "Must be between 0 and 1."); + ectx.CheckUserArg(0 <= FeatureFirstUsePenalty, nameof(FeatureFirstUsePenalty), "Must be non-negative."); + ectx.CheckUserArg(0 <= FeatureReusePenalty, nameof(FeatureReusePenalty), "Must be non-negative."); + ectx.CheckUserArg(0 <= MaximumCategoricalGroupCountPerNode, nameof(MaximumCategoricalGroupCountPerNode), "Must be non-negative."); + ectx.CheckUserArg(0 <= MaximumCategoricalSplitPointCount, nameof(MaximumCategoricalSplitPointCount), "Must be non-negative."); + ectx.CheckUserArg(0 <= MinimumExampleFractionForCategoricalSplit, nameof(MinimumExampleFractionForCategoricalSplit), "Must be non-negative."); + ectx.CheckUserArg(0 <= MinimumExamplesForCategoricalSplit, nameof(MinimumExamplesForCategoricalSplit), "Must be non-negative."); + ectx.CheckUserArg(Bundle.None <= Bundling && Bundling <= Bundle.Adjacent, nameof(Bundling), "Must be between 0 and 2."); + ectx.CheckUserArg(Bias >= 0, nameof(Bias), "Must be greater than or equal to zero."); } } @@ -463,13 +602,13 @@ public abstract class BoostedTreeOptions : TreeOptions /// Number of post-bracket line search steps. /// [Argument(ArgumentType.LastOccurenceWins, HelpText = "Number of post-bracket line search steps", ShortName = "lssteps")] - public int NumPostBracketSteps; + public int MaximumNumberOfLineSearchSteps; /// /// Minimum line search step size.
/// [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum line search step size", ShortName = "minstep")] - public Double MinStepSize; + public Double MinimumStepSize; public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDescent, ConjugateGradientDescent }; @@ -489,9 +628,10 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc /// /// Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3). /// + [BestFriend] [Argument(ArgumentType.AtMostOnce, HelpText = "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", ShortName = "esmt")] [TGUI(Description = "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)")] - public int EarlyStoppingMetrics; + internal int EarlyStoppingMetrics; /// /// Enable post-training pruning to avoid overfitting. (a validation set is required). @@ -510,7 +650,7 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc /// [Argument(ArgumentType.AtMostOnce, HelpText = "The tolerance threshold for pruning", ShortName = "prth")] [TGUI(Description = "Pruning threshold")] - public Double PruningThreshold = 0.004; + public double PruningThreshold = 0.004; /// /// The moving window size for pruning. @@ -525,7 +665,7 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc [Argument(ArgumentType.LastOccurenceWins, HelpText = "The learning rate", ShortName = "lr", SortOrder = 4)] [TGUI(Label = "Learning Rate", SuggestedSweeps = "0.025-0.4;log")] [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale: true)] - public Double LearningRates = Defaults.LearningRates; + public double LearningRate = Defaults.LearningRate; /// /// Shrinkage. @@ -559,7 +699,7 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc /// Upper bound on absolute value of single tree output. /// [Argument(ArgumentType.AtMostOnce, HelpText = "Upper bound on absolute value of single tree output", ShortName = "mo")] - public Double MaxTreeOutput = 100; + public Double MaximumTreeOutput = 100; /// /// Training starts from random ordering (determined by /r1). @@ -593,24 +733,27 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc /// /// Freeform defining the scores that should be used as the baseline ranker. /// + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Freeform defining the scores that should be used as the baseline ranker", ShortName = "basescores", Hide = true)] [TGUI(NotGui = true)] - public string BaselineScoresFormula; + internal string BaselineScoresFormula; /// /// Baseline alpha for tradeoffs of risk (0 is normal training). /// + [BestFriend] [Argument(ArgumentType.LastOccurenceWins, HelpText = "Baseline alpha for tradeoffs of risk (0 is normal training)", ShortName = "basealpha", Hide = true)] [TGUI(NotGui = true)] - public string BaselineAlphaRisk; + internal string BaselineAlphaRisk; /// - /// The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position). + /// The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). 
/// - [Argument(ArgumentType.LastOccurenceWins, HelpText = "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + [BestFriend] + [Argument(ArgumentType.LastOccurenceWins, HelpText = "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", ShortName = "pdff", Hide = true)] [TGUI(NotGui = true)] - public string PositionDiscountFreeform; + internal string PositionDiscountFreeform; #if !NO_STORE [Argument(ArgumentType.LastOccurenceWins, HelpText = "Offload feature bins to a file store", ShortName = "fbsopt", Hide = true)] @@ -630,14 +773,14 @@ internal override void Check(IExceptionContext ectx) { base.Check(ectx); - ectx.CheckUserArg(0 <= MaxTreeOutput, nameof(MaxTreeOutput), "maxTreeOutput must be non-negative."); - ectx.CheckUserArg(0 <= PruningThreshold, nameof(PruningThreshold), "pruningThreshold must be non-negative."); - ectx.CheckUserArg(0 < PruningWindowSize, nameof(PruningWindowSize), "pruningWindowSize must be positive."); - ectx.CheckUserArg(0 < Shrinkage, nameof(Shrinkage), "shrinkage must be positive."); - ectx.CheckUserArg(0 <= DropoutRate && DropoutRate <= 1, nameof(DropoutRate), "dropoutRate must be between 0 and 1."); - ectx.CheckUserArg(0 < GetDerivativesSampleRate, nameof(GetDerivativesSampleRate), "getDerivativesSampleRate must be positive."); - ectx.CheckUserArg(0 <= NumPostBracketSteps, nameof(NumPostBracketSteps), "numPostBracketSteps must be non-negative."); - ectx.CheckUserArg(0 <= MinStepSize, nameof(MinStepSize), "minStepSize must be non-negative."); + ectx.CheckUserArg(0 <= MaximumTreeOutput, nameof(MaximumTreeOutput), "Must be non-negative."); + ectx.CheckUserArg(0 <= PruningThreshold, nameof(PruningThreshold), "Must be non-negative."); + ectx.CheckUserArg(0 < PruningWindowSize, nameof(PruningWindowSize), "Must be positive."); + ectx.CheckUserArg(0 < Shrinkage, nameof(Shrinkage), "Must be positive."); + ectx.CheckUserArg(0 <= DropoutRate && DropoutRate <= 1, nameof(DropoutRate), "Must be between 0 and 1."); + ectx.CheckUserArg(0 < GetDerivativesSampleRate, nameof(GetDerivativesSampleRate), "Must be positive."); + ectx.CheckUserArg(0 <= MaximumNumberOfLineSearchSteps, nameof(MaximumNumberOfLineSearchSteps), "Must be non-negative."); + ectx.CheckUserArg(0 <= MinimumStepSize, nameof(MinimumStepSize), "Must be non-negative."); } } }
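For illustration, a minimal sketch of setting the renamed fields validated in the check above; FastTreeRegressionTrainer.Options is assumed here as a concrete type deriving from BoostedTreeOptions, and the values are only placeholders within the validated ranges.

    // Hypothetical usage sketch; the field names follow the renames in this diff.
    var boostedOptions = new FastTreeRegressionTrainer.Options
    {
        LearningRate = 0.2,                 // was LearningRates
        MaximumTreeOutput = 100,            // was MaxTreeOutput
        MaximumNumberOfLineSearchSteps = 2, // was NumPostBracketSteps
        MinimumStepSize = 0,                // was MinStepSize
    };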
diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs index 51b406f50e..c7a54428f7 100644 --- a/src/Microsoft.ML.FastTree/FastTreeClassification.cs +++ b/src/Microsoft.ML.FastTree/FastTreeClassification.cs @@ -10,7 +10,6 @@ using Microsoft.ML.Calibrators; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -119,25 +118,25 @@ public sealed partial class FastTreeBinaryClassificationTrainer : /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the example weight. /// The learning rate. - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. - /// The max number of leaves in each regression tree. - /// Total number of decision trees to create in the ensemble. + /// The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data. + /// The max number of leaves in each regression tree. + /// Total number of decision trees to create in the ensemble. internal FastTreeBinaryClassificationTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) - : base(env, TrainerUtils.MakeBoolScalarLabel(labelColumn), featureColumn, weightColumn, null, numLeaves, numTrees, minDatapointsInLeaves, learningRate) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) + : base(env, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate) { // Set the sigmoid parameter to the 2 * learning rate, for traditional FastTreeClassification loss - _sigmoidParameter = 2.0 * FastTreeTrainerOptions.LearningRates; + _sigmoidParameter = 2.0 * FastTreeTrainerOptions.LearningRate; } /// @@ -149,7 +148,7 @@ internal FastTreeBinaryClassificationTrainer(IHostEnvironment env, Options optio : base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName)) { // Set the sigmoid parameter to the 2 * learning rate, for traditional FastTreeClassification loss - _sigmoidParameter = 2.0 * FastTreeTrainerOptions.LearningRates; + _sigmoidParameter = 2.0 * FastTreeTrainerOptions.LearningRate; } private protected override PredictionKind PredictionKind => PredictionKind.BinaryClassification; @@ -191,14 +190,14 @@ private protected override ObjectiveFunctionBase ConstructObjFunc(IChannel ch) return new ObjectiveImpl( TrainSet, _trainSetLabels, - FastTreeTrainerOptions.LearningRates, + FastTreeTrainerOptions.LearningRate, FastTreeTrainerOptions.Shrinkage, _sigmoidParameter, FastTreeTrainerOptions.UnbalancedSets, - FastTreeTrainerOptions.MaxTreeOutput, + FastTreeTrainerOptions.MaximumTreeOutput, FastTreeTrainerOptions.GetDerivativesSampleRate, FastTreeTrainerOptions.BestStepRankingRegressionTrees, - FastTreeTrainerOptions.RngSeed, + FastTreeTrainerOptions.Seed, ParallelTraining); } @@ -209,7 +208,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm( { var lossCalculator = new BinaryClassificationTest(optimizationAlgorithm.TrainingScores, _trainSetLabels, _sigmoidParameter); // REVIEW: we should make loss indices an enum in BinaryClassificationTest - optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(lossCalculator, FastTreeTrainerOptions.UnbalancedSets ? 3 /*Unbalanced sets loss*/ : 1 /*normal loss*/, FastTreeTrainerOptions.NumPostBracketSteps, FastTreeTrainerOptions.MinStepSize); + optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(lossCalculator, FastTreeTrainerOptions.UnbalancedSets ?
3 /*Unbalanced sets loss*/ : 1 /*normal loss*/, FastTreeTrainerOptions.MaximumNumberOfLineSearchSteps, FastTreeTrainerOptions.MinimumStepSize); } return optimizationAlgorithm; } diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs index 88e5c41485..6db1145e7c 100644 --- a/src/Microsoft.ML.FastTree/FastTreeRanking.cs +++ b/src/Microsoft.ML.FastTree/FastTreeRanking.cs @@ -12,7 +12,6 @@ using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -61,26 +60,26 @@ public sealed partial class FastTreeRankingTrainer /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the group ID. - /// The name for the column containing the initial weight. - /// The max number of leaves in each regression tree. - /// Total number of decision trees to create in the ensemble. - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the group ID. + /// The name for the column containing the example weight. + /// The max number of leaves in each regression tree. + /// Total number of decision trees to create in the ensemble. + /// The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data. /// The learning rate. internal FastTreeRankingTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string groupIdColumn = DefaultColumnNames.GroupId, - string weightColumn = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) - : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weightColumn, groupIdColumn, numLeaves, numTrees, minDatapointsInLeaves, learningRate) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string rowGroupColumnName = DefaultColumnNames.GroupId, + string exampleWeightColumnName = null, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) + : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, exampleWeightColumnName, rowGroupColumnName, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate) { - Host.CheckNonEmpty(groupIdColumn, nameof(groupIdColumn)); + Host.CheckNonEmpty(rowGroupColumnName, nameof(rowGroupColumnName)); } /// @@ -132,7 +131,7 @@ private Double[] GetLabelGains() try { Host.AssertValue(FastTreeTrainerOptions.CustomGains); - return FastTreeTrainerOptions.CustomGains.Split(',').Select(k => Convert.ToDouble(k.Trim())).ToArray(); + return FastTreeTrainerOptions.CustomGains; } catch (Exception ex) { @@ -144,26 +143,17 @@ private Double[] GetLabelGains() private protected override void CheckOptions(IChannel ch) { - if (!string.IsNullOrEmpty(FastTreeTrainerOptions.CustomGains)) + if (FastTreeTrainerOptions.CustomGains != null) { - var stringGain =
FastTreeTrainerOptions.CustomGains.Split(','); - if (stringGain.Length < 5) + var gains = FastTreeTrainerOptions.CustomGains; + if (gains.Length < 5) { throw ch.ExceptUserArg(nameof(FastTreeTrainerOptions.CustomGains), - "{0} an invalid number of gain levels. We require at least 5. Make certain they're comma separated.", - stringGain.Length); + "Has {0} gain levels. We require at least 5 gain levels.", + gains.Length); } - Double[] gain = new Double[stringGain.Length]; - for (int i = 0; i < stringGain.Length; ++i) - { - if (!Double.TryParse(stringGain[i], out gain[i])) - { - throw ch.ExceptUserArg(nameof(FastTreeTrainerOptions.CustomGains), - "Could not parse '{0}' as a floating point number", stringGain[0]); - } - } - DcgCalculator.LabelGainMap = gain; - Dataset.DatasetSkeleton.LabelGainMap = gain; + DcgCalculator.LabelGainMap = gains; + Dataset.DatasetSkeleton.LabelGainMap = gains; } ch.CheckUserArg((FastTreeTrainerOptions.EarlyStoppingRule == null && !FastTreeTrainerOptions.EnablePruning) || (FastTreeTrainerOptions.EarlyStoppingMetrics == 1 || FastTreeTrainerOptions.EarlyStoppingMetrics == 3), nameof(FastTreeTrainerOptions.EarlyStoppingMetrics), @@ -178,7 +168,7 @@ private protected override void Initialize(IChannel ch) if (FastTreeTrainerOptions.CompressEnsemble) { _ensembleCompressor = new LassoBasedEnsembleCompressor(); - _ensembleCompressor.Initialize(FastTreeTrainerOptions.NumTrees, TrainSet, TrainSet.Ratings, FastTreeTrainerOptions.RngSeed); + _ensembleCompressor.Initialize(FastTreeTrainerOptions.NumberOfTrees, TrainSet, TrainSet.Ratings, FastTreeTrainerOptions.Seed); } } @@ -193,7 +183,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm( if (FastTreeTrainerOptions.UseLineSearch) { _specialTrainSetTest = new FastNdcgTest(optimizationAlgorithm.TrainingScores, TrainSet.Ratings, FastTreeTrainerOptions.SortingAlgorithm, FastTreeTrainerOptions.EarlyStoppingMetrics); - optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(_specialTrainSetTest, 0, FastTreeTrainerOptions.NumPostBracketSteps, FastTreeTrainerOptions.MinStepSize); + optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(_specialTrainSetTest, 0, FastTreeTrainerOptions.MaximumNumberOfLineSearchSteps, FastTreeTrainerOptions.MinimumStepSize); } return optimizationAlgorithm; } @@ -201,7 +191,7 @@ private protected override BaggingProvider CreateBaggingProvider() { Host.Assert(FastTreeTrainerOptions.BaggingSize > 0); - return new RankingBaggingProvider(TrainSet, FastTreeTrainerOptions.NumLeaves, FastTreeTrainerOptions.RngSeed, FastTreeTrainerOptions.BaggingTrainFraction); + return new RankingBaggingProvider(TrainSet, FastTreeTrainerOptions.NumberOfLeaves, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.BaggingExampleFraction); } private protected override void PrepareLabels(IChannel ch) @@ -499,7 +489,7 @@ private enum DupeIdInfo // parameters private int _maxDcgTruncationLevel; - private bool _trainDcg; + private bool _useDcg; // A lookup table for the sigmoid used in the lambda calculation // Note: Is built for a specific sigmoid parameter, so assumes this will be constant throughout computation private double[] _sigmoidTable;
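Since CustomGains is now a double[] rather than a comma-separated string, callers assign the gain levels directly. A minimal sketch, assuming the ranking trainer's nested Options type from this file and illustrative gain values:

    // Hypothetical usage; the old form was a single string such as "0,3,7,15,31".
    var rankingOptions = new FastTreeRankingTrainer.Options
    {
        // CheckOptions above requires at least 5 gain levels.
        CustomGains = new double[] { 0, 3, 7, 15, 31 },
    };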
@@ -552,12 +542,12 @@ private enum DupeIdInfo public LambdaRankObjectiveFunction(Dataset trainset, short[] labels, Options options, IParallelTraining parallelTraining) : base(trainset, - options.LearningRates, + options.LearningRate, options.Shrinkage, - options.MaxTreeOutput, + options.MaximumTreeOutput, options.GetDerivativesSampleRate, options.BestStepRankingRegressionTrees, - options.RngSeed) + options.Seed) { _labels = labels; @@ -571,9 +561,9 @@ public LambdaRankObjectiveFunction(Dataset trainset, short[] labels, Options opt _labelCounts[q] = new int[relevancyLevel]; // precomputed arrays - _maxDcgTruncationLevel = options.LambdaMartMaxTruncation; - _trainDcg = options.TrainDcg; - if (_trainDcg) + _maxDcgTruncationLevel = options.NdcgTruncationLevel; + _useDcg = options.UseDcg; + if (_useDcg) { _inverseMaxDcgt = new double[Dataset.NumQueries]; for (int q = 0; q < Dataset.NumQueries; ++q) @@ -607,7 +597,7 @@ public LambdaRankObjectiveFunction(Dataset trainset, short[] labels, Options opt FillGainLabels(); #region parameters - _sigmoidParam = options.LearningRates; + _sigmoidParam = options.LearningRate; _costFunctionParam = options.CostFunctionParam; _distanceWeight2 = options.DistanceWeight2; _normalizeQueryLambdas = options.NormalizeQueryLambdas; @@ -676,7 +666,7 @@ private void SetupBaselineRisk(Options options) uint[] vals = new uint[ffmap.RawFeatureCount]; int iInd = Array.IndexOf(ffnames, "I"); int tInd = Array.IndexOf(ffnames, "T"); - int totalTrees = options.NumTrees; + int totalTrees = options.NumberOfTrees; if (tInd >= 0) vals[tInd] = (uint)totalTrees; _baselineAlpha = Enumerable.Range(0, totalTrees).Select(i => @@ -876,7 +866,7 @@ protected override void GetGradientInOneQuery(int query, int threadIndex) // Continuous cost function and shifted NDCG require a re-sort and recomputation of maxDCG // (Change of scores in the former and scores and labels in the latter) - if (!_trainDcg && (_costFunctionParam == 'c' || _useShiftedNdcg)) + if (!_useDcg && (_costFunctionParam == 'c' || _useShiftedNdcg)) { PermutationSort(permutation, scoresToUse, labels, numDocuments, begin); inverseMaxDcg = 1.0 / DcgCalculator.MaxDcgQuery(labels, begin, numDocuments, numDocuments, _labelCounts[query]); diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs index 3533adfd6a..ac7ca46fbc 100644 --- a/src/Microsoft.ML.FastTree/FastTreeRegression.cs +++ b/src/Microsoft.ML.FastTree/FastTreeRegression.cs @@ -8,7 +8,6 @@ using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -52,22 +51,22 @@ public sealed partial class FastTreeRegressionTrainer /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the example weight. /// The learning rate. - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. - /// The max number of leaves in each regression tree. - /// Total number of decision trees to create in the ensemble. + /// The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data. + /// The max number of leaves in each regression tree. + /// Total number of decision trees to create in the ensemble.
internal FastTreeRegressionTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) - : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weightColumn, null, numLeaves, numTrees, minDatapointsInLeaves, learningRate) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) + : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate) { } @@ -127,7 +126,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm( { var lossCalculator = new RegressionTest(optimizationAlgorithm.TrainingScores); // REVIEW: We should make loss indices an enum in BinaryClassificationTest. - optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(lossCalculator, 1 /*L2 error*/, FastTreeTrainerOptions.NumPostBracketSteps, FastTreeTrainerOptions.MinStepSize); + optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(lossCalculator, 1 /*L2 error*/, FastTreeTrainerOptions.MaximumNumberOfLineSearchSteps, FastTreeTrainerOptions.MinimumStepSize); } return optimizationAlgorithm; @@ -390,12 +389,12 @@ internal sealed class ObjectiveImpl : ObjectiveFunctionBase, IStepSearch public ObjectiveImpl(Dataset trainData, RegressionGamTrainer.Options options) : base( trainData, - options.LearningRates, + options.LearningRate, 0, - options.MaxOutput, + options.MaximumTreeOutput, options.GetDerivativesSampleRate, false, - options.RngSeed) + options.Seed) { _labels = GetDatasetRegressionLabels(trainData); } @@ -403,12 +402,12 @@ public ObjectiveImpl(Dataset trainData, RegressionGamTrainer.Options options) : public ObjectiveImpl(Dataset trainData, Options options) : base( trainData, - options.LearningRates, + options.LearningRate, options.Shrinkage, - options.MaxTreeOutput, + options.MaximumTreeOutput, options.GetDerivativesSampleRate, options.BestStepRankingRegressionTrees, - options.RngSeed) + options.Seed) { if (options.DropoutRate > 0 && LearningRate > 0) // Don't do shrinkage if dropouts are used. Shrinkage = 1.0 / LearningRate; diff --git a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs index b18ed335f3..58510cec75 100644 --- a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs +++ b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs @@ -9,7 +9,6 @@ using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -50,25 +49,25 @@ public sealed partial class FastTreeTweedieTrainer /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. + /// The name of the label column. + /// The name of the feature column. 
+ /// The name for the column containing the example weight. /// The learning rate. - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. - /// The max number of leaves in each regression tree. - /// Total number of decision trees to create in the ensemble. + /// The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data. + /// The max number of leaves in each regression tree. + /// Total number of decision trees to create in the ensemble. internal FastTreeTweedieTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) - : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weightColumn, null, numLeaves, numTrees, minDatapointsInLeaves, learningRate) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) + : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate) { - Host.CheckNonEmpty(labelColumn, nameof(labelColumn)); - Host.CheckNonEmpty(featureColumn, nameof(featureColumn)); + Host.CheckNonEmpty(labelColumnName, nameof(labelColumnName)); + Host.CheckNonEmpty(featureColumnName, nameof(featureColumnName)); Initialize(); } @@ -134,7 +133,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm( var lossCalculator = new RegressionTest(optimizationAlgorithm.TrainingScores); // REVIEW: We should make loss indices an enum in BinaryClassificationTest. // REVIEW: Nope, subcomponent. - optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(lossCalculator, 1 /*L2 error*/, FastTreeTrainerOptions.NumPostBracketSteps, FastTreeTrainerOptions.MinStepSize); + optimizationAlgorithm.AdjustTreeOutputsOverride = new LineSearch(lossCalculator, 1 /*L2 error*/, FastTreeTrainerOptions.MaximumNumberOfLineSearchSteps, FastTreeTrainerOptions.MinimumStepSize); } return optimizationAlgorithm; @@ -341,12 +340,12 @@ private sealed class ObjectiveImpl : ObjectiveFunctionBase, IStepSearch public ObjectiveImpl(Dataset trainData, Options options) : base( trainData, - options.LearningRates, + options.LearningRate, options.Shrinkage, - options.MaxTreeOutput, + options.MaximumTreeOutput, options.GetDerivativesSampleRate, options.BestStepRankingRegressionTrees, - options.RngSeed) + options.Seed) { if (options.DropoutRate > 0 && LearningRate > 0) // Don't do shrinkage if dropouts are used.
Shrinkage = 1.0 / LearningRate; @@ -361,7 +360,7 @@ public ObjectiveImpl(Dataset trainData, Options options) _index1 = 1 - options.Index; _index2 = 2 - options.Index; - _maxClamp = Math.Abs(options.MaxTreeOutput); + _maxClamp = Math.Abs(options.MaximumTreeOutput); } public void AdjustTreeOutputs(IChannel ch, InternalRegressionTree tree, DocumentPartitioning partitioning, ScoreTracker trainingScores) diff --git a/src/Microsoft.ML.FastTree/GamClassification.cs b/src/Microsoft.ML.FastTree/GamClassification.cs index ba75b315d6..980c6916a5 100644 --- a/src/Microsoft.ML.FastTree/GamClassification.cs +++ b/src/Microsoft.ML.FastTree/GamClassification.cs @@ -59,20 +59,20 @@ internal BinaryClassificationGamTrainer(IHostEnvironment env, Options options) /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. - /// The number of iterations to use in learning the features. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the example weight. + /// The number of iterations to use in learning the features. /// The learning rate. GAMs work best with a small learning rate. - /// The maximum number of bins to use to approximate features + /// The maximum number of bins to use to approximate features. internal BinaryClassificationGamTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numIterations = GamDefaults.NumIterations, - double learningRate = GamDefaults.LearningRates, - int maxBins = GamDefaults.MaxBins) - : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumn), featureColumn, weightColumn, numIterations, learningRate, maxBins) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfIterations = GamDefaults.NumberOfIterations, + double learningRate = GamDefaults.LearningRate, + int maximumBinCountPerFeature = GamDefaults.MaximumBinCountPerFeature) + : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, exampleWeightColumnName, numberOfIterations, learningRate, maximumBinCountPerFeature) { _sigmoidParameter = 1; } @@ -115,14 +115,14 @@ private protected override ObjectiveFunctionBase CreateObjectiveFunction() return new FastTreeBinaryClassificationTrainer.ObjectiveImpl( TrainSet, ConvertTargetsToBool(TrainSet.Targets), - GamTrainerOptions.LearningRates, + GamTrainerOptions.LearningRate, 0, _sigmoidParameter, GamTrainerOptions.UnbalancedSets, - GamTrainerOptions.MaxOutput, + GamTrainerOptions.MaximumTreeOutput, GamTrainerOptions.GetDerivativesSampleRate, false, - GamTrainerOptions.RngSeed, + GamTrainerOptions.Seed, ParallelTraining ); } diff --git a/src/Microsoft.ML.FastTree/GamModelParameters.cs b/src/Microsoft.ML.FastTree/GamModelParameters.cs index 091de95de9..61052acd43 100644 --- a/src/Microsoft.ML.FastTree/GamModelParameters.cs +++ b/src/Microsoft.ML.FastTree/GamModelParameters.cs @@ -13,7 +13,6 @@ using Microsoft.ML.Command; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -30,23 +29,23 @@ namespace Microsoft.ML.Trainers.FastTree public abstract class
GamModelParametersBase : ModelParametersBase, IValueMapper, ICalculateFeatureContribution, IFeatureContributionMapper, ICanSaveInTextFormat, ICanSaveSummary, ICanSaveInIniFormat { - private readonly double[][] _binUpperBounds; - private readonly double[][] _binEffects; /// /// The model intercept. Also known as bias or mean effect. /// - public readonly double Intercept; + public readonly double Bias; /// /// The number of shape functions used in the model. /// - public readonly int NumShapeFunctions; + public readonly int NumberOfShapeFunctions; + + private readonly double[][] _binUpperBounds; + private readonly double[][] _binEffects; private readonly VectorType _inputType; private readonly DataViewType _outputType; // These would be the bins for a totally sparse input. private readonly int[] _binsAtAllZero; // The output value for all zeros private readonly double _valueAtAllZero; - private readonly int[] _shapeToInputMap; private readonly int _numInputFeatures; private readonly Dictionary _inputFeatureToShapeFunctionMap; @@ -73,17 +72,17 @@ private protected GamModelParametersBase(IHostEnvironment env, string name, Host.CheckParam(shapeToInputMap == null || shapeToInputMap.Length == binEffects.Length, nameof(shapeToInputMap), "Must have same number of features as binEffects"); // Define the model basics - Intercept = intercept; + Bias = intercept; _binUpperBounds = binUpperBounds; _binEffects = binEffects; - NumShapeFunctions = binEffects.Length; + NumberOfShapeFunctions = binEffects.Length; // For sparse inputs we have a fast lookup - _binsAtAllZero = new int[NumShapeFunctions]; + _binsAtAllZero = new int[NumberOfShapeFunctions]; _valueAtAllZero = 0; // Walk through each feature and perform checks / updates - for (int i = 0; i < NumShapeFunctions; i++) + for (int i = 0; i < NumberOfShapeFunctions; i++) { // Check data validity Host.CheckValue(binEffects[i], nameof(binEffects), "Array contained null entries"); @@ -97,11 +96,11 @@ private protected GamModelParametersBase(IHostEnvironment env, string name, // Define the sparse mappings from/to input to/from shape functions _shapeToInputMap = shapeToInputMap; if (_shapeToInputMap == null) - _shapeToInputMap = Utils.GetIdentityPermutation(NumShapeFunctions); + _shapeToInputMap = Utils.GetIdentityPermutation(NumberOfShapeFunctions); _numInputFeatures = numInputFeatures; if (_numInputFeatures == -1) - _numInputFeatures = NumShapeFunctions; + _numInputFeatures = NumberOfShapeFunctions; _inputFeatureToShapeFunctionMap = new Dictionary(_shapeToInputMap.Length); for (int i = 0; i < _shapeToInputMap.Length; i++) { @@ -121,24 +120,24 @@ private protected GamModelParametersBase(IHostEnvironment env, string name, Mode BinaryReader reader = ctx.Reader; - NumShapeFunctions = reader.ReadInt32(); - Host.CheckDecode(NumShapeFunctions >= 0); + NumberOfShapeFunctions = reader.ReadInt32(); + Host.CheckDecode(NumberOfShapeFunctions >= 0); _numInputFeatures = reader.ReadInt32(); Host.CheckDecode(_numInputFeatures >= 0); - Intercept = reader.ReadDouble(); + Bias = reader.ReadDouble(); if (ctx.Header.ModelVerWritten == 0x00010001) using (var ch = env.Start("GamWarningChannel")) ch.Warning("GAM models written prior to ML.NET 0.6 are loaded with an incorrect Intercept.
For these models, subtract the value of the intercept from the prediction."); - _binEffects = new double[NumShapeFunctions][]; - _binUpperBounds = new double[NumShapeFunctions][]; - _binsAtAllZero = new int[NumShapeFunctions]; - for (int i = 0; i < NumShapeFunctions; i++) + _binEffects = new double[NumberOfShapeFunctions][]; + _binUpperBounds = new double[NumberOfShapeFunctions][]; + _binsAtAllZero = new int[NumberOfShapeFunctions]; + for (int i = 0; i < NumberOfShapeFunctions; i++) { _binEffects[i] = reader.ReadDoubleArray(); Host.CheckDecode(Utils.Size(_binEffects[i]) >= 1); } - for (int i = 0; i < NumShapeFunctions; i++) + for (int i = 0; i < NumberOfShapeFunctions; i++) { _binUpperBounds[i] = reader.ReadDoubleArray(_binEffects[i].Length); _valueAtAllZero += GetBinEffect(i, 0, out _binsAtAllZero[i]); @@ -147,13 +146,13 @@ private protected GamModelParametersBase(IHostEnvironment env, string name, Mode Host.CheckDecode(len >= 0); _inputFeatureToShapeFunctionMap = new Dictionary(len); - _shapeToInputMap = Utils.CreateArray(NumShapeFunctions, -1); + _shapeToInputMap = Utils.CreateArray(NumberOfShapeFunctions, -1); for (int i = 0; i < len; i++) { int key = reader.ReadInt32(); Host.CheckDecode(0 <= key && key < _numInputFeatures); int val = reader.ReadInt32(); - Host.CheckDecode(0 <= val && val < NumShapeFunctions); + Host.CheckDecode(0 <= val && val < NumberOfShapeFunctions); Host.CheckDecode(!_inputFeatureToShapeFunctionMap.ContainsKey(key)); Host.CheckDecode(_shapeToInputMap[val] == -1); _inputFeatureToShapeFunctionMap[key] = val; @@ -168,17 +167,17 @@ private protected override void SaveCore(ModelSaveContext ctx) { Host.CheckValue(ctx, nameof(ctx)); - ctx.Writer.Write(NumShapeFunctions); - Host.Assert(NumShapeFunctions >= 0); + ctx.Writer.Write(NumberOfShapeFunctions); + Host.Assert(NumberOfShapeFunctions >= 0); ctx.Writer.Write(_numInputFeatures); Host.Assert(_numInputFeatures >= 0); - ctx.Writer.Write(Intercept); - for (int i = 0; i < NumShapeFunctions; i++) + ctx.Writer.Write(Bias); + for (int i = 0; i < NumberOfShapeFunctions; i++) ctx.Writer.WriteDoubleArray(_binEffects[i]); int diff = _binEffects.Sum(e => e.Take(e.Length - 1).Select((ef, i) => ef != e[i + 1] ? 
1 : 0).Sum()); int bound = _binEffects.Sum(e => e.Length - 1); - for (int i = 0; i < NumShapeFunctions; i++) + for (int i = 0; i < NumberOfShapeFunctions; i++) { ctx.Writer.WriteDoublesNoCount(_binUpperBounds[i]); Host.Assert(_binUpperBounds[i].Length == _binEffects[i].Length); @@ -204,7 +203,7 @@ private void Map(in VBuffer features, ref float response) { Host.CheckParam(features.Length == _numInputFeatures, nameof(features), "Bad length of input"); - double value = Intercept; + double value = Bias; var featuresValues = features.GetValues(); if (features.IsDense) @@ -234,9 +233,9 @@ private void Map(in VBuffer features, ref float response) internal double GetFeatureBinsAndScore(in VBuffer features, int[] bins) { Host.CheckParam(features.Length == _numInputFeatures, nameof(features)); - Host.CheckParam(Utils.Size(bins) == NumShapeFunctions, nameof(bins)); + Host.CheckParam(Utils.Size(bins) == NumberOfShapeFunctions, nameof(bins)); - double value = Intercept; + double value = Bias; var featuresValues = features.GetValues(); if (features.IsDense) { @@ -251,7 +250,7 @@ internal double GetFeatureBinsAndScore(in VBuffer features, int[] bins) var featuresIndices = features.GetIndices(); // Add in the precomputed results for all features value += _valueAtAllZero; - Array.Copy(_binsAtAllZero, bins, NumShapeFunctions); + Array.Copy(_binsAtAllZero, bins, NumberOfShapeFunctions); // Update the results for features we have for (int i = 0; i < featuresValues.Length; ++i) @@ -266,14 +265,14 @@ internal double GetFeatureBinsAndScore(in VBuffer features, int[] bins) private double GetBinEffect(int featureIndex, double featureValue) { - Host.Assert(0 <= featureIndex && featureIndex < NumShapeFunctions, "Index out of range."); + Host.Assert(0 <= featureIndex && featureIndex < NumberOfShapeFunctions, "Index out of range."); int index = Algorithms.FindFirstGE(_binUpperBounds[featureIndex], featureValue); return _binEffects[featureIndex][index]; } private double GetBinEffect(int featureIndex, double featureValue, out int binIndex) { - Host.Check(0 <= featureIndex && featureIndex < NumShapeFunctions, "Index out of range."); + Host.Check(0 <= featureIndex && featureIndex < NumberOfShapeFunctions, "Index out of range."); binIndex = Algorithms.FindFirstGE(_binUpperBounds[featureIndex], featureValue); return _binEffects[featureIndex][binIndex]; } @@ -283,9 +282,9 @@ private double GetBinEffect(int featureIndex, double featureValue, out int binIn /// /// The index of the feature (in the training vector) to get. /// The bin upper bounds. May be zero length if this feature has no bins. - public double[] GetBinUpperBounds(int featureIndex) + public IReadOnlyList GetBinUpperBounds(int featureIndex) { - Host.Check(0 <= featureIndex && featureIndex < NumShapeFunctions, "Index out of range."); + Host.Check(0 <= featureIndex && featureIndex < NumberOfShapeFunctions, "Index out of range."); if (!_inputFeatureToShapeFunctionMap.TryGetValue(featureIndex, out int j)) return new double[0]; @@ -297,10 +296,11 @@ public double[] GetBinUpperBounds(int featureIndex) /// /// Get all the bin upper bounds. 
/// - public double[][] GetBinUpperBounds() + [BestFriend] + internal double[][] GetBinUpperBounds() { - double[][] binUpperBounds = new double[NumShapeFunctions][]; - for (int i = 0; i < NumShapeFunctions; i++) + double[][] binUpperBounds = new double[NumberOfShapeFunctions][]; + for (int i = 0; i < NumberOfShapeFunctions; i++) { if (_inputFeatureToShapeFunctionMap.TryGetValue(i, out int j)) { @@ -320,9 +320,9 @@ public double[][] GetBinUpperBounds() /// /// The index of the feature (in the training vector) to get. /// The binned effects for each feature. May be zero length if this feature has no bins. - public double[] GetBinEffects(int featureIndex) + public IReadOnlyList GetBinEffects(int featureIndex) { - Host.Check(0 <= featureIndex && featureIndex < NumShapeFunctions, "Index out of range."); + Host.Check(0 <= featureIndex && featureIndex < NumberOfShapeFunctions, "Index out of range."); if (!_inputFeatureToShapeFunctionMap.TryGetValue(featureIndex, out int j)) return new double[0]; @@ -334,10 +334,11 @@ public double[] GetBinEffects(int featureIndex) /// /// Get all the binned effects. /// - public double[][] GetBinEffects() + [BestFriend] + internal double[][] GetBinEffects() { - double[][] binEffects = new double[NumShapeFunctions][]; - for (int i = 0; i < NumShapeFunctions; i++) + double[][] binEffects = new double[NumberOfShapeFunctions][]; + for (int i = 0; i < NumberOfShapeFunctions; i++) { if (_inputFeatureToShapeFunctionMap.TryGetValue(i, out int j)) { @@ -358,7 +359,7 @@ void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema) Host.CheckValueOrNull(schema); writer.WriteLine("\xfeffFeature index table"); // add BOM to tell excel this is UTF-8 - writer.WriteLine($"Number of features:\t{NumShapeFunctions + 1:D}"); + writer.WriteLine($"Number of features:\t{NumberOfShapeFunctions + 1:D}"); writer.WriteLine("Feature Index\tFeature Name"); // REVIEW: We really need some unit tests around text exporting (for this, and other learners). 
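A minimal sketch of reading a trained model back through the renamed surface above (Bias, NumberOfShapeFunctions, and the per-feature accessors that now return IReadOnlyList); gamModel is assumed to be an instance of a concrete GamModelParametersBase subtype:

    // Print the bias and the binned effects of each shape function.
    Console.WriteLine($"Bias (mean effect): {gamModel.Bias:0.00}");
    for (int i = 0; i < gamModel.NumberOfShapeFunctions; i++)
    {
        var upperBounds = gamModel.GetBinUpperBounds(i); // may be empty for unmapped features
        var effects = gamModel.GetBinEffects(i);
        for (int j = 0; j < upperBounds.Count; j++)
            Console.WriteLine($"Feature {i}, bin {j}: upper bound {upperBounds[j]:0.00}, effect {effects[j]:0.000}");
    }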
@@ -371,7 +372,7 @@ void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema) var names = default(VBuffer>); AnnotationUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, _numInputFeatures, ref names); - for (int internalIndex = 0; internalIndex < NumShapeFunctions; internalIndex++) + for (int internalIndex = 0; internalIndex < NumberOfShapeFunctions; internalIndex++) { int featureIndex = _shapeToInputMap[internalIndex]; var name = names.GetItemOrDefault(featureIndex); @@ -381,8 +382,8 @@ void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema) writer.WriteLine(); writer.WriteLine("Per feature binned effects:"); writer.WriteLine("Feature Index\tFeature Value Bin Upper Bound\tOutput (effect on label)"); - writer.WriteLine($"{-1:D}\t{float.MaxValue:R}\t{Intercept:R}"); - for (int internalIndex = 0; internalIndex < NumShapeFunctions; internalIndex++) + writer.WriteLine($"{-1:D}\t{float.MaxValue:R}\t{Bias:R}"); + for (int internalIndex = 0; internalIndex < NumberOfShapeFunctions; internalIndex++) { int featureIndex = _shapeToInputMap[internalIndex]; @@ -437,7 +438,7 @@ void ICanSaveInIniFormat.SaveAsIni(TextWriter writer, RoleMappedSchema schema, I Host.CheckValue(writer, nameof(writer), "writer must not be null"); var ensemble = new InternalTreeEnsemble(); - for (int featureIndex = 0; featureIndex < NumShapeFunctions; featureIndex++) + for (int featureIndex = 0; featureIndex < NumberOfShapeFunctions; featureIndex++) { var effects = _binEffects[featureIndex]; var binThresholds = _binUpperBounds[featureIndex]; @@ -460,7 +461,7 @@ void ICanSaveInIniFormat.SaveAsIni(TextWriter writer, RoleMappedSchema schema, I rawThresholds: new[] { 0f }, lteChild: new[] { ~0 }, gtChild: new[] { ~1 }, - leafValues: new[] { Intercept, Intercept }); + leafValues: new[] { Bias, Bias }); ensemble.AddTree(interceptTree); var ini = FastTreeIniFileUtils.TreeEnsembleToIni( @@ -852,7 +853,7 @@ public static FeatureInfo[] GetInfos(Context context) { lock (context._pred) { - return Utils.BuildArray(context._pred.NumShapeFunctions, + return Utils.BuildArray(context._pred.NumberOfShapeFunctions, i => new FeatureInfo(context, context._pred._shapeToInputMap[i], i, context._catsMap)); } } @@ -860,8 +861,7 @@ public static FeatureInfo[] GetInfos(Context context) } /// - /// Attempts to initialize required items, from the input model file. In the event that anything goes - /// wrong, this method will throw. + /// Attempts to initialize required items from the input model file. It will throw if anything goes wrong. /// /// The channel /// A structure containing essential information about the GAM dataset that enables diff --git a/src/Microsoft.ML.FastTree/GamRegression.cs b/src/Microsoft.ML.FastTree/GamRegression.cs index fc27185c02..ca0b127bf3 100644 --- a/src/Microsoft.ML.FastTree/GamRegression.cs +++ b/src/Microsoft.ML.FastTree/GamRegression.cs @@ -45,20 +45,20 @@ internal RegressionGamTrainer(IHostEnvironment env, Options options) /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. - /// The number of iterations to use in learning the features. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the example weight. + /// The number of iterations to use in learning the features. /// The learning rate. GAMs work best with a small learning rate.
- /// The maximum number of bins to use to approximate features + /// The maximum number of bins to use to approximate features. internal RegressionGamTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numIterations = GamDefaults.NumIterations, - double learningRate = GamDefaults.LearningRates, - int maxBins = GamDefaults.MaxBins) - : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weightColumn, numIterations, learningRate, maxBins) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfIterations = GamDefaults.NumberOfIterations, + double learningRate = GamDefaults.LearningRate, + int maximumBinCountPerFeature = GamDefaults.MaximumBinCountPerFeature) + : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, exampleWeightColumnName, numberOfIterations, learningRate, maximumBinCountPerFeature) { } diff --git a/src/Microsoft.ML.FastTree/GamTrainer.cs b/src/Microsoft.ML.FastTree/GamTrainer.cs index 214a661000..23c273422f 100644 --- a/src/Microsoft.ML.FastTree/GamTrainer.cs +++ b/src/Microsoft.ML.FastTree/GamTrainer.cs @@ -66,35 +66,35 @@ public abstract class OptionsBase : TrainerInputBaseWithWeight [Argument(ArgumentType.LastOccurenceWins, HelpText = "Total number of iterations over all features", ShortName = "iter", SortOrder = 1)] [TGUI(SuggestedSweeps = "200,1500,9500")] [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[] { 200, 1500, 9500 })] - public int NumIterations = GamDefaults.NumIterations; + public int NumberOfIterations = GamDefaults.NumberOfIterations; [Argument(ArgumentType.LastOccurenceWins, HelpText = "The number of threads to use", ShortName = "t", NullName = "")] - public int? NumThreads = null; + public int? NumberOfThreads = null; [Argument(ArgumentType.LastOccurenceWins, HelpText = "The learning rate", ShortName = "lr", SortOrder = 4)] [TGUI(SuggestedSweeps = "0.001,0.1;log")] [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.001f, 0.1f, isLogScale: true)] - public double LearningRates = GamDefaults.LearningRates; + public double LearningRate = GamDefaults.LearningRate; [Argument(ArgumentType.LastOccurenceWins, HelpText = "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", ShortName = "dt")] public bool?
DiskTranspose; [Argument(ArgumentType.LastOccurenceWins, HelpText = "Maximum number of distinct values (bins) per feature", ShortName = "mb")] - public int MaxBins = GamDefaults.MaxBins; + public int MaximumBinCountPerFeature = GamDefaults.MaximumBinCountPerFeature; [Argument(ArgumentType.AtMostOnce, HelpText = "Upper bound on absolute value of single output", ShortName = "mo")] - public double MaxOutput = Double.PositiveInfinity; + public double MaximumTreeOutput = double.PositiveInfinity; [Argument(ArgumentType.AtMostOnce, HelpText = "Sample each query 1 in k times in the GetDerivatives function", ShortName = "sr")] public int GetDerivativesSampleRate = 1; [Argument(ArgumentType.LastOccurenceWins, HelpText = "The seed of the random number generator", ShortName = "r1")] - public int RngSeed = 123; + public int Seed = 123; [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum number of training instances required to form a partition", ShortName = "mi", SortOrder = 3)] [TGUI(SuggestedSweeps = "1,10,50")] [TlcModule.SweepableDiscreteParamAttribute("MinDocuments", new object[] { 1, 10, 50 })] - public int MinDocuments = 10; + public int MinimumExampleCountPerLeaf = 10; [Argument(ArgumentType.LastOccurenceWins, HelpText = "Whether to collectivize features during dataset preparation to speed up training", ShortName = "flocks", Hide = true)] public bool FeatureFlocks = true; @@ -148,23 +148,23 @@ public abstract class OptionsBase : TrainerInputBaseWithWeight private protected GamTrainerBase(IHostEnvironment env, string name, SchemaShape.Column label, - string featureColumn, - string weightColumn, - int numIterations, + string featureColumnName, + string exampleWeightColumnName, + int numberOfIterations, double learningRate, - int maxBins) - : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(featureColumn), label, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn)) + int maximumBinCountPerFeature) - : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(featureColumnName), label, TrainerUtils.MakeR4ScalarWeightColumn(exampleWeightColumnName)) { GamTrainerOptions = new TOptions(); - GamTrainerOptions.NumIterations = numIterations; - GamTrainerOptions.LearningRates = learningRate; - GamTrainerOptions.MaxBins = maxBins; + GamTrainerOptions.NumberOfIterations = numberOfIterations; + GamTrainerOptions.LearningRate = learningRate; + GamTrainerOptions.MaximumBinCountPerFeature = maximumBinCountPerFeature; GamTrainerOptions.LabelColumnName = label.Name; - GamTrainerOptions.FeatureColumnName = featureColumn; + GamTrainerOptions.FeatureColumnName = featureColumnName; - if (weightColumn != null) - GamTrainerOptions.ExampleWeightColumnName = weightColumn; + if (exampleWeightColumnName != null) + GamTrainerOptions.ExampleWeightColumnName = exampleWeightColumnName; Info = new TrainerInfo(normalization: false, calibration: NeedCalibration, caching: false, supportValid: true); _gainConfidenceInSquaredStandardDeviations = Math.Pow(ProbabilityFunctions.Probit(1 - (1 - GamTrainerOptions.GainConfidenceLevel) * 0.5), 2);
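For reference, a minimal sketch of populating the renamed GAM options declared above; RegressionGamTrainer.Options is assumed as the concrete OptionsBase subtype, and the values shown are simply the defaults from this file:

    // Hypothetical usage; every field name follows the renames above.
    var gamOptions = new RegressionGamTrainer.Options
    {
        NumberOfIterations = 9500,       // was NumIterations
        LearningRate = 0.002,            // was LearningRates
        MaximumBinCountPerFeature = 255, // was MaxBins
        MinimumExampleCountPerLeaf = 10, // was MinDocuments
        Seed = 123,                      // was RngSeed
    };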
@@ -180,13 +180,13 @@ private protected GamTrainerBase(IHostEnvironment env, TOptions options, string Contracts.CheckValue(env, nameof(env)); Host.CheckValue(options, nameof(options)); - Host.CheckParam(options.LearningRates > 0, nameof(options.LearningRates), "Must be positive."); - Host.CheckParam(options.NumThreads == null || options.NumThreads > 0, nameof(options.NumThreads), "Must be positive."); + Host.CheckParam(options.LearningRate > 0, nameof(options.LearningRate), "Must be positive."); + Host.CheckParam(options.NumberOfThreads == null || options.NumberOfThreads > 0, nameof(options.NumberOfThreads), "Must be positive."); Host.CheckParam(0 <= options.EntropyCoefficient && options.EntropyCoefficient <= 1, nameof(options.EntropyCoefficient), "Must be in [0, 1]."); Host.CheckParam(0 <= options.GainConfidenceLevel && options.GainConfidenceLevel < 1, nameof(options.GainConfidenceLevel), "Must be in [0, 1)."); - Host.CheckParam(0 < options.MaxBins, nameof(options.MaxBins), "Must be posittive."); - Host.CheckParam(0 < options.NumIterations, nameof(options.NumIterations), "Must be positive."); - Host.CheckParam(0 < options.MinDocuments, nameof(options.MinDocuments), "Must be positive."); + Host.CheckParam(0 < options.MaximumBinCountPerFeature, nameof(options.MaximumBinCountPerFeature), "Must be positive."); + Host.CheckParam(0 < options.NumberOfIterations, nameof(options.NumberOfIterations), "Must be positive."); + Host.CheckParam(0 < options.MinimumExampleCountPerLeaf, nameof(options.MinimumExampleCountPerLeaf), "Must be positive."); GamTrainerOptions = options; @@ -234,7 +234,7 @@ private void ConvertData(RoleMappedData trainData, RoleMappedData validationData CheckLabel(trainData); var useTranspose = UseTranspose(GamTrainerOptions.DiskTranspose, trainData); - var instanceConverter = new ExamplesToFastTreeBins(Host, GamTrainerOptions.MaxBins, useTranspose, !GamTrainerOptions.FeatureFlocks, GamTrainerOptions.MinDocuments, float.PositiveInfinity); + var instanceConverter = new ExamplesToFastTreeBins(Host, GamTrainerOptions.MaximumBinCountPerFeature, useTranspose, !GamTrainerOptions.FeatureFlocks, GamTrainerOptions.MinimumExampleCountPerLeaf, float.PositiveInfinity); ParallelTraining.InitEnvironment(); TrainSet = instanceConverter.FindBinsAndReturnDataset(trainData, PredictionKind, ParallelTraining, null, false); @@ -274,7 +274,7 @@ private void TrainCore(IChannel ch) private void TrainMainEffectsModel(IChannel ch) { Contracts.AssertValue(ch); - int iterations = GamTrainerOptions.NumIterations; + int iterations = GamTrainerOptions.NumberOfIterations; ch.Info("Starting to train ..."); @@ -340,7 +340,7 @@ private void TrainingIteration(int globalFeatureIndex, double[] gradient, double // Compute the split for the feature _histogram[flockIndex].FindBestSplitForFeature(_leafSplitHelper, _leafSplitCandidates, _leafSplitCandidates.Targets.Length, sumTargets, sumWeights, - globalFeatureIndex, flockIndex, subFeatureIndex, GamTrainerOptions.MinDocuments, HasWeights, + globalFeatureIndex, flockIndex, subFeatureIndex, GamTrainerOptions.MinimumExampleCountPerLeaf, HasWeights, _gainConfidenceInSquaredStandardDeviations, _entropyCoefficient, TrainSet.Flocks[flockIndex].Trust(subFeatureIndex), 0); @@ -403,7 +403,7 @@ private void UpdateScoresForSet(Dataset dataset, double[] scores, int iteration) private void CombineGraphs(IChannel ch) { // Prune backwards to the best iteration - int bestIteration = GamTrainerOptions.NumIterations; + int bestIteration = GamTrainerOptions.NumberOfIterations; if (GamTrainerOptions.EnablePruning && PruningTest != null) { ch.Info("Pruning"); @@ -415,8 +415,8 @@ private void CombineGraphs(IChannel ch) bestIteration = PruningTest.BestIteration; bestLoss = PruningTest.BestResult.FinalValue; } - if (bestIteration != GamTrainerOptions.NumIterations) - ch.Info($"Best Iteration ({lossFunctionName}): {bestIteration} @ {bestLoss:G6} (vs {GamTrainerOptions.NumIterations} @
{finalResult.FinalValue:G6})."); + if (bestIteration != GamTrainerOptions.NumberOfIterations) + ch.Info($"Best Iteration ({lossFunctionName}): {bestIteration} @ {bestLoss:G6} (vs {GamTrainerOptions.NumberOfIterations} @ {finalResult.FinalValue:G6})."); else ch.Info("No pruning necessary. More iterations may be necessary."); } @@ -556,8 +556,8 @@ private void ConvertTreeToGraph(int globalFeatureIndex, int iteration) { SplitInfo splitinfo = _leafSplitCandidates.FeatureSplitInfo[globalFeatureIndex]; _subGraph.Splits[globalFeatureIndex][iteration].SplitPoint = splitinfo.Threshold; - _subGraph.Splits[globalFeatureIndex][iteration].LteValue = GamTrainerOptions.LearningRates * splitinfo.LteOutput; - _subGraph.Splits[globalFeatureIndex][iteration].GtValue = GamTrainerOptions.LearningRates * splitinfo.GTOutput; + _subGraph.Splits[globalFeatureIndex][iteration].LteValue = GamTrainerOptions.LearningRate * splitinfo.LteOutput; + _subGraph.Splits[globalFeatureIndex][iteration].GtValue = GamTrainerOptions.LearningRate * splitinfo.GTOutput; } private void InitializeGamHistograms() @@ -572,7 +572,7 @@ private void Initialize(IChannel ch) using (Timer.Time(TimerEvent.InitializeTraining)) { InitializeGamHistograms(); - _subGraph = new SubGraph(TrainSet.NumFeatures, GamTrainerOptions.NumIterations); + _subGraph = new SubGraph(TrainSet.NumFeatures, GamTrainerOptions.NumberOfIterations); _leafSplitCandidates = new LeastSquaresRegressionTreeLearner.LeafSplitCandidates(TrainSet); _leafSplitHelper = new LeafSplitHelper(HasWeights); } @@ -582,7 +582,7 @@ private void InitializeThreads() { ParallelTraining = new SingleTrainer(); - int numThreads = GamTrainerOptions.NumThreads ?? Environment.ProcessorCount; + int numThreads = GamTrainerOptions.NumberOfThreads ?? Environment.ProcessorCount; if (Host.ConcurrencyFactor > 0 && numThreads > Host.ConcurrencyFactor) using (var ch = Host.Start("GamTrainer")) { @@ -703,8 +703,8 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm internal static class GamDefaults { - internal const int NumIterations = 9500; - internal const int MaxBins = 255; - internal const double LearningRates = 0.002; // A small value + internal const int NumberOfIterations = 9500; + internal const int MaximumBinCountPerFeature = 255; + internal const double LearningRate = 0.002; // A small value } } diff --git a/src/Microsoft.ML.FastTree/RandomForest.cs b/src/Microsoft.ML.FastTree/RandomForest.cs index 73ac1e36d5..488f4eee5f 100644 --- a/src/Microsoft.ML.FastTree/RandomForest.cs +++ b/src/Microsoft.ML.FastTree/RandomForest.cs @@ -25,14 +25,14 @@ private protected RandomForestTrainerBase(IHostEnvironment env, TOptions options /// private protected RandomForestTrainerBase(IHostEnvironment env, SchemaShape.Column label, - string featureColumn, - string weightColumn, - string groupIdColumn, - int numLeaves, - int numTrees, - int minDatapointsInLeaves, + string featureColumnName, + string exampleWeightColumnName, + string rowGroupColumnName, + int numberOfLeaves, + int numberOfTrees, + int minimumExampleCountPerLeaf, bool quantileEnabled = false) - : base(env, label, featureColumn, weightColumn, null, numLeaves, numTrees, minDatapointsInLeaves) + : base(env, label, featureColumnName, exampleWeightColumnName, null, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf) { _quantileEnabled = quantileEnabled; } @@ -61,12 +61,12 @@ private protected override void InitializeTests() private protected override TreeLearner ConstructTreeLearner(IChannel ch) { return new 
RandomForestLeastSquaresTreeLearner( - TrainSet, FastTreeTrainerOptions.NumLeaves, FastTreeTrainerOptions.MinDocumentsInLeafs, FastTreeTrainerOptions.EntropyCoefficient, + TrainSet, FastTreeTrainerOptions.NumberOfLeaves, FastTreeTrainerOptions.MinimumExampleCountPerLeaf, FastTreeTrainerOptions.EntropyCoefficient, FastTreeTrainerOptions.FeatureFirstUsePenalty, FastTreeTrainerOptions.FeatureReusePenalty, FastTreeTrainerOptions.SoftmaxTemperature, - FastTreeTrainerOptions.HistogramPoolSize, FastTreeTrainerOptions.RngSeed, FastTreeTrainerOptions.SplitFraction, - FastTreeTrainerOptions.AllowEmptyTrees, FastTreeTrainerOptions.GainConfidenceLevel, FastTreeTrainerOptions.MaxCategoricalGroupsPerNode, - FastTreeTrainerOptions.MaxCategoricalSplitPoints, _quantileEnabled, FastTreeTrainerOptions.QuantileSampleCount, ParallelTraining, - FastTreeTrainerOptions.MinDocsPercentageForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinDocsForCategoricalSplit, FastTreeTrainerOptions.Bias); + FastTreeTrainerOptions.HistogramPoolSize, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.FeatureFractionPerSplit, + FastTreeTrainerOptions.AllowEmptyTrees, FastTreeTrainerOptions.GainConfidenceLevel, FastTreeTrainerOptions.MaximumCategoricalGroupCountPerNode, + FastTreeTrainerOptions.MaximumCategoricalSplitPointCount, _quantileEnabled, FastTreeTrainerOptions.NumberOfQuantileSamples, ParallelTraining, + FastTreeTrainerOptions.MinimumExampleFractionForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinimumExamplesForCategoricalSplit, FastTreeTrainerOptions.Bias); } internal abstract class RandomForestObjectiveFunction : ObjectiveFunctionBase @@ -78,7 +78,7 @@ protected RandomForestObjectiveFunction(Dataset trainData, TOptions options, dou maxStepSize, 1, // No derivative sampling in random forests. false, // Improvements to quasi-newton step not relevant to RF. - options.RngSeed) + options.Seed) { } } diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs index 60fc2e6242..bde7bf5052 100644 --- a/src/Microsoft.ML.FastTree/RandomForestClassification.cs +++ b/src/Microsoft.ML.FastTree/RandomForestClassification.cs @@ -10,7 +10,6 @@ using Microsoft.ML.CommandLine; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -33,13 +32,13 @@ namespace Microsoft.ML.Trainers.FastTree public abstract class FastForestOptionsBase : TreeOptions { [Argument(ArgumentType.AtMostOnce, HelpText = "Number of labels to be sampled from each leaf to make the distribtuion", ShortName = "qsc")] - public int QuantileSampleCount = 100; + public int NumberOfQuantileSamples = 100; public FastForestOptionsBase() { FeatureFraction = 0.7; BaggingSize = 1; - SplitFraction = 0.7; + FeatureFractionPerSplit = 0.7; } } @@ -112,7 +111,7 @@ public sealed partial class FastForestClassification : public sealed class Options : FastForestOptionsBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Upper bound on absolute value of single tree output", ShortName = "mo")] - public Double MaxTreeOutput = 100; + public Double MaximumOutputMagnitudePerTree = 100; [Argument(ArgumentType.AtMostOnce, HelpText = "The calibrator kind to apply to the predictor. 
Specify null for no calibration", Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly)] internal ICalibratorTrainerFactory Calibrator = new PlattCalibratorTrainerFactory(); @@ -135,23 +134,23 @@ public sealed class Options : FastForestOptionsBase /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. - /// The max number of leaves in each regression tree. - /// Total number of decision trees to create in the ensemble. - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the example weight. + /// The max number of leaves in each regression tree. + /// Total number of decision trees to create in the ensemble. + /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. internal FastForestClassification(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves) - : base(env, TrainerUtils.MakeBoolScalarLabel(labelColumn), featureColumn, weightColumn, null, numLeaves, numTrees, minDatapointsInLeaves) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf) + : base(env, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf) { - Host.CheckNonEmpty(labelColumn, nameof(labelColumn)); - Host.CheckNonEmpty(featureColumn, nameof(featureColumn)); + Host.CheckNonEmpty(labelColumnName, nameof(labelColumnName)); + Host.CheckNonEmpty(featureColumnName, nameof(featureColumnName)); } /// @@ -231,7 +230,7 @@ private sealed class ObjectiveFunctionImpl : RandomForestObjectiveFunction private readonly bool[] _labels; public ObjectiveFunctionImpl(Dataset trainSet, bool[] trainSetLabels, Options options) - : base(trainSet, options, options.MaxTreeOutput) + : base(trainSet, options, options.MaximumOutputMagnitudePerTree) { _labels = trainSetLabels; } diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs index 359217da05..3eee18666c 100644 --- a/src/Microsoft.ML.FastTree/RandomForestRegression.cs +++ b/src/Microsoft.ML.FastTree/RandomForestRegression.cs @@ -8,7 +8,6 @@ using Microsoft.ML.CommandLine; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Internallearn; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Model; using Microsoft.ML.Trainers.FastTree; @@ -264,23 +263,23 @@ public sealed class Options : FastForestOptionsBase /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The optional name for the column containing the initial weight. - /// The max number of leaves in each regression tree. - /// Total number of decision trees to create in the ensemble. 
- /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data. + /// The name of the label column. + /// The name of the feature column. + /// The optional name for the column containing the example weight. + /// The max number of leaves in each regression tree. + /// Total number of decision trees to create in the ensemble. + /// The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data. internal FastForestRegression(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weightColumn = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves) - : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weightColumn, null, numLeaves, numTrees, minDatapointsInLeaves) + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string exampleWeightColumnName = null, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf) + : base(env, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf) { - Host.CheckNonEmpty(labelColumn, nameof(labelColumn)); - Host.CheckNonEmpty(featureColumn, nameof(featureColumn)); + Host.CheckNonEmpty(labelColumnName, nameof(labelColumnName)); + Host.CheckNonEmpty(featureColumnName, nameof(featureColumnName)); } /// @@ -310,7 +309,7 @@ private protected override FastForestRegressionModelParameters TrainModelCore(Tr ConvertData(trainData); TrainCore(ch); } - return new FastForestRegressionModelParameters(Host, TrainedEnsemble, FeatureCount, InnerOptions, FastTreeTrainerOptions.QuantileSampleCount); + return new FastForestRegressionModelParameters(Host, TrainedEnsemble, FeatureCount, InnerOptions, FastTreeTrainerOptions.NumberOfQuantileSamples); } private protected override void PrepareLabels(IChannel ch)
diff --git a/src/Microsoft.ML.FastTree/RegressionTree.cs b/src/Microsoft.ML.FastTree/RegressionTree.cs index 3d1bfbc91f..0645a9f57f 100644 --- a/src/Microsoft.ML.FastTree/RegressionTree.cs +++ b/src/Microsoft.ML.FastTree/RegressionTree.cs @@ -21,11 +21,11 @@ public abstract class RegressionTreeBase private readonly InternalRegressionTree _tree; /// - /// See <see cref="LteChild"/>. + /// See <see cref="LeftChild"/>. /// private readonly ImmutableArray<int> _lteChild; /// - /// See <see cref="GtChild"/>. + /// See <see cref="RightChild"/>. /// private readonly ImmutableArray<int> _gtChild; /// @@ -50,9 +50,9 @@ public abstract class RegressionTreeBase private readonly ImmutableArray<double> _splitGains; /// - /// <see cref="LteChild"/>[i] is the i-th node's child index used when - /// (1) the numerical feature indexed by <see cref="NumericalSplitFeatureIndexes"/>[i] is less than the - /// threshold <see cref="NumericalSplitThresholds"/>[i], or + /// <see cref="LeftChild"/>[i] is the i-th node's child index used when + /// (1) the numerical feature indexed by <see cref="NumericalSplitFeatureIndexes"/>[i] is less than or equal + /// to the threshold <see cref="NumericalSplitThresholds"/>[i], or /// (2) the categorical features indexed by <see cref="GetCategoricalSplitFeaturesAt(int)"/>'s /// returned value with nodeIndex=i is NOT a sub-set of <see cref="GetCategoricalCategoricalSplitFeatureRangeAt(int)"/> with /// nodeIndex=i. /// @@ -63,14 +63,14 @@ public abstract class RegressionTreeBase /// bitwise complement operator in C#; for details, see /// https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/operators/bitwise-complement-operator. /// - public IReadOnlyList<int> LteChild => _lteChild; + public IReadOnlyList<int> LeftChild => _lteChild; /// - /// <see cref="GtChild"/>[i] is the i-th node's child index used when the two conditions, (1) and (2), - /// described in <see cref="LteChild"/>'s document are not true. Its return value follows the format - /// used in <see cref="LteChild"/>. + /// <see cref="RightChild"/>[i] is the i-th node's child index used when the two conditions, (1) and (2), + /// described in <see cref="LeftChild"/>'s document are not true. Its return value follows the format + /// used in <see cref="LeftChild"/>. /// - public IReadOnlyList<int> GtChild => _gtChild; + public IReadOnlyList<int> RightChild => _gtChild; /// /// <see cref="NumericalSplitFeatureIndexes"/>[i] is the feature index used in the splitting function of the @@ -99,13 +99,13 @@ public abstract class RegressionTreeBase /// /// Return categorical thresholds used at node indexed by nodeIndex. If the considered input feature does NOT /// match any of the values returned by <see cref="GetCategoricalSplitFeaturesAt(int)"/>, we call it a - /// less-than-threshold event and therefore <see cref="LteChild"/>[nodeIndex] is the child node that input + /// less-than-threshold event and therefore <see cref="LeftChild"/>[nodeIndex] is the child node that input /// should go next. The returned value is valid only if <see cref="CategoricalSplitFlags"/>[nodeIndex] is true. /// public IReadOnlyList<int> GetCategoricalSplitFeaturesAt(int nodeIndex) { - if (nodeIndex < 0 || nodeIndex >= NumNodes) - throw Contracts.Except($"The input index, {nodeIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumNodes} (exclusive)."); + if (nodeIndex < 0 || nodeIndex >= NumberOfNodes) - throw Contracts.Except($"The input index, {nodeIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumNodes} (exclusive)."); + throw Contracts.Except($"The input index, {nodeIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumberOfNodes} (exclusive)."); if (_tree.CategoricalSplitFeatures == null || _tree.CategoricalSplitFeatures[nodeIndex] == null) return new List<int>(); // Zero-length vector. @@ -121,8 +121,8 @@ public IReadOnlyList<int> GetCategoricalSplitFeaturesAt(int nodeIndex) /// public IReadOnlyList<int> GetCategoricalCategoricalSplitFeatureRangeAt(int nodeIndex) { - if (nodeIndex < 0 || nodeIndex >= NumNodes) - throw Contracts.Except($"The input node index, {nodeIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumNodes} (exclusive)."); + if (nodeIndex < 0 || nodeIndex >= NumberOfNodes) + throw Contracts.Except($"The input node index, {nodeIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumberOfNodes} (exclusive)."); if (_tree.CategoricalSplitFeatureRanges == null || _tree.CategoricalSplitFeatureRanges[nodeIndex] == null) return new List<int>(); // Zero-length vector. @@ -136,13 +136,13 @@ public IReadOnlyList<int> GetCategoricalCategoricalSplitFeatureRangeAt(int nodeI public IReadOnlyList<double> SplitGains => _splitGains; /// - /// Number of leaves in the tree. Note that <see cref="NumLeaves"/> does not take non-leaf nodes into account. + /// Number of leaves in the tree. Note that <see cref="NumberOfLeaves"/> does not take non-leaf nodes into account. /// - public int NumLeaves => _tree.NumLeaves; + public int NumberOfLeaves => _tree.NumLeaves; /// /// Number of nodes in the tree. This doesn't include any leaves. For example, a tree with node0->node1, - /// node0->leaf3, node1->leaf1, node1->leaf2, <see cref="NumNodes"/> and <see cref="NumLeaves"/> should + /// node0->leaf3, node1->leaf1, node1->leaf2, <see cref="NumberOfNodes"/> and <see cref="NumberOfLeaves"/> should /// be 2 and 3, respectively. /// // A visualization of the example mentioned in this doc string. // node0 // / \ // node1 leaf3 // / \ // leaf1 leaf2 // The index of leaf starts with 1 because internally we use "-1" as the 1st leaf's index, "-2" for the 2nd leaf's index, and so on. - public int NumNodes => _tree.NumNodes; + public int NumberOfNodes => _tree.NumNodes; internal RegressionTreeBase(InternalRegressionTree tree) { @@ -209,8 +209,8 @@ public sealed class QuantileRegressionTree : RegressionTreeBase /// <returns>Training labels</returns> public IReadOnlyList<float> GetLeafSamplesAt(int leafIndex) { - if (leafIndex < 0 || leafIndex >= NumLeaves) - throw Contracts.Except($"The input leaf index, {leafIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumLeaves} (exclusive)."); + if (leafIndex < 0 || leafIndex >= NumberOfLeaves) + throw Contracts.Except($"The input leaf index, {leafIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumberOfLeaves} (exclusive)."); // _leafSample always contains valid values assigned in constructor. return _leafSamples[leafIndex]; @@ -225,8 +225,8 @@ public IReadOnlyList<float> GetLeafSamplesAt(int leafIndex) /// <returns>Training labels' weights</returns> public IReadOnlyList<float> GetLeafSampleWeightsAt(int leafIndex) { - if (leafIndex < 0 || leafIndex >= NumLeaves) - throw Contracts.Except($"The input leaf index, {leafIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumLeaves} (exclusive)."); + if (leafIndex < 0 || leafIndex >= NumberOfLeaves) + throw Contracts.Except($"The input leaf index, {leafIndex}, is invalid. Its valid range is from 0 (inclusive) to {NumberOfLeaves} (exclusive)."); // _leafSampleWeights always contains valid values assigned in constructor. return _leafSampleWeights[leafIndex];
diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 7d33a01796..79c626c470 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; using Microsoft.ML.Data; using Microsoft.ML.Trainers.FastTree; @@ -20,22 +19,22 @@ public static class TreeExtensions /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of datapoints allowed in a leaf of a regression tree, out of the subsampled data. + /// Total number of decision trees to create in the ensemble. + /// The maximum number of leaves per decision tree. + /// The minimal number of data points allowed in a leaf of a regression tree, out of the subsampled data. /// The learning rate.
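// A minimal sketch of the renamed regression FastTree entry point declared just below,
// followed by a walk over one trained tree through the renamed RegressionTreeBase
// members above. The column names, the trainData variable, and the TrainedTreeEnsemble
// accessor used to reach the first tree are illustrative assumptions, not part of this diff.
var mlContext = new MLContext();
var pipeline = mlContext.Transforms.Concatenate("Features", "X1", "X2")
    .Append(mlContext.Regression.Trainers.FastTree(
        labelColumnName: "Label", featureColumnName: "Features",
        numberOfLeaves: 20, numberOfTrees: 100,
        minimumExampleCountPerLeaf: 10, learningRate: 0.2));
var model = pipeline.Fit(trainData);
var tree = model.LastTransformer.Model.TrainedTreeEnsemble.Trees[0]; // assumed accessor
for (int node = 0; node < tree.NumberOfNodes; node++)
{
    // A negative child index denotes a leaf; ~child recovers the leaf index,
    // whose prediction is tree.LeafValues[~child].
    int left = tree.LeftChild[node];
    string target = left < 0 ? $"leaf {~left}" : $"node {left}";
    Console.WriteLine($"node {node}: left -> {target}, right -> {tree.RightChild[node]}");
}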
public static FastTreeRegressionTrainer FastTree(this RegressionCatalog.RegressionTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new FastTreeRegressionTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numLeaves, numTrees, minDatapointsInLeaves, learningRate); + return new FastTreeRegressionTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate); } /// @@ -60,22 +59,22 @@ public static FastTreeRegressionTrainer FastTree(this RegressionCatalog.Regressi /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of datapoints allowed in a leaf of the tree, out of the subsampled data. + /// Total number of decision trees to create in the ensemble. + /// The maximum number of leaves per decision tree. + /// The minimal number of datapoints allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. public static FastTreeBinaryClassificationTrainer FastTree(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new FastTreeBinaryClassificationTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numLeaves, numTrees, minDatapointsInLeaves, learningRate); + return new FastTreeBinaryClassificationTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate); } /// @@ -101,23 +100,23 @@ public static FastTreeBinaryClassificationTrainer FastTree(this BinaryClassifica /// The name of the feature column. /// The name of the group column. /// The name of the example weight column (optional). - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of datapoints allowed in a leaf of the tree, out of the subsampled data. + /// Total number of decision trees to create in the ensemble. + /// The maximum number of leaves per decision tree. 
+ /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. public static FastTreeRankingTrainer FastTree(this RankingCatalog.RankingTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string rowGroupColumnName = DefaultColumnNames.GroupId, string exampleWeightColumnName = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new FastTreeRankingTrainer(env, labelColumnName, featureColumnName, rowGroupColumnName, exampleWeightColumnName, numLeaves, numTrees, minDatapointsInLeaves, learningRate); + return new FastTreeRankingTrainer(env, labelColumnName, featureColumnName, rowGroupColumnName, exampleWeightColumnName, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate); } /// @@ -142,20 +141,20 @@ public static FastTreeRankingTrainer FastTree(this RankingCatalog.RankingTrainer /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of iterations to use in learning the features. + /// The number of iterations to use in learning the features. + /// The maximum number of bins to use to approximate features. /// The learning rate. GAMs work best with a small learning rate. - /// The maximum number of bins to use to approximate features. public static BinaryClassificationGamTrainer GeneralizedAdditiveModels(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numIterations = GamDefaults.NumIterations, - double learningRate = GamDefaults.LearningRates, - int maxBins = GamDefaults.MaxBins) + int numberOfIterations = GamDefaults.NumberOfIterations, + int maximumBinCountPerFeature = GamDefaults.MaximumBinCountPerFeature, + double learningRate = GamDefaults.LearningRate) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new BinaryClassificationGamTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numIterations, learningRate, maxBins); + return new BinaryClassificationGamTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfIterations, learningRate, maximumBinCountPerFeature); } /// @@ -178,20 +177,20 @@ public static BinaryClassificationGamTrainer GeneralizedAdditiveModels(this Bina /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of iterations to use in learning the features. + /// The number of iterations to use in learning the features. + /// The maximum number of bins to use to approximate features. /// The learning rate. GAMs work best with a small learning rate. - /// The maximum number of bins to use to approximate features. 
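// A minimal sketch of the renamed regression GAM entry point declared just below, using
// the renamed GamDefaults values (9500 iterations, 255 bins, 0.002 learning rate); the
// column names and the featureNames array are illustrative assumptions. Note that this
// overload spells its bin-count parameter maxBinCountPerFeature.
var mlContext = new MLContext();
var gamPipeline = mlContext.Transforms.Concatenate("Features", featureNames)
    .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels(
        labelColumnName: "Label",
        featureColumnName: "Features",
        numberOfIterations: 9500,
        maxBinCountPerFeature: 255,
        learningRate: 0.002));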
public static RegressionGamTrainer GeneralizedAdditiveModels(this RegressionCatalog.RegressionTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numIterations = GamDefaults.NumIterations, - double learningRate = GamDefaults.LearningRates, - int maxBins = GamDefaults.MaxBins) + int numberOfIterations = GamDefaults.NumberOfIterations, + int maxBinCountPerFeature = GamDefaults.MaximumBinCountPerFeature, + double learningRate = GamDefaults.LearningRate) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new RegressionGamTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numIterations, learningRate, maxBins); + return new RegressionGamTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfIterations, learningRate, maxBinCountPerFeature); } /// @@ -222,10 +221,10 @@ public static FastTreeTweedieTrainer FastTreeTweedie(this RegressionCatalog.Regr string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates) + int numLeaves = Defaults.NumberOfLeaves, + int numTrees = Defaults.NumberOfTrees, + int minDatapointsInLeaves = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); @@ -261,9 +260,9 @@ public static FastForestRegression FastForest(this RegressionCatalog.RegressionT string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves) + int numLeaves = Defaults.NumberOfLeaves, + int numTrees = Defaults.NumberOfTrees, + int minDatapointsInLeaves = Defaults.MinimumExampleCountPerLeaf) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); @@ -299,9 +298,9 @@ public static FastForestClassification FastForest(this BinaryClassificationCatal string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves) + int numLeaves = Defaults.NumberOfLeaves, + int numTrees = Defaults.NumberOfTrees, + int minDatapointsInLeaves = Defaults.MinimumExampleCountPerLeaf) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); diff --git a/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs b/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs index 49e12c4098..8e2b08d3d7 100644 --- a/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs @@ -21,9 +21,9 @@ public static class TreeRegressionExtensions /// The label column. /// The features column. /// The optional weights column. - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. 
- /// The minimal number of datapoints allowed in a leaf of a regression tree, out of the subsampled data. + /// Total number of decision trees to create in the ensemble. + /// The maximum number of leaves per decision tree. + /// The minimal number of data points allowed in a leaf of a regression tree, out of the subsampled data. /// The learning rate. /// A delegate that is called every time the /// method is called on the @@ -39,19 +39,19 @@ public static class TreeRegressionExtensions /// public static Scalar FastTree(this RegressionCatalog.RegressionTrainers catalog, Scalar label, Vector features, Scalar weights = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate, Action onFit = null) { - CheckUserValues(label, features, weights, numLeaves, numTrees, minDatapointsInLeaves, learningRate, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate, onFit); var rec = new TrainerEstimatorReconciler.Regression( (env, labelName, featuresName, weightsName) => { - var trainer = new FastTreeRegressionTrainer(env, labelName, featuresName, weightsName, numLeaves, - numTrees, minDatapointsInLeaves, learningRate); + var trainer = new FastTreeRegressionTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, + numberOfTrees, minimumExampleCountPerLeaf, learningRate); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); return trainer; @@ -113,9 +113,9 @@ public static Scalar FastTree(this RegressionCatalog.RegressionTrainers c /// The label column. /// The features column. /// The optional weights column. - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of datapoints allowed in a leaf of the tree, out of the subsampled data. + /// Total number of decision trees to create in the ensemble. + /// The maximum number of leaves per decision tree. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. 
/// A delegate that is called every time the /// method is called on the @@ -132,19 +132,19 @@ public static Scalar FastTree(this RegressionCatalog.RegressionTrainers c /// public static (Scalar score, Scalar probability, Scalar predictedLabel) FastTree(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, Scalar label, Vector features, Scalar weights = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate, Action> onFit = null) { - CheckUserValues(label, features, weights, numLeaves, numTrees, minDatapointsInLeaves, learningRate, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate, onFit); var rec = new TrainerEstimatorReconciler.BinaryClassifier( (env, labelName, featuresName, weightsName) => { - var trainer = new FastTreeBinaryClassificationTrainer(env, labelName, featuresName, weightsName, numLeaves, - numTrees, minDatapointsInLeaves, learningRate); + var trainer = new FastTreeBinaryClassificationTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, + numberOfTrees, minimumExampleCountPerLeaf, learningRate); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); @@ -212,9 +212,9 @@ public static (Scalar score, Scalar probability, Scalar pred /// The features column. /// The groupId column. /// The optional weights column. - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of datapoints allowed in a leaf of a regression tree, out of the subsampled data. + /// Total number of decision trees to create in the ensemble. + /// The maximum number of leaves per decision tree. + /// The minimal number of data points allowed in a leaf of a regression tree, out of the subsampled data. /// The learning rate. /// A delegate that is called every time the /// method is called on the @@ -224,19 +224,19 @@ public static (Scalar score, Scalar probability, Scalar pred /// The Score output column indicating the predicted value. 
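// A minimal sketch of the dynamic counterpart to the static ranking extension declared
// just below, exercising the renamed arguments from TreeTrainersCatalog.cs above; the
// column names are illustrative assumptions.
var mlContext = new MLContext();
var rankingEstimator = mlContext.Ranking.Trainers.FastTree(
    labelColumnName: "Label",
    featureColumnName: "Features",
    rowGroupColumnName: "GroupId",
    numberOfLeaves: 20,
    numberOfTrees: 100,
    minimumExampleCountPerLeaf: 10,
    learningRate: 0.2);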
public static Scalar FastTree(this RankingCatalog.RankingTrainers catalog, Scalar label, Vector features, Key groupId, Scalar weights = null, - int numLeaves = Defaults.NumLeaves, - int numTrees = Defaults.NumTrees, - int minDatapointsInLeaves = Defaults.MinDocumentsInLeaves, - double learningRate = Defaults.LearningRates, + int numberOfLeaves = Defaults.NumberOfLeaves, + int numberOfTrees = Defaults.NumberOfTrees, + int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, + double learningRate = Defaults.LearningRate, Action onFit = null) { - CheckUserValues(label, features, weights, numLeaves, numTrees, minDatapointsInLeaves, learningRate, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate, onFit); var rec = new TrainerEstimatorReconciler.Ranker( (env, labelName, featuresName, groupIdName, weightsName) => { - var trainer = new FastTreeRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numLeaves, - numTrees, minDatapointsInLeaves, learningRate); + var trainer = new FastTreeRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numberOfLeaves, + numberOfTrees, minimumExampleCountPerLeaf, learningRate); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); return trainer; @@ -287,18 +287,18 @@ public static Scalar FastTree(this RankingCatalog.RankingTrainers c } internal static void CheckUserValues(PipelineColumn label, Vector features, Scalar weights, - int numLeaves, - int numTrees, - int minDatapointsInLeaves, + int numberOfLeaves, + int numberOfTrees, + int minimumExampleCountPerLeaf, double learningRate, Delegate onFit) { Contracts.CheckValue(label, nameof(label)); Contracts.CheckValue(features, nameof(features)); Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(numLeaves >= 2, nameof(numLeaves), "Must be at least 2."); - Contracts.CheckParam(numTrees > 0, nameof(numTrees), "Must be positive"); - Contracts.CheckParam(minDatapointsInLeaves > 0, nameof(minDatapointsInLeaves), "Must be positive"); + Contracts.CheckParam(numberOfLeaves >= 2, nameof(numberOfLeaves), "Must be at least 2."); + Contracts.CheckParam(numberOfTrees > 0, nameof(numberOfTrees), "Must be positive"); + Contracts.CheckParam(minimumExampleCountPerLeaf > 0, nameof(minimumExampleCountPerLeaf), "Must be positive"); Contracts.CheckParam(learningRate > 0, nameof(learningRate), "Must be positive"); Contracts.CheckValueOrNull(onFit); } diff --git a/src/Microsoft.ML.Sweeper/Algorithms/SmacSweeper.cs b/src/Microsoft.ML.Sweeper/Algorithms/SmacSweeper.cs index 123eafddac..772a8d07ad 100644 --- a/src/Microsoft.ML.Sweeper/Algorithms/SmacSweeper.cs +++ b/src/Microsoft.ML.Sweeper/Algorithms/SmacSweeper.cs @@ -137,8 +137,8 @@ private FastForestRegressionModelParameters FitModel(IEnumerable pre new FastForestRegression.Options { FeatureFraction = _args.SplitRatio, - NumTrees = _args.NumOfTrees, - MinDocumentsInLeafs = _args.NMinForSplit, + NumberOfTrees = _args.NumOfTrees, + MinimumExampleCountPerLeaf = _args.NMinForSplit, LabelColumnName = DefaultColumnNames.Label, FeatureColumnName = DefaultColumnNames.Features, }); diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 8368b448e2..b753141da4 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -5094,7 +5094,7 @@ "ShortName": "ff", "Inputs": [ { - "Name": 
"NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -5125,7 +5125,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -5156,9 +5156,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -5251,7 +5251,7 @@ "Default": "Auto" }, { - "Name": "MaxTreeOutput", + "Name": "MaximumOutputMagnitudePerTree", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -5286,7 +5286,7 @@ "Default": 1000000 }, { - "Name": "QuantileSampleCount", + "Name": "NumberOfQuantileSamples", "Type": "Int", "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", "Aliases": [ @@ -5315,7 +5315,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -5327,7 +5327,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -5339,7 +5339,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -5411,7 +5411,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -5423,7 +5423,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -5435,9 +5435,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -5447,9 +5447,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -5490,7 +5490,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -5562,7 +5562,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -5598,7 +5598,7 @@ "Default": 1 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -5610,7 +5610,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -5670,18 +5670,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -5744,7 +5732,7 @@ "ShortName": "ffr", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -5775,7 +5763,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -5806,9 +5794,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -5910,7 +5898,7 @@ "Default": false }, { - "Name": "QuantileSampleCount", + "Name": "NumberOfQuantileSamples", "Type": "Int", "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", "Aliases": [ @@ -5939,7 +5927,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -5951,7 +5939,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -5963,7 +5951,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": 
"FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -6035,7 +6023,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -6047,7 +6035,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -6059,9 +6047,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -6071,9 +6059,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -6114,7 +6102,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -6186,7 +6174,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -6222,7 +6210,7 @@ "Default": 1 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -6234,7 +6222,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -6294,18 +6282,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -6368,7 +6344,7 @@ "ShortName": "ftc", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -6399,7 +6375,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -6430,9 +6406,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -6462,7 +6438,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -6579,7 +6555,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + 
"Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -6591,7 +6567,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -6646,7 +6622,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -6761,7 +6737,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -6823,7 +6799,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -6850,7 +6826,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -6862,7 +6838,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -6874,7 +6850,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -6946,7 +6922,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -6958,7 +6934,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -6970,9 +6946,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -6982,9 +6958,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -7025,7 +7001,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -7097,7 +7073,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -7133,7 +7109,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -7145,7 +7121,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -7205,18 +7181,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -7279,7 +7243,7 @@ "ShortName": "ftrank", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -7310,7 +7274,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -7341,9 +7305,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -7373,7 +7337,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -7455,7 +7419,10 @@ }, { "Name": "CustomGains", - "Type": "String", + "Type": { + "Kind": "Array", + "ItemType": "Float" + }, "Desc": "Comma seperated list of gains associated to each relevance label.", "Aliases": [ "gains" @@ -7463,10 +7430,16 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "0,3,7,15,31" + "Default": [ + 0.0, + 3.0, + 7.0, + 15.0, + 31.0 + ] }, { - "Name": "TrainDcg", + "Name": "UseDcg", "Type": "Bool", "Desc": "Train DCG 
instead of NDCG", "Aliases": [ @@ -7490,7 +7463,7 @@ "Default": "DescendingStablePessimistic" }, { - "Name": "LambdaMartMaxTruncation", + "Name": "NdcgTruncationLevel", "Type": "Int", "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", "Aliases": [ @@ -7571,7 +7544,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -7583,7 +7556,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -7753,7 +7726,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -7815,7 +7788,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -7842,7 +7815,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -7854,7 +7827,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -7866,7 +7839,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -7938,7 +7911,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -7950,7 +7923,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -7962,9 +7935,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -7974,9 +7947,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -8017,7 +7990,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -8089,7 +8062,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -8125,7 +8098,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -8137,7 +8110,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -8197,18 +8170,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -8271,7 +8232,7 @@ "ShortName": "ftr", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -8302,7 +8263,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -8333,9 +8294,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -8365,7 +8326,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -8470,7 +8431,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -8482,7 +8443,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -8652,7 +8613,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": 
"Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -8714,7 +8675,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -8741,7 +8702,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -8753,7 +8714,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -8765,7 +8726,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -8837,7 +8798,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -8849,7 +8810,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -8861,9 +8822,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -8873,9 +8834,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -8916,7 +8877,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -8988,7 +8949,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -9024,7 +8985,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -9036,7 +8997,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -9096,18 +9057,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -9170,7 +9119,7 @@ "ShortName": "fttweedie", "Inputs": [ { - "Name": "NumTrees", + 
"Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -9201,7 +9150,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -9232,9 +9181,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -9264,7 +9213,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -9378,7 +9327,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -9390,7 +9339,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -9445,7 +9394,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -9560,7 +9509,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -9622,7 +9571,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -9649,7 +9598,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -9661,7 +9610,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -9673,7 +9622,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -9745,7 +9694,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -9757,7 +9706,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -9769,9 +9718,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -9781,9 +9730,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -9824,7 +9773,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -9896,7 +9845,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -9932,7 +9881,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -9944,7 +9893,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -10004,18 +9953,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -10345,7 +10282,7 @@ "ShortName": "gam", "Inputs": [ { - "Name": "NumIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Total number of iterations over all features", "Aliases": [ @@ -10388,7 +10325,7 @@ "Default": "Features" }, { - "Name": "MinDocuments", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of training instances required to form a partition", "Aliases": [ @@ -10420,7 +10357,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -10525,7 +10462,7 @@ "Default": 0 }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -10549,7 +10486,7 @@ "Default": null }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -10561,7 +10498,7 @@ "Default": 255 }, { - "Name": "MaxOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single output", "Aliases": [ @@ -10585,7 +10522,7 @@ "Default": 1 }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -10645,7 +10582,7 @@ "ShortName": "gamr", "Inputs": [ { - "Name": "NumIterations", + "Name": 
"NumberOfIterations", "Type": "Int", "Desc": "Total number of iterations over all features", "Aliases": [ @@ -10688,7 +10625,7 @@ "Default": "Features" }, { - "Name": "MinDocuments", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of training instances required to form a partition", "Aliases": [ @@ -10720,7 +10657,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -10825,7 +10762,7 @@ "Default": 0 }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -10849,7 +10786,7 @@ "Default": null }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -10861,7 +10798,7 @@ "Default": 255 }, { - "Name": "MaxOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single output", "Aliases": [ @@ -10885,7 +10822,7 @@ "Default": 1 }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -25101,7 +25038,7 @@ "FriendlyName": "FastTree (Boosted Trees) Classification", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -25132,7 +25069,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -25163,9 +25100,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -25195,7 +25132,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -25312,7 +25249,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -25324,7 +25261,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -25379,7 +25316,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -25494,7 +25431,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -25556,7 +25493,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -25583,7 +25520,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -25595,7 +25532,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": 
"Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -25607,7 +25544,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -25679,7 +25616,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -25691,7 +25628,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -25703,9 +25640,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -25715,9 +25652,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -25758,7 +25695,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -25830,7 +25767,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -25866,7 +25803,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -25878,7 +25815,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -25938,18 +25875,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -25994,7 +25919,7 @@ "FriendlyName": "FastTree (Boosted Trees) Ranking", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -26025,7 +25950,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -26056,9 +25981,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -26088,7 
+26013,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -26170,7 +26095,10 @@ }, { "Name": "CustomGains", - "Type": "String", + "Type": { + "Kind": "Array", + "ItemType": "Float" + }, "Desc": "Comma seperated list of gains associated to each relevance label.", "Aliases": [ "gains" @@ -26178,10 +26106,16 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "0,3,7,15,31" + "Default": [ + 0.0, + 3.0, + 7.0, + 15.0, + 31.0 + ] }, { - "Name": "TrainDcg", + "Name": "UseDcg", "Type": "Bool", "Desc": "Train DCG instead of NDCG", "Aliases": [ @@ -26205,7 +26139,7 @@ "Default": "DescendingStablePessimistic" }, { - "Name": "LambdaMartMaxTruncation", + "Name": "NdcgTruncationLevel", "Type": "Int", "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", "Aliases": [ @@ -26286,7 +26220,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -26298,7 +26232,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -26468,7 +26402,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -26530,7 +26464,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -26557,7 +26491,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -26569,7 +26503,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -26581,7 +26515,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -26653,7 +26587,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -26665,7 +26599,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -26677,9 +26611,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -26689,9 +26623,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -26732,7 +26666,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -26804,7 +26738,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -26840,7 +26774,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -26852,7 +26786,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -26912,18 +26846,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -26968,7 +26890,7 @@ "FriendlyName": "FastTree (Boosted Trees) Regression", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -26999,7 +26921,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -27030,9 +26952,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -27062,7 +26984,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -27167,7 +27089,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -27179,7 +27101,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -27349,7 +27271,7 @@ "Default": false }, { - 
"Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -27411,7 +27333,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -27438,7 +27360,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -27450,7 +27372,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -27462,7 +27384,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -27534,7 +27456,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -27546,7 +27468,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -27558,9 +27480,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -27570,9 +27492,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -27613,7 +27535,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -27685,7 +27607,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -27721,7 +27643,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -27733,7 +27655,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -27793,18 +27715,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ 
-27849,7 +27759,7 @@ "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -27880,7 +27790,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -27911,9 +27821,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -27943,7 +27853,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -28057,7 +27967,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -28069,7 +27979,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -28124,7 +28034,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -28239,7 +28149,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -28301,7 +28211,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -28328,7 +28238,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -28340,7 +28250,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -28352,7 +28262,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -28424,7 +28334,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -28436,7 +28346,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -28448,9 +28358,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -28460,9 +28370,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -28503,7 +28413,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -28575,7 +28485,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -28611,7 +28521,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -28623,7 +28533,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -28683,18 +28593,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 54dadb5b51..a94ca5c5fd 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -3528,8 +3528,8 @@ public void EntryPointTreeLeafFeaturizer() var fastTree = Trainers.FastTree.FastTree.TrainBinary(Env, new FastTreeBinaryClassificationTrainer.Options { FeatureColumnName = "Features", - NumTrees = 5, - NumLeaves = 4, + NumberOfTrees = 5, + NumberOfLeaves = 4, LabelColumnName = DefaultColumnNames.Label, TrainingData = concat.OutputData }); @@ -4961,18 +4961,18 @@ public void TestCrossValidationMacroWithNonDefaultNames() }, { 'Name': 'Trainers.FastTreeRanker', 'Inputs': { - 'CustomGains': '0,3,7,15,31', - 'TrainDcg': false, + 'CustomGains': [0,3,7,15,31], + 'UseDcg': false, 'SortingAlgorithm': 'DescendingStablePessimistic', - 'LambdaMartMaxTruncation': 100, + 'NdcgTruncationLevel': 100, 'ShiftedNdcg': false, 'CostFunctionParam': 'w', 'DistanceWeight2': false, 'NormalizeQueryLambdas': false, 'BestStepRankingRegressionTrees': false, 'UseLineSearch': false, - 'NumPostBracketSteps': 0, - 'MinStepSize': 0.0, + 'MaximumNumberOfLineSearchSteps': 0, + 'MinimumStepSize': 0.0, 'OptimizationAlgorithm': 'GradientDescent', 'EarlyStoppingRule': null, 
'EarlyStoppingMetrics': 1, @@ -4980,12 +4980,12 @@ public void TestCrossValidationMacroWithNonDefaultNames() 'UseTolerantPruning': false, 'PruningThreshold': 0.004, 'PruningWindowSize': 5, - 'LearningRates': 0.2, + 'LearningRate': 0.2, 'Shrinkage': 1.0, 'DropoutRate': 0.0, 'GetDerivativesSampleRate': 1, 'WriteLastEnsemble': false, - 'MaxTreeOutput': 100.0, + 'MaximumTreeOutput': 100.0, 'RandomStart': false, 'FilterZeroLambdas': false, 'BaselineScoresFormula': null, @@ -4995,39 +4995,38 @@ public void TestCrossValidationMacroWithNonDefaultNames() 'Name': 'Single', 'Settings': {} }, - 'NumThreads': 1, - 'RngSeed': 123, - 'FeatureSelectSeed': 123, + 'NumberOfThreads': 1, + 'Seed': 123, + 'FeatureSelectionSeed': 123, 'EntropyCoefficient': 0.0, 'HistogramPoolSize': -1, 'DiskTranspose': null, 'FeatureFlocks': true, 'CategoricalSplit': false, - 'MaxCategoricalGroupsPerNode': 64, - 'MaxCategoricalSplitPoints': 64, - 'MinDocsPercentageForCategoricalSplit': 0.001, - 'MinDocsForCategoricalSplit': 100, + 'MaximumCategoricalGroupCountPerNode': 64, + 'MaximumCategoricalSplitPointCount': 64, + 'MinimumExampleFractionForCategoricalSplit': 0.001, + 'MinimumExamplesForCategoricalSplit': 100, 'Bias': 0.0, 'Bundling': 'None', - 'MaxBins': 255, + 'MaximumBinCountPerFeature': 255, 'SparsifyThreshold': 0.7, 'FeatureFirstUsePenalty': 0.0, 'FeatureReusePenalty': 0.0, 'GainConfidenceLevel': 0.0, 'SoftmaxTemperature': 0.0, - 'ExecutionTimes': false, - 'NumLeaves': 20, - 'MinDocumentsInLeafs': 10, - 'NumTrees': 100, + 'ExecutionTime': false, + 'NumberOfLeaves': 20, + 'MinimumExampleCountPerLeaf': 10, + 'NumberOfTrees': 100, 'FeatureFraction': 1.0, 'BaggingSize': 0, - 'BaggingTrainFraction': 0.7, - 'SplitFraction': 1.0, + 'BaggingExampleFraction': 0.7, + 'FeatureFractionPerSplit': 1.0, 'Smoothing': 0.0, 'AllowEmptyTrees': true, 'FeatureCompressionLevel': 1, 'CompressEnsemble': false, - 'MaxTreesAfterCompression': -1, 'PrintTestGraph': false, 'PrintTrainValidGraph': false, 'TestFrequency': 2147483647, diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index 9b20efa06c..7bf6d53db0 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -79,7 +79,7 @@ module SmokeTest1 = let data = ml.Data.LoadFromTextFile(testDataPath, hasHeader = true, allowQuoting = true) let pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Append(ml.BinaryClassification.Trainers.FastTree(numLeaves = 5, numTrees = 5)) + .Append(ml.BinaryClassification.Trainers.FastTree(numberOfLeaves = 5, numberOfTrees = 5)) let model = pipeline.Fit(data) @@ -119,7 +119,7 @@ module SmokeTest2 = let data = ml.Data.LoadFromTextFile(testDataPath, hasHeader = true, allowQuoting = true) let pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Append(ml.BinaryClassification.Trainers.FastTree(numLeaves = 5, numTrees = 5)) + .Append(ml.BinaryClassification.Trainers.FastTree(numberOfLeaves = 5, numberOfTrees = 5)) let model = pipeline.Fit(data) @@ -156,7 +156,7 @@ module SmokeTest3 = let data = ml.Data.LoadFromTextFile(testDataPath, hasHeader = true, allowQuoting = true) let pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Append(ml.BinaryClassification.Trainers.FastTree(numLeaves = 5, numTrees = 5)) + .Append(ml.BinaryClassification.Trainers.FastTree(numberOfLeaves = 5, numberOfTrees = 5)) let model = pipeline.Fit(data) diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs 
b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 7abd1f3241..49d510593f 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -179,7 +179,7 @@ public void TrainAndEvaluateRanking() // Create a training pipeline. var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) - .Append(mlContext.Ranking.Trainers.FastTree(new FastTreeRankingTrainer.Options { NumThreads = 1 })); + .Append(mlContext.Ranking.Trainers.FastTree(new FastTreeRankingTrainer.Options { NumberOfThreads = 1 })); // Train the model. var model = pipeline.Fit(data); @@ -247,7 +247,7 @@ public void TrainAndEvaluateRegression() "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) - .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumThreads = 1 })); + .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumberOfThreads = 1 })); // Train the model. var model = pipeline.Fit(data); diff --git a/test/Microsoft.ML.Functional.Tests/Explainability.cs b/test/Microsoft.ML.Functional.Tests/Explainability.cs index 1dc426a436..d1c678da02 100644 --- a/test/Microsoft.ML.Functional.Tests/Explainability.cs +++ b/test/Microsoft.ML.Functional.Tests/Explainability.cs @@ -258,7 +258,7 @@ public void LocalFeatureImportanceForGamModel() // Create a pipeline to train on the housing data. var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) - .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels(numIterations: 2)); + .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels(numberOfIterations: 2)); // Fit the pipeline and transform the data. var model = pipeline.Fit(data); diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index 32694b6d24..cb0cacfc7e 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -82,8 +82,8 @@ public void TrainWithValidationSet() // Train the model with a validation set. var trainedModel = mlContext.Regression.Trainers.FastTree(new Trainers.FastTree.FastTreeRegressionTrainer.Options { - NumTrees = 2, - EarlyStoppingMetrics = 2, + NumberOfTrees = 2, + EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, EarlyStoppingRule = new GLEarlyStoppingCriterion.Options() }) .Fit(trainData: preprocessedTrainData, validationData: preprocessedValidData); diff --git a/test/Microsoft.ML.Predictor.Tests/TestGamPublicInterfaces.cs b/test/Microsoft.ML.Predictor.Tests/TestGamPublicInterfaces.cs index d1fdf4d8aa..1a6a5cf90b 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestGamPublicInterfaces.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestGamPublicInterfaces.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using System.Linq; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Trainers.FastTree; using Xunit; @@ -33,20 +34,20 @@ public void TestGamDirectInstatiation() var gam = new RegressionGamModelParameters(mlContext, binUpperBounds, binEffects, intercept); // Check that the model has the right number of shape functions - Assert.Equal(binUpperBounds.Length, gam.NumShapeFunctions); + Assert.Equal(binUpperBounds.Length, gam.NumberOfShapeFunctions); // Check the intercept - Assert.Equal(intercept, gam.Intercept, 6); + Assert.Equal(intercept, gam.Bias, 6); // Check that the binUpperBounds were made correctly CheckArrayOfArrayEquality(binUpperBounds, gam.GetBinUpperBounds()); - for (int i = 0; i < gam.NumShapeFunctions; i++) - Utils.AreEqual(binUpperBounds[i], gam.GetBinUpperBounds(i)); + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + Utils.AreEqual(binUpperBounds[i], gam.GetBinUpperBounds(i).ToArray()); // Check that the bin effects were made correctly CheckArrayOfArrayEquality(binEffects, gam.GetBinEffects()); - for (int i = 0; i < gam.NumShapeFunctions; i++) - Utils.AreEqual(binEffects[i], gam.GetBinEffects(i)); + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + Utils.AreEqual(binEffects[i], gam.GetBinEffects(i).ToArray()); // Check that the constructor handles null inputs properly Assert.Throws(() => new RegressionGamModelParameters(mlContext, binUpperBounds, null, intercept)); diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index 04c4086425..6fe704e625 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -597,8 +597,8 @@ public void TestTreeEnsembleCombiner() fastTrees[i] = FastTree.TrainBinary(ML, new FastTreeBinaryClassificationTrainer.Options { FeatureColumnName = "Features", - NumTrees = 5, - NumLeaves = 4, + NumberOfTrees = 5, + NumberOfLeaves = 4, LabelColumnName = DefaultColumnNames.Label, TrainingData = dataView }).PredictorModel; @@ -619,8 +619,8 @@ public void TestTreeEnsembleCombinerWithCategoricalSplits() fastTrees[i] = FastTree.TrainBinary(ML, new FastTreeBinaryClassificationTrainer.Options { FeatureColumnName = "Features", - NumTrees = 5, - NumLeaves = 4, + NumberOfTrees = 5, + NumberOfLeaves = 4, CategoricalSplit = true, LabelColumnName = DefaultColumnNames.Label, TrainingData = cat @@ -719,8 +719,8 @@ public void TestEnsembleCombiner() FastTree.TrainBinary(ML, new FastTreeBinaryClassificationTrainer.Options { FeatureColumnName = "Features", - NumTrees = 5, - NumLeaves = 4, + NumberOfTrees = 5, + NumberOfLeaves = 4, LabelColumnName = DefaultColumnNames.Label, TrainingData = dataView }).PredictorModel, diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs index a62355e94a..a4c65f6670 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs @@ -453,8 +453,8 @@ public void FastTreeBinaryClassification() var est = reader.MakeNewEstimator() .Append(r => (r.label, preds: catalog.Trainers.FastTree(r.label, r.features, - numTrees: 10, - numLeaves: 5, + numberOfTrees: 10, + numberOfLeaves: 5, onFit: (p) => { pred = p; }))); var pipe = reader.Append(est); @@ -494,8 +494,8 @@ public void FastTreeRegression() var est = reader.MakeNewEstimator() .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, - numTrees: 10, - numLeaves: 5, + numberOfTrees: 10, + numberOfLeaves: 5, 
onFit: (p) => { pred = p; })));

             var pipe = reader.Append(est);
diff --git a/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs b/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs
index baf8fb6f0c..ac790fecfe 100644
--- a/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs
+++ b/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs
@@ -28,9 +28,9 @@ public void FastTreeRegressionRepresentation()
             var opts = new FastTreeRegressionTrainer.Options()
             {
-                NumTrees = 10,
-                NumLeaves = 5,
-                NumThreads = 1
+                NumberOfTrees = 10,
+                NumberOfLeaves = 5,
+                NumberOfThreads = 1
             };

             FastTreeRegressionModelParameters pred = null;
@@ -51,7 +51,7 @@ public void FastTreeRegressionRepresentation()
             Assert.Equal(10, treeCollection.TreeWeights.Count);
             var trees = treeCollection.Trees;
-            Assert.Equal(4, trees[0].NumNodes);
+            Assert.Equal(4, trees[0].NumberOfNodes);

             // Numerical split. There is no categorical split, so the following vector contains 0-element.
             var categoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(0);
@@ -62,12 +62,12 @@
             Assert.Equal(0, categoricalSplitFeatureRange.Count);

             var expectedGtChild = new int[] { 3, 2, -4, -5 };
-            Assert.Equal(4, trees[0].GtChild.Count);
-            Assert.Equal(expectedGtChild, trees[0].GtChild);
+            Assert.Equal(4, trees[0].RightChild.Count);
+            Assert.Equal(expectedGtChild, trees[0].RightChild);

             var expectedLteChild = new int[] { 1, -1, -3, -2 };
-            Assert.Equal(4, trees[0].LteChild.Count);
-            Assert.Equal(expectedLteChild, trees[0].LteChild);
+            Assert.Equal(4, trees[0].LeftChild.Count);
+            Assert.Equal(expectedLteChild, trees[0].LeftChild);

             var expectedCategoricalSplitFlags = new bool[] { false, false, false, false };
             Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
@@ -82,7 +82,7 @@
             for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
                 Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);

-            Assert.Equal(5, trees[0].NumLeaves);
+            Assert.Equal(5, trees[0].NumberOfLeaves);

             var expectedLeafValues = new double[] { 40.159015006449692, 80.434805844435061, 57.072130551545513, 82.898710076162757, 104.17547955322266 };
             Assert.Equal(5, trees[0].LeafValues.Count);
@@ -108,12 +108,12 @@ public void FastTreeRegressionRepresentationWithCategoricalSplit()
             var opts = new FastTreeRegressionTrainer.Options()
             {
                 CategoricalSplit = true,
-                NumTrees = 3,
-                NumLeaves = 5,
-                NumThreads = 1,
+                NumberOfTrees = 3,
+                NumberOfLeaves = 5,
+                NumberOfThreads = 1,
                 // This is the minimal samples to form a split (i.e., generating two extra nodes/leaves). For a small data set,
                 // we should set a small value. Otherwise, the trained trees could be empty.
- MinDocumentsInLeafs = 2 + MinimumExampleCountPerLeaf = 2 }; var est = reader.MakeNewEstimator() @@ -133,15 +133,15 @@ public void FastTreeRegressionRepresentationWithCategoricalSplit() Assert.Equal(3, treeCollection.TreeWeights.Count); var trees = treeCollection.Trees; - Assert.Equal(4, trees[0].NumNodes); + Assert.Equal(4, trees[0].NumberOfNodes); var expectedGtChild = new int[] { 3, -3, -4, -5 }; - Assert.Equal(4, trees[0].GtChild.Count); - Assert.Equal(expectedGtChild, trees[0].GtChild); + Assert.Equal(4, trees[0].RightChild.Count); + Assert.Equal(expectedGtChild, trees[0].RightChild); var expectedLteChild = new int[] { 1, 2, -1, -2 }; - Assert.Equal(4, trees[0].LteChild.Count); - Assert.Equal(expectedLteChild, trees[0].LteChild); + Assert.Equal(4, trees[0].LeftChild.Count); + Assert.Equal(expectedLteChild, trees[0].LeftChild); var expectedCategoricalSplitFlags = new bool[] { true, true, true, true }; Assert.Equal(4, trees[0].CategoricalSplitFlags.Count); @@ -171,7 +171,7 @@ public void FastTreeRegressionRepresentationWithCategoricalSplit() int[] expectedCounts = { 62, 52, 54, 22 }; int[] expectedStarts = { 5315, 10, 2141, 533 }; int[] expectedEnds = { 5782, 401, 2558, 874 }; - for (int i = 0; i < trees[0].NumNodes; ++i) + for (int i = 0; i < trees[0].NumberOfNodes; ++i) { // Retrieve i-th node's split features. var actualCategoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(i); @@ -180,7 +180,7 @@ public void FastTreeRegressionRepresentationWithCategoricalSplit() Assert.Equal(expectedEnds[i], actualCategoricalSplitFeatures[expectedCounts[i] - 1]); } - Assert.Equal(5, trees[0].NumLeaves); + Assert.Equal(5, trees[0].NumberOfLeaves); var expectedLeafValues = new double[] { 48.456055413607892, 86.584156799316418, 87.017326642027, 76.381184971185391, 117.68872643673058 }; Assert.Equal(5, trees[0].LeafValues.Count); diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index 740d2fc5a0..16aa66bd5f 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -699,7 +699,7 @@ public void CommandShowSchemaModel() col=Label:Num:0}} xf=Categorical{{col=CatFeatures:CatFeaturesText}} xf=Concat{{col=Features:NumFeatures,CatFeatures}} - trainer=ft{{iter=1 numLeaves=2}} + trainer=ft{{numberOfTrees=1 numberOfLeaves=2}} out={{{1}}}", trainDataPath, modelPath); RunMTAThread(new ThreadStart(() => MainForTest(args))); TestCore("showschema", string.Format("steps+ in={{{0}}} meta+", modelPath)); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 7bc26635ad..c3ec209076 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -180,7 +180,7 @@ void CommandLineOnnxConversionTest() string dataPath = GetDataPath("breast-cancer.txt"); string modelPath = GetOutputPath("ModelWithLessIO.zip"); var trainingPathArgs = $"data={dataPath} out={modelPath}"; - var trainingArgs = " loader=text{col=Label:BL:0 col=F1:R4:1-8 col=F2:TX:9} xf=Cat{col=F2} xf=Concat{col=Features:F1,F2} tr=ft{numThreads=1 numLeaves=8 numTrees=3} seed=1"; + var trainingArgs = " loader=text{col=Label:BL:0 col=F1:R4:1-8 col=F2:TX:9} xf=Cat{col=F2} xf=Concat{col=Features:F1,F2} tr=ft{numberOfThreads=1 numberOfLeaves=8 numberOfTrees=3} seed=1"; Assert.Equal(0, Maml.Main(new[] { "train " + trainingPathArgs + trainingArgs})); var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", 
"BinaryClassification", "BreastCancer"); @@ -213,7 +213,7 @@ public void KeyToVectorWithBagOnnxConversionTest() var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.OneHotEncodingTransformer.OutputKind.Bag) .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2"))) .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2")) - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numLeaves: 2, numTrees: 1, minDatapointsInLeaves: 2)); + .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); var model = pipeline.Fit(data); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); @@ -408,7 +408,7 @@ public void RemoveVariablesInPipelineTest() .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2"))) .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2")) .Append(mlContext.Transforms.Normalize("Features")) - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numLeaves: 2, numTrees: 1, minDatapointsInLeaves: 2)); + .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); var model = pipeline.Fit(data); var transformedData = model.Transform(data); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index f9e4d05d8a..c1444b3baf 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -405,7 +405,7 @@ public void TrainOnAutoGeneratedData() var dynamicpipeline = mlContext.Transforms.Categorical.OneHotEncoding("DemographicCategory") .Append(new ColumnConcatenatingEstimator (mlContext, "Features", "DemographicCategory", "LastVisits")) .AppendCacheCheckpoint(mlContext) // FastTree will benefit from caching data in memory. - .Append(mlContext.BinaryClassification.Trainers.FastTree("HasChurned", "Features", numTrees: 20)); + .Append(mlContext.BinaryClassification.Trainers.FastTree("HasChurned", "Features", numberOfTrees: 20)); var dynamicModel = dynamicpipeline.Fit(trainData); @@ -422,7 +422,7 @@ public void TrainOnAutoGeneratedData() r.HasChurned, Features: r.DemographicCategory.OneHotEncoding().ConcatWith(r.LastVisits))) .AppendCacheCheckpoint() // FastTree will benefit from caching data in memory. - .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.HasChurned, r.Features, numTrees: 20)); + .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.HasChurned, r.Features, numberOfTrees: 20)); var staticModel = staticpipeline.Fit(staticData); @@ -550,7 +550,7 @@ private void CategoricalFeaturizationOn(params string[] dataPath) // Concatenate two of the 3 categorical pipelines, and the numeric features. Features: r.NumericalFeatures.ConcatWith(r.CategoricalBag, r.WorkclassOneHotTrimmed))) // Now we're ready to train. We chose our FastTree trainer for this classification task. 
- .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.Label, r.Features, numTrees: 50)); + .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.Label, r.Features, numberOfTrees: 50)); // Train the model. var model = fullpipeline.Fit(data); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 1cb5b31033..2983250834 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -388,7 +388,7 @@ private void CategoricalFeaturizationOn(params string[] dataPath) // reading them from disk multiple times. .AppendCacheCheckpoint(mlContext) // Now we're ready to train. We chose our FastTree trainer for this classification task. - .Append(mlContext.BinaryClassification.Trainers.FastTree(numTrees: 50)); + .Append(mlContext.BinaryClassification.Trainers.FastTree(numberOfTrees: 50)); // Train the model. var model = fullLearningPipeline.Fit(data); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs index e82545cd34..dc8691072d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs @@ -51,7 +51,7 @@ public void FastTreeClassificationIntrospectiveTraining() var ml = new MLContext(seed: 1, conc: 1); var data = ml.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true); - var trainer = ml.BinaryClassification.Trainers.FastTree(numLeaves: 5, numTrees: 3); + var trainer = ml.BinaryClassification.Trainers.FastTree(numberOfLeaves: 5, numberOfTrees: 3); BinaryPredictionTransformer> pred = null; @@ -74,16 +74,16 @@ public void FastTreeClassificationIntrospectiveTraining() // Inspect the last tree. var tree = treeCollection.Trees[2]; - Assert.Equal(5, tree.NumLeaves); - Assert.Equal(4, tree.NumNodes); - Assert.Equal(tree.LteChild, new int[] { 2, -2, -1, -3 }); - Assert.Equal(tree.GtChild, new int[] { 1, 3, -4, -5 }); + Assert.Equal(5, tree.NumberOfLeaves); + Assert.Equal(4, tree.NumberOfNodes); + Assert.Equal(tree.LeftChild, new int[] { 2, -2, -1, -3 }); + Assert.Equal(tree.RightChild, new int[] { 1, 3, -4, -5 }); Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 }); - Assert.Equal(tree.SplitGains.Count, tree.NumNodes); - Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumNodes); + Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); + Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 }; var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f }; - for (int i = 0; i < tree.NumNodes; ++i) + for (int i = 0; i < tree.NumberOfNodes; ++i) { Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6); Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6); @@ -119,16 +119,16 @@ public void FastForestRegressionIntrospectiveTraining() // Inspect the last tree. 
var tree = treeCollection.Trees[2]; - Assert.Equal(5, tree.NumLeaves); - Assert.Equal(4, tree.NumNodes); - Assert.Equal(tree.LteChild, new int[] { -1, -2, -3, -4 }); - Assert.Equal(tree.GtChild, new int[] { 1, 2, 3, -5 }); + Assert.Equal(5, tree.NumberOfLeaves); + Assert.Equal(4, tree.NumberOfNodes); + Assert.Equal(tree.LeftChild, new int[] { -1, -2, -3, -4 }); + Assert.Equal(tree.RightChild, new int[] { 1, 2, 3, -5 }); Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 9, 0, 1, 8 }); - Assert.Equal(tree.SplitGains.Count, tree.NumNodes); - Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumNodes); + Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); + Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); var expectedSplitGains = new double[] { 21.279269008093962, 19.376698810984138, 17.830020749728774, 17.366801337893413 }; var expectedThresholds = new float[] { 0.208134219f, 0.198336035f, 0.202952743f, 0.205061346f }; - for (int i = 0; i < tree.NumNodes; ++i) + for (int i = 0; i < tree.NumberOfNodes; ++i) { Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6); Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6); @@ -139,7 +139,7 @@ public void FastForestRegressionIntrospectiveTraining() Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); var samples = new double[] { 0.97468354430379744, 1.0, 0.97727272727272729, 0.972972972972973, 0.26124197002141325 }; - for (int i = 0; i < tree.NumLeaves; ++i) + for (int i = 0; i < tree.NumberOfLeaves; ++i) { var sample = tree.GetLeafSamplesAt(i); Assert.Single(sample); diff --git a/test/Microsoft.ML.Tests/Scenarios/OvaTest.cs b/test/Microsoft.ML.Tests/Scenarios/OvaTest.cs index b88bf1c176..c6e47bb8eb 100644 --- a/test/Microsoft.ML.Tests/Scenarios/OvaTest.cs +++ b/test/Microsoft.ML.Tests/Scenarios/OvaTest.cs @@ -98,7 +98,7 @@ public void OvaFastTree() // Pipeline var pipeline = mlContext.MulticlassClassification.Trainers.OneVersusAll( - mlContext.BinaryClassification.Trainers.FastTree(new FastTreeBinaryClassificationTrainer.Options { NumThreads = 1 }), + mlContext.BinaryClassification.Trainers.FastTree(new FastTreeBinaryClassificationTrainer.Options { NumberOfThreads = 1 }), useProbabilities: false); var model = pipeline.Fit(data); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 574b772989..3ee5ffc0a6 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -24,9 +24,9 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() var trainer = ML.BinaryClassification.Trainers.FastTree( new FastTreeBinaryClassificationTrainer.Options { - NumThreads = 1, - NumTrees = 10, - NumLeaves = 5, + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 5, }); // Train the defined tree model. 
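Note for reviewers: the hunk above is representative of every test and sample change in this PR — the trainer options are renamed one-for-one with no behavioral change. A minimal before/after sketch of user code under the new names (here `mlContext` and `data` are hypothetical stand-ins for an MLContext and a loaded IDataView):

    // Before this change (old option names):
    // var trainer = mlContext.BinaryClassification.Trainers.FastTree(
    //     new FastTreeBinaryClassificationTrainer.Options
    //     {
    //         NumThreads = 1,
    //         NumTrees = 10,
    //         NumLeaves = 5,
    //         MinDocumentsInLeafs = 10,
    //         LearningRates = 0.2
    //     });

    // After this change (new option names):
    var trainer = mlContext.BinaryClassification.Trainers.FastTree(
        new FastTreeBinaryClassificationTrainer.Options
        {
            NumberOfThreads = 1,
            NumberOfTrees = 10,
            NumberOfLeaves = 5,
            MinimumExampleCountPerLeaf = 10,
            LearningRate = 0.2
        });
    var model = trainer.Fit(data);

The same pattern applies to the FastForest, FastTreeTweedie, ranking, and GAM options renamed throughout the manifest above.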
diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs
index 88c8e46e64..b714bc6458 100644
--- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs
+++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs
@@ -30,9 +30,9 @@ public void FastTreeBinaryEstimator()
             var trainer = ML.BinaryClassification.Trainers.FastTree(
                 new FastTreeBinaryClassificationTrainer.Options
                 {
-                    NumThreads = 1,
-                    NumTrees = 10,
-                    NumLeaves = 5,
+                    NumberOfThreads = 1,
+                    NumberOfTrees = 10,
+                    NumberOfLeaves = 5,
                 });

             var pipeWithTrainer = pipe.Append(trainer);
@@ -72,7 +72,7 @@ public void GAMClassificationEstimator()
             var trainer = new BinaryClassificationGamTrainer(Env, new BinaryClassificationGamTrainer.Options
             {
                 GainConfidenceLevel = 0,
-                NumIterations = 15,
+                NumberOfIterations = 15,
             });
             var pipeWithTrainer = pipe.Append(trainer);
             TestEstimatorCore(pipeWithTrainer, dataView);
@@ -91,8 +91,8 @@ public void FastForestClassificationEstimator()
             var trainer = ML.BinaryClassification.Trainers.FastForest(
                 new FastForestClassification.Options
                 {
-                    NumLeaves = 10,
-                    NumTrees = 20,
+                    NumberOfLeaves = 10,
+                    NumberOfTrees = 20,
                 });

             var pipeWithTrainer = pipe.Append(trainer);
@@ -115,7 +115,7 @@ public void FastTreeRankerEstimator()
                 new FastTreeRankingTrainer.Options
                 {
                     FeatureColumnName = "NumericFeatures",
-                    NumTrees = 10,
+                    NumberOfTrees = 10,
                     RowGroupColumnName = "Group"
                 });

@@ -153,7 +153,7 @@ public void FastTreeRegressorEstimator()
         {
             var dataView = GetRegressionPipeline();
             var trainer = ML.Regression.Trainers.FastTree(
-                new FastTreeRegressionTrainer.Options { NumTrees = 10, NumThreads = 1, NumLeaves = 5 });
+                new FastTreeRegressionTrainer.Options { NumberOfTrees = 10, NumberOfThreads = 1, NumberOfLeaves = 5 });

             TestEstimatorCore(trainer, dataView);
             var model = trainer.Fit(dataView, dataView);
@@ -190,7 +190,7 @@ public void GAMRegressorEstimator()
             var trainer = new RegressionGamTrainer(Env, new RegressionGamTrainer.Options
             {
                 EnablePruning = false,
-                NumIterations = 15,
+                NumberOfIterations = 15,
             });

             TestEstimatorCore(trainer, dataView);
@@ -228,7 +228,7 @@ public void FastForestRegressorEstimator()
                 new FastForestRegression.Options
                 {
                     BaggingSize = 2,
-                    NumTrees = 10,
+                    NumberOfTrees = 10,
                 });

             TestEstimatorCore(trainer, dataView);
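Taken together, the tree-introspection renames in TreeRepresentation.cs and IntrospectiveTraining.cs (NumNodes to NumberOfNodes, LteChild/GtChild to LeftChild/RightChild, NumLeaves to NumberOfLeaves) read naturally in user code. A minimal sketch, not part of this PR, under stated assumptions: `pred` is the FastTreeRegressionModelParameters captured by an onFit callback as in the tests above, and `TrainedTreeEnsemble` is assumed here as the accessor for the trained ensemble, which the tests elide:

    // Assumption: pred was captured via onFit and TrainedTreeEnsemble exposes the ensemble.
    var treeCollection = pred.TrainedTreeEnsemble;
    for (int t = 0; t < treeCollection.Trees.Count; ++t)
    {
        var tree = treeCollection.Trees[t];
        // NumNodes/NumLeaves are now NumberOfNodes/NumberOfLeaves.
        Console.WriteLine($"Tree {t}: {tree.NumberOfNodes} nodes, " +
            $"{tree.NumberOfLeaves} leaves, weight {treeCollection.TreeWeights[t]}");
        // LteChild/GtChild are now LeftChild/RightChild; negative entries still
        // refer to leaves rather than interior nodes, as the expected arrays above show.
        for (int i = 0; i < tree.NumberOfNodes; ++i)
            Console.WriteLine($"  node {i}: feature {tree.NumericalSplitFeatureIndexes[i]}, " +
                $"threshold {tree.NumericalSplitThresholds[i]}, " +
                $"left {tree.LeftChild[i]}, right {tree.RightChild[i]}");
    }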