From 0cb88364ee97fc8d9038308fb35efcf16343b9a2 Mon Sep 17 00:00:00 2001 From: Xiaoyun Zhang Date: Fri, 26 May 2023 11:11:52 -0700 Subject: [PATCH 1/6] Update AutoMLExperiment.cs --- src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index deeb2cab2e..274a4fc9d6 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -315,7 +315,7 @@ Abandoning Trial {trialSettings.TrialId} and continue training. trialResultManager?.AddOrUpdateTrialResult(trialResult); aggregateTrainingStopManager.Update(trialResult); - if (ex is not OperationCanceledException && _bestTrialResult == null) + if (ex is not OperationCanceledException && ex is not OutOfMemoryException && _bestTrialResult == null) { logger.Trace($"trial fatal error - {JsonSerializer.Serialize(trialSettings)}, stop training"); From 9fe2bf3c2c3de86b946bce787131b2ef0582231a Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Fri, 26 May 2023 13:25:42 -0700 Subject: [PATCH 2/6] implement subsampling for train-validation dataset manager --- .../API/AutoMLExperimentExtension.cs | 15 +++-- .../API/BinaryClassificationExperiment.cs | 4 +- .../API/MulticlassClassificationExperiment.cs | 4 +- .../API/RegressionExperiment.cs | 4 +- .../AutoMLExperiment/IDatasetManager.cs | 61 +++++++++++++++++-- .../Runner/SweepablePipelineRunner.cs | 4 +- src/Microsoft.ML.AutoML/Tuner/EciCfoTuner.cs | 10 ++- .../AutoML/AutoMLExperimentExtension.cs | 3 +- .../Reductions/GridSearchTrialRunner.cs | 19 +++--- .../AutoMLExperimentTests.cs | 11 ++++ .../TrainValidaionDatasetManagerTest.cs | 52 ++++++++++++++++ 11 files changed, 154 insertions(+), 33 deletions(-) create mode 100644 test/Microsoft.ML.AutoML.Tests/TrainValidaionDatasetManagerTest.cs diff --git 
a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs index d752153d51..a2a7063190 100644 --- a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs @@ -10,6 +10,7 @@ using System.Text.Json.Serialization; using Microsoft.Extensions.DependencyInjection; using Microsoft.ML.Runtime; +using Microsoft.ML.SearchSpace.Option; using Newtonsoft.Json; using static Microsoft.ML.DataOperationsCatalog; @@ -24,14 +25,18 @@ public static class AutoMLExperimentExtension /// /// dataset for training a model. /// dataset for validating a model during training. + /// determine if subsampling to train. This will be useful if is too large to be held in memory. /// - public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, IDataView train, IDataView validation) + public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, IDataView train, IDataView validation, bool subSamplingTrainDataset = false) { - var datasetManager = new TrainValidateDatasetManager() + var datasetManager = new TrainValidateDatasetManager(train, validation); + + if (subSamplingTrainDataset) { - TrainDataset = train, - ValidateDataset = validation - }; + var searchSpace = new SearchSpace.SearchSpace(); + searchSpace.Add(datasetManager.SubSamplingKey, new UniformSingleOption(0, 1, false, 0.1f)); + experiment.AddSearchSpace(nameof(TrainValidateDatasetManager), searchSpace); + } experiment.ServiceCollection.AddSingleton(datasetManager); experiment.ServiceCollection.AddSingleton(datasetManager); diff --git a/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs b/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs index 0b8651082d..99b7fdc000 100644 --- a/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs +++ b/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs @@ -420,8 +420,8 @@ public TrialResult Run(TrialSettings 
settings) { var stopWatch = new Stopwatch(); stopWatch.Start(); - var model = pipeline.Fit(trainTestDatasetManager.TrainDataset); - var eval = model.Transform(trainTestDatasetManager.ValidateDataset); + var model = pipeline.Fit(trainTestDatasetManager.LoadTrainDataset(_context, settings)); + var eval = model.Transform(trainTestDatasetManager.LoadValidateDataset(_context, settings)); var metrics = _context.BinaryClassification.EvaluateNonCalibrated(eval, metricManager.LabelColumn, predictedLabelColumnName: metricManager.PredictedColumn); var metric = GetMetric(metricManager.Metric, metrics); var loss = metricManager.IsMaximize ? -metric : metric; diff --git a/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs b/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs index b855c9e71a..19ba4b7c39 100644 --- a/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs +++ b/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs @@ -398,8 +398,8 @@ public TrialResult Run(TrialSettings settings) { var stopWatch = new Stopwatch(); stopWatch.Start(); - var model = pipeline.Fit(trainTestDatasetManager.TrainDataset); - var eval = model.Transform(trainTestDatasetManager.ValidateDataset); + var model = pipeline.Fit(trainTestDatasetManager.LoadTrainDataset(_context, settings)); + var eval = model.Transform(trainTestDatasetManager.LoadValidateDataset(_context, settings)); var metrics = _context.MulticlassClassification.Evaluate(eval, metricManager.LabelColumn, predictedLabelColumnName: metricManager.PredictedColumn); var metric = GetMetric(metricManager.Metric, metrics); var loss = metricManager.IsMaximize ? 
-metric : metric; diff --git a/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs b/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs index 99f9e9800f..e3b8e1c956 100644 --- a/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs +++ b/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs @@ -425,8 +425,8 @@ public Task RunAsync(TrialSettings settings, CancellationToken ct) { var stopWatch = new Stopwatch(); stopWatch.Start(); - var model = pipeline.Fit(trainTestDatasetManager.TrainDataset); - var eval = model.Transform(trainTestDatasetManager.ValidateDataset); + var model = pipeline.Fit(trainTestDatasetManager.LoadTrainDataset(_context, settings)); + var eval = model.Transform(trainTestDatasetManager.LoadValidateDataset(_context, settings)); var metrics = _context.Regression.Evaluate(eval, metricManager.LabelColumn, scoreColumnName: metricManager.ScoreColumn); var metric = GetMetric(metricManager.Metric, metrics); var loss = metricManager.IsMaximize ? -metric : metric; diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs index 0ab7591043..c8ac1c9e24 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.SearchSpace; + namespace Microsoft.ML.AutoML { /// @@ -12,7 +14,7 @@ public interface IDatasetManager { } - internal interface ICrossValidateDatasetManager + public interface ICrossValidateDatasetManager { int? 
Fold { get; set; } @@ -21,18 +23,68 @@ internal interface ICrossValidateDatasetManager string SamplingKeyColumnName { get; set; } } - internal interface ITrainValidateDatasetManager + public interface ITrainValidateDatasetManager { - IDataView TrainDataset { get; set; } + IDataView LoadTrainDataset(MLContext context, TrialSettings settings); - IDataView ValidateDataset { get; set; } + IDataView LoadValidateDataset(MLContext context, TrialSettings settings); } internal class TrainValidateDatasetManager : IDatasetManager, ITrainValidateDatasetManager { + private ulong _rowCount; + private IDataView _trainDataset; + private readonly IDataView _validateDataset; + private readonly string _subSamplingKey = "TrainValidateDatasetSubsamplingKey"; + private bool _isInitialized = false; + public TrainValidateDatasetManager(IDataView trainDataset, IDataView validateDataset, string subSamplingKey = null) + { + _trainDataset = trainDataset; + _validateDataset = validateDataset; + _subSamplingKey = subSamplingKey ?? _subSamplingKey; + } + + public string SubSamplingKey => _subSamplingKey; + public IDataView TrainDataset { get; set; } public IDataView ValidateDataset { get; set; } + + /// + /// Load Train Dataset. If contains then the train dataset will be subsampled. + /// + /// train dataset. + public IDataView LoadTrainDataset(MLContext context, TrialSettings settings) + { + if (!_isInitialized) + { + InitializeTrainDataset(context); + _isInitialized = true; + } + var trainTestSplitParameter = settings.Parameter.ContainsKey(nameof(TrainValidateDatasetManager)) ? settings.Parameter[nameof(TrainValidateDatasetManager)] : null; + if (trainTestSplitParameter is Parameter parameter) + { + var subSampleRatio = parameter.ContainsKey(_subSamplingKey) ? 
parameter[_subSamplingKey].AsType() : 1; + if (subSampleRatio < 1.0) + { + var subSampledTrainDataset = context.Data.TakeRows(_trainDataset, (long)(subSampleRatio * _rowCount)); + return subSampledTrainDataset; + } + } + + return _trainDataset; + } + + public IDataView LoadValidateDataset(MLContext context, TrialSettings settings) + { + return _validateDataset; + } + + private void InitializeTrainDataset(MLContext context) + { + _rowCount = DatasetDimensionsUtil.CountRows(_trainDataset, ulong.MaxValue); + _trainDataset = context.Data.ShuffleRows(_trainDataset); + } } internal class CrossValidateDatasetManager : IDatasetManager, ICrossValidateDatasetManager @@ -40,6 +92,7 @@ internal class CrossValidateDatasetManager : IDatasetManager, ICrossValidateData public IDataView Dataset { get; set; } public int? Fold { get; set; } + public string SamplingKeyColumnName { get; set; } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs index f69d38e541..d237bde08a 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs @@ -68,8 +68,8 @@ public TrialResult Run(TrialSettings settings) if (_datasetManager is ITrainValidateDatasetManager trainTestDatasetManager) { - var model = mlnetPipeline.Fit(trainTestDatasetManager.TrainDataset); - var eval = model.Transform(trainTestDatasetManager.ValidateDataset); + var model = mlnetPipeline.Fit(trainTestDatasetManager.LoadTrainDataset(_mLContext, settings)); + var eval = model.Transform(trainTestDatasetManager.LoadValidateDataset(_mLContext, settings)); var metric = _metricManager.Evaluate(_mLContext, eval); stopWatch.Stop(); var loss = _metricManager.IsMaximize ? 
-metric : metric; diff --git a/src/Microsoft.ML.AutoML/Tuner/EciCfoTuner.cs b/src/Microsoft.ML.AutoML/Tuner/EciCfoTuner.cs index 51096dde72..c3cf345013 100644 --- a/src/Microsoft.ML.AutoML/Tuner/EciCfoTuner.cs +++ b/src/Microsoft.ML.AutoML/Tuner/EciCfoTuner.cs @@ -32,7 +32,9 @@ public EciCostFrugalTuner(SweepablePipeline sweepablePipeline, AutoMLExperiment. _tuners = pipelineSchemas.ToDictionary(schema => schema, schema => { var searchSpace = sweepablePipeline.BuildSweepableEstimatorPipeline(schema).SearchSpace; - return new CostFrugalTuner(searchSpace, searchSpace.SampleFromFeatureSpace(searchSpace.Default), seed: settings.Seed) as ITuner; + var aggregateSearchSpace = new SearchSpace.SearchSpace(settings.SearchSpace); + aggregateSearchSpace[AutoMLExperiment.PipelineSearchspaceName] = searchSpace; + return new CostFrugalTuner(aggregateSearchSpace, aggregateSearchSpace.SampleFromFeatureSpace(aggregateSearchSpace.Default), seed: settings.Seed) as ITuner; }); if (trialResultManager != null) @@ -57,22 +59,18 @@ public Parameter Propose(TrialSettings settings) parameter[k.Key] = _defaultParameter[k.Key]; } } - settings.Parameter[AutoMLExperiment.PipelineSearchspaceName] = parameter; + settings.Parameter = parameter; return settings.Parameter; } public void Update(TrialResult result) { - var originalParameter = result.TrialSettings.Parameter; var schema = result.TrialSettings.Parameter[AutoMLExperiment.PipelineSearchspaceName]["_SCHEMA_"].AsType(); _pipelineProposer.Update(result, schema); if (_tuners.TryGetValue(schema, out var tuner)) { - var parameter = result.TrialSettings.Parameter[AutoMLExperiment.PipelineSearchspaceName]; - result.TrialSettings.Parameter = parameter; tuner.Update(result); - result.TrialSettings.Parameter = originalParameter; } } } diff --git a/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs b/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs index e3b5351237..f6395f9fb2 100644 --- 
a/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs @@ -70,8 +70,9 @@ public static AutoMLExperiment SetBinaryClassificationMetricWithFairLearn( var moment = serviceProvider.GetRequiredService(); var datasetManager = serviceProvider.GetRequiredService(); var pipeline = serviceProvider.GetRequiredService(); - return new GridSearchTrailRunner(context, datasetManager.TrainDataset, datasetManager.ValidateDataset, labelColumn, sensitiveColumnName, pipeline, moment); + return new GridSearchTrailRunner(context, datasetManager, labelColumn, sensitiveColumnName, pipeline, moment); }); + experiment.SetRandomSearchTuner(); return experiment; diff --git a/src/Microsoft.ML.Fairlearn/Reductions/GridSearchTrialRunner.cs b/src/Microsoft.ML.Fairlearn/Reductions/GridSearchTrialRunner.cs index 1adf3b37d4..7d1be076bb 100644 --- a/src/Microsoft.ML.Fairlearn/Reductions/GridSearchTrialRunner.cs +++ b/src/Microsoft.ML.Fairlearn/Reductions/GridSearchTrialRunner.cs @@ -27,18 +27,16 @@ namespace Microsoft.ML.Fairlearn public class GridSearchTrailRunner : ITrialRunner { private readonly MLContext _context; - private readonly IDataView _trainDataset; - private readonly IDataView _testDataset; private readonly string _labelColumn; private readonly string _sensitiveColumn; private readonly SweepablePipeline _pipeline; private readonly ClassificationMoment _moment; + private readonly ITrainValidateDatasetManager _datasetManager; - public GridSearchTrailRunner(MLContext context, IDataView trainDataset, IDataView testDataset, string labelColumn, string sensitiveColumn, SweepablePipeline pipeline, ClassificationMoment moment) + public GridSearchTrailRunner(MLContext context, ITrainValidateDatasetManager datasetManager, string labelColumn, string sensitiveColumn, SweepablePipeline pipeline, ClassificationMoment moment) { _context = context; - this._trainDataset = trainDataset; - this._testDataset = testDataset; + 
this._datasetManager = datasetManager; this._labelColumn = labelColumn; this._sensitiveColumn = sensitiveColumn; _pipeline = pipeline; @@ -68,21 +66,24 @@ public Task RunAsync(TrialSettings settings, CancellationToken ct) return (sign, e, value); }); + var trainDataset = _datasetManager.LoadTrainDataset(_context, settings); + var validateDataset = _datasetManager.LoadValidateDataset(_context, settings); + var df = new DataFrame(); df["sign"] = DataFrameColumn.Create("sign", lambdasValue.Select(x => x.sign)); df["group_id"] = DataFrameColumn.Create("group_id", lambdasValue.Select(x => x.e)); df["value"] = DataFrameColumn.Create("value", lambdasValue.Select(x => x.value)); - _moment.LoadData(this._trainDataset, DataFrameColumn.Create("y", this._trainDataset.GetColumn(this._labelColumn)), DataFrameColumn.Create("group_id", this._trainDataset.GetColumn(this._sensitiveColumn))); + _moment.LoadData(trainDataset, DataFrameColumn.Create("y", trainDataset.GetColumn(this._labelColumn)), DataFrameColumn.Create("group_id", trainDataset.GetColumn(this._sensitiveColumn))); var signWeightColumn = _moment.SignedWeights(df); - var trainDataset = ZipDataView.Create(_context, new IDataView[] { _trainDataset, new DataFrame(signWeightColumn) }); + trainDataset = ZipDataView.Create(_context, new IDataView[] { trainDataset, new DataFrame(signWeightColumn) }); var model = pipeline.Fit(trainDataset); // returns an IDataview object that contains the predictions - var eval = model.Transform(this._testDataset); + var eval = model.Transform(validateDataset); // extract the predicted label and convert it to 1.0f and 0.0 so that we can feed that into the gamma function var predictedLabel = eval.GetColumn("PredictedLabel").Select(b => b ? 
1f : 0f).ToArray(); var column = DataFrameColumn.Create("pred", predictedLabel); //Get the gamma based on the predicted label of the testDataset - _moment.LoadData(this._testDataset, DataFrameColumn.Create("y", eval.GetColumn(this._labelColumn)), DataFrameColumn.Create("group_id", _testDataset.GetColumn(this._sensitiveColumn))); + _moment.LoadData(validateDataset, DataFrameColumn.Create("y", eval.GetColumn(this._labelColumn)), DataFrameColumn.Create("group_id", validateDataset.GetColumn(this._sensitiveColumn))); var gamma = _moment.Gamma(column); double fairnessLost = Convert.ToSingle(gamma["value"].Max()); var metrics = _context.BinaryClassification.EvaluateNonCalibrated(eval, this._labelColumn); diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index 40f3f9fb68..c126942183 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -359,6 +359,17 @@ public async Task AutoMLExperiment_Taxi_Fare_Train_Test_Split_Test() var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.5); + + // test subsampling + experiment = context.Auto().CreateExperiment(); + experiment.SetDataset(train, test, true) + .SetRegressionMetric(RegressionMetric.RSquared, label) + .SetPipeline(pipeline) + .SetMaxModelToExplore(1); + result = await experiment.RunAsync(); + result.Metric.Should().BeGreaterThan(0.5); + result.TrialSettings.Parameter[nameof(TrainValidateDatasetManager)]["TrainValidateDatasetSubsamplingKey"] + .AsType().Should().Be(0.1); } [Fact] diff --git a/test/Microsoft.ML.AutoML.Tests/TrainValidaionDatasetManagerTest.cs b/test/Microsoft.ML.AutoML.Tests/TrainValidaionDatasetManagerTest.cs new file mode 100644 index 0000000000..aa03befceb --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TrainValidaionDatasetManagerTest.cs @@ -0,0 +1,52 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using FluentAssertions; +using Microsoft.ML.AutoML.Test; +using Microsoft.ML.SearchSpace; +using Microsoft.ML.TestFramework; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.AutoML.Test +{ + public class TrainValidaionDatasetManagerTest : BaseTestClass + { + public TrainValidaionDatasetManagerTest(ITestOutputHelper output) : base(output) + { + } + + [Fact] + public void TrainValidationDatasetManagerSubSamplingTest() + { + var context = new MLContext(1); + var dataPath = DatasetUtil.GetUciAdultDataset(); + var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel); + var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); + var trainData = textLoader.Load(dataPath); + + var trainDataLength = DatasetDimensionsUtil.CountRows(trainData, ulong.MaxValue); + trainDataLength.Should().Be(500); + + var trainValidationDatasetManager = new TrainValidateDatasetManager(trainData, trainData, "SubSampleKey"); + + var parameter = Parameter.CreateNestedParameter(); + parameter[nameof(TrainValidateDatasetManager)] = Parameter.CreateNestedParameter(); + parameter[nameof(TrainValidateDatasetManager)][trainValidationDatasetManager.SubSamplingKey] = Parameter.FromDouble(0.3); + var setting = new TrialSettings + { + Parameter = parameter, + }; + + var subSampleTrainData = trainValidationDatasetManager.LoadTrainDataset(context, setting); + var subSampleTrainDataLength = DatasetDimensionsUtil.CountRows(subSampleTrainData, ulong.MaxValue); + subSampleTrainDataLength.Should().Be(150); + } + } +} From e37287508391e29eb5440b8282cc6d1480d6a98d Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Sat, 27 May 2023 10:51:57 -0700 Subject: [PATCH 3/6] fix test --- 
.../AutoMLExperiment/IDatasetManager.cs | 27 +++++---- .../Runner/SweepablePipelineRunner.cs | 4 +- .../AutoML/AutoMLExperimentExtension.cs | 12 +++- .../AutoMLExperimentTests.cs | 2 +- .../TrainerEstimators/TreeEstimators.cs | 56 +++++++++++++++++++ 5 files changed, 81 insertions(+), 20 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs index c8ac1c9e24..41ecfae37c 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +#nullable enable using Microsoft.ML.SearchSpace; @@ -18,16 +19,16 @@ public interface ICrossValidateDatasetManager { int? Fold { get; set; } - IDataView Dataset { get; set; } + IDataView? Dataset { get; set; } - string SamplingKeyColumnName { get; set; } + string? SamplingKeyColumnName { get; set; } } public interface ITrainValidateDatasetManager { - IDataView LoadTrainDataset(MLContext context, TrialSettings settings); + IDataView LoadTrainDataset(MLContext context, TrialSettings? settings); - IDataView LoadValidateDataset(MLContext context, TrialSettings settings); + IDataView LoadValidateDataset(MLContext context, TrialSettings? 
settings); } internal class TrainValidateDatasetManager : IDatasetManager, ITrainValidateDatasetManager @@ -37,7 +38,7 @@ internal class TrainValidateDatasetManager : IDatasetManager, ITrainValidateData private readonly IDataView _validateDataset; private readonly string _subSamplingKey = "TrainValidateDatasetSubsamplingKey"; private bool _isInitialized = false; - public TrainValidateDatasetManager(IDataView trainDataset, IDataView validateDataset, string subSamplingKey = null) + public TrainValidateDatasetManager(IDataView trainDataset, IDataView validateDataset, string? subSamplingKey = null) { _trainDataset = trainDataset; _validateDataset = validateDataset; @@ -46,22 +47,20 @@ public TrainValidateDatasetManager(IDataView trainDataset, IDataView validateDat public string SubSamplingKey => _subSamplingKey; - public IDataView TrainDataset { get; set; } - - public IDataView ValidateDataset { get; set; } - /// /// Load Train Dataset. If contains then the train dataset will be subsampled. /// + /// MLContext. + /// trial settings. If null, return entire train dataset. /// train dataset. - public IDataView LoadTrainDataset(MLContext context, TrialSettings settings) + public IDataView LoadTrainDataset(MLContext context, TrialSettings? settings) { if (!_isInitialized) { InitializeTrainDataset(context); _isInitialized = true; } - var trainTestSplitParameter = settings.Parameter.ContainsKey(nameof(TrainValidateDatasetManager)) ? settings.Parameter[nameof(TrainValidateDatasetManager)] : null; + var trainTestSplitParameter = settings?.Parameter.ContainsKey(nameof(TrainValidateDatasetManager)) is true ? settings.Parameter[nameof(TrainValidateDatasetManager)] : null; if (trainTestSplitParameter is Parameter parameter) { var subSampleRatio = parameter.ContainsKey(_subSamplingKey) ? 
parameter[_subSamplingKey].AsType() : 1; @@ -75,7 +74,7 @@ public IDataView LoadTrainDataset(MLContext context, TrialSettings settings) return _trainDataset; } - public IDataView LoadValidateDataset(MLContext context, TrialSettings settings) + public IDataView LoadValidateDataset(MLContext context, TrialSettings? settings) { return _validateDataset; } @@ -89,10 +88,10 @@ private void InitializeTrainDataset(MLContext context) internal class CrossValidateDatasetManager : IDatasetManager, ICrossValidateDatasetManager { - public IDataView Dataset { get; set; } + public IDataView? Dataset { get; set; } public int? Fold { get; set; } - public string SamplingKeyColumnName { get; set; } + public string? SamplingKeyColumnName { get; set; } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs index d237bde08a..df49f8a4ef 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs @@ -68,8 +68,8 @@ public TrialResult Run(TrialSettings settings) if (_datasetManager is ITrainValidateDatasetManager trainTestDatasetManager) { - var model = mlnetPipeline.Fit(trainTestDatasetManager.LoadTrainDataset(_mLContext, settings)); - var eval = model.Transform(trainTestDatasetManager.LoadValidateDataset(_mLContext, settings)); + var model = mlnetPipeline.Fit(trainTestDatasetManager.LoadTrainDataset(_mLContext!, settings)); + var eval = model.Transform(trainTestDatasetManager.LoadValidateDataset(_mLContext!, settings)); var metric = _metricManager.Evaluate(_mLContext, eval); stopWatch.Stop(); var loss = _metricManager.IsMaximize ? 
-metric : metric; diff --git a/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs b/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs index f6395f9fb2..e9f8c41389 100644 --- a/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.Fairlearn/AutoML/AutoMLExperimentExtension.cs @@ -9,6 +9,7 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.ML.AutoML; using Microsoft.ML.Data; +using Microsoft.ML.SearchSpace; namespace Microsoft.ML.Fairlearn.AutoML { @@ -55,9 +56,14 @@ public static AutoMLExperiment SetBinaryClassificationMetricWithFairLearn( { var datasetManager = serviceProvider.GetRequiredService(); var moment = new UtilityParity(); - var sensitiveFeature = DataFrameColumn.Create("group_id", datasetManager.TrainDataset.GetColumn(sensitiveColumnName)); - var label = DataFrameColumn.Create("label", datasetManager.TrainDataset.GetColumn(labelColumn)); - moment.LoadData(datasetManager.TrainDataset, label, sensitiveFeature); + var context = serviceProvider.GetRequiredService(); + var trainData = datasetManager.LoadTrainDataset(context, new TrialSettings + { + Parameter = Parameter.CreateNestedParameter(), + }); + var sensitiveFeature = DataFrameColumn.Create("group_id", trainData.GetColumn(sensitiveColumnName)); + var label = DataFrameColumn.Create("label", trainData.GetColumn(labelColumn)); + moment.LoadData(trainData, label, sensitiveFeature); var lambdaSearchSpace = Utilities.GenerateBinaryClassificationLambdaSearchSpace(moment, gridLimit, negativeAllowed); experiment.AddSearchSpace("_lambda_search_space", lambdaSearchSpace); diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index c126942183..a3c446d406 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -369,7 +369,7 @@ public async Task AutoMLExperiment_Taxi_Fare_Train_Test_Split_Test() 
result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.5); result.TrialSettings.Parameter[nameof(TrainValidateDatasetManager)]["TrainValidateDatasetSubsamplingKey"] - .AsType().Should().Be(0.1); + .AsType().Should().Be(0.1f); } [Fact] diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs index 5f073c58e5..b4c7fbfb9d 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs @@ -21,6 +21,8 @@ using Xunit; using FluentAssertions; using System.IO; +using static Microsoft.ML.DataOperationsCatalog; +using System.Data; namespace Microsoft.ML.Tests.TrainerEstimators { @@ -50,6 +52,60 @@ public void FastTreeBinaryEstimator() Done(); } + [Fact] + public void FastTreeBinaryEstimatorOnLongLengthArray() + { + var dataset = ML.Data.LoadFromEnumerable(GenerateIEnumerableWithMaxLongLength()); + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 5, + DiskTranspose = true, + LabelColumnName = nameof(SingleFeatureWithBooleanLabel.Label), + FeatureColumnName = nameof(SingleFeatureWithBooleanLabel.Feature) + }); + TestEstimatorCore(trainer, dataset); + Done(); + } + + class SingleFeatureWithBooleanLabel + { + public bool Label { get; set; } + + [VectorType(1)] + public float[] Feature { get; set; } + } + + private IEnumerable GenerateIEnumerableWithMaxLongLength() + { + var currentLabel = true; + var currentFloat = 0f; + var length = 0L; + var bufferLength = 2 >> 15; + var featureBuffer = new float[bufferLength]; + var labelBuffer = new bool[bufferLength]; + while (length < int.MaxValue / 2) + { + for (int i = 0; i < bufferLength; i++) + { + featureBuffer[i] = currentFloat; + labelBuffer[i] = currentLabel; + currentFloat++; + currentLabel = !currentLabel; + } + + var buffer = 
Enumerable.Zip(featureBuffer, labelBuffer, (f, l) => new SingleFeatureWithBooleanLabel { Feature = new float[] { f }, Label = l }); + foreach (var item in buffer) + { + yield return item; + } + + length += bufferLength; + } + } + [LightGBMFact] public void LightGBMBinaryEstimator() { From 01e02ac66e76a577aa16fe27110b417ade97e92d Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Mon, 12 Jun 2023 16:32:55 -0700 Subject: [PATCH 4/6] fix comments --- .../API/AutoMLExperimentExtension.cs | 8 +---- .../AutoMLExperiment/IDatasetManager.cs | 31 +++++++++++++++---- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs index a2a7063190..fa1d11e1ce 100644 --- a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs @@ -67,13 +67,7 @@ public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, Trai /// public static AutoMLExperiment SetDataset(this AutoMLExperiment experiment, IDataView dataset, int fold = 10, string samplingKeyColumnName = null) { - var datasetManager = new CrossValidateDatasetManager() - { - Dataset = dataset, - Fold = fold, - SamplingKeyColumnName = samplingKeyColumnName, - }; - + var datasetManager = new CrossValidateDatasetManager(dataset, fold, samplingKeyColumnName); experiment.ServiceCollection.AddSingleton(datasetManager); experiment.ServiceCollection.AddSingleton(datasetManager); diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs index 41ecfae37c..b065c62976 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IDatasetManager.cs @@ -15,16 +15,28 @@ public interface IDatasetManager { } - public interface ICrossValidateDatasetManager + /// + /// Interface for cross validate dataset manager. 
+ /// + public interface ICrossValidateDatasetManager : IDatasetManager { - int? Fold { get; set; } + /// + /// Cross validate fold. + /// + int Fold { get; set; } - IDataView? Dataset { get; set; } + /// + /// The dataset to cross validate. + /// + IDataView Dataset { get; set; } + /// + /// The dataset column used for grouping rows. + /// string? SamplingKeyColumnName { get; set; } } - public interface ITrainValidateDatasetManager + public interface ITrainValidateDatasetManager : IDatasetManager { IDataView LoadTrainDataset(MLContext context, TrialSettings? settings); @@ -88,9 +100,16 @@ private void InitializeTrainDataset(MLContext context) internal class CrossValidateDatasetManager : IDatasetManager, ICrossValidateDatasetManager { - public IDataView? Dataset { get; set; } + public CrossValidateDatasetManager(IDataView dataset, int fold, string? samplingKeyColumnName = null) + { + Dataset = dataset; + Fold = fold; + SamplingKeyColumnName = samplingKeyColumnName; + } + + public IDataView Dataset { get; set; } - public int? Fold { get; set; } + public int Fold { get; set; } public string? 
SamplingKeyColumnName { get; set; } } From 981930af5f8661f8f1bfb0d01dc4856be8ce7068 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Mon, 12 Jun 2023 16:36:03 -0700 Subject: [PATCH 5/6] fix comment --- src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs | 2 +- .../API/MulticlassClassificationExperiment.cs | 2 +- src/Microsoft.ML.AutoML/API/RegressionExperiment.cs | 2 +- .../AutoMLExperiment/Runner/SweepablePipelineRunner.cs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs b/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs index 99b7fdc000..edc3257d11 100644 --- a/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs +++ b/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs @@ -391,7 +391,7 @@ public TrialResult Run(TrialSettings settings) { var stopWatch = new Stopwatch(); stopWatch.Start(); - var fold = datasetManager.Fold ?? 5; + var fold = datasetManager.Fold; var metrics = _context.BinaryClassification.CrossValidateNonCalibrated(datasetManager.Dataset, pipeline, fold, metricManager.LabelColumn); // now we just randomly pick a model, but a better way is to provide option to pick a model which score is the cloest to average or the best. diff --git a/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs b/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs index 19ba4b7c39..d58a69249d 100644 --- a/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs +++ b/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs @@ -369,7 +369,7 @@ public TrialResult Run(TrialSettings settings) { var stopWatch = new Stopwatch(); stopWatch.Start(); - var fold = datasetManager.Fold ?? 
5; + var fold = datasetManager.Fold; var metrics = _context.MulticlassClassification.CrossValidate(datasetManager.Dataset, pipeline, fold, metricManager.LabelColumn); // now we just randomly pick a model, but a better way is to provide option to pick a model which score is the cloest to average or the best. diff --git a/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs b/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs index e3b8e1c956..a404f4ce80 100644 --- a/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs +++ b/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs @@ -396,7 +396,7 @@ public Task RunAsync(TrialSettings settings, CancellationToken ct) { var stopWatch = new Stopwatch(); stopWatch.Start(); - var fold = datasetManager.Fold ?? 5; + var fold = datasetManager.Fold; var metrics = _context.Regression.CrossValidate(datasetManager.Dataset, pipeline, fold, metricManager.LabelColumn); // now we just randomly pick a model, but a better way is to provide option to pick a model which score is the cloest to average or the best. diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs index df49f8a4ef..5cc5e5feb1 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs @@ -40,7 +40,7 @@ public TrialResult Run(TrialSettings settings) var mlnetPipeline = _pipeline.BuildFromOption(_mLContext, parameter); if (_datasetManager is ICrossValidateDatasetManager crossValidateDatasetManager) { - var datasetSplit = _mLContext!.Data.CrossValidationSplit(crossValidateDatasetManager.Dataset, crossValidateDatasetManager.Fold ?? 
5, crossValidateDatasetManager.SamplingKeyColumnName); + var datasetSplit = _mLContext!.Data.CrossValidationSplit(crossValidateDatasetManager.Dataset, crossValidateDatasetManager.Fold, crossValidateDatasetManager.SamplingKeyColumnName); var metrics = new List(); var models = new List(); foreach (var split in datasetSplit) From eac687902e8970ab8f7edbf45f632423e892f398 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Tue, 13 Jun 2023 16:00:49 -0700 Subject: [PATCH 6/6] revert tests --- .../TrainerEstimators/TreeEstimators.cs | 56 ------------------- 1 file changed, 56 deletions(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs index b4c7fbfb9d..5f073c58e5 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs @@ -21,8 +21,6 @@ using Xunit; using FluentAssertions; using System.IO; -using static Microsoft.ML.DataOperationsCatalog; -using System.Data; namespace Microsoft.ML.Tests.TrainerEstimators { @@ -52,60 +50,6 @@ public void FastTreeBinaryEstimator() Done(); } - [Fact] - public void FastTreeBinaryEstimatorOnLongLengthArray() - { - var dataset = ML.Data.LoadFromEnumerable(GenerateIEnumerableWithMaxLongLength()); - var trainer = ML.BinaryClassification.Trainers.FastTree( - new FastTreeBinaryTrainer.Options - { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 5, - DiskTranspose = true, - LabelColumnName = nameof(SingleFeatureWithBooleanLabel.Label), - FeatureColumnName = nameof(SingleFeatureWithBooleanLabel.Feature) - }); - TestEstimatorCore(trainer, dataset); - Done(); - } - - class SingleFeatureWithBooleanLabel - { - public bool Label { get; set; } - - [VectorType(1)] - public float[] Feature { get; set; } - } - - private IEnumerable GenerateIEnumerableWithMaxLongLength() - { - var currentLabel = true; - var currentFloat = 0f; - var length = 0L; - var bufferLength = 2 >> 15; - var 
featureBuffer = new float[bufferLength]; - var labelBuffer = new bool[bufferLength]; - while (length < int.MaxValue / 2) - { - for (int i = 0; i < bufferLength; i++) - { - featureBuffer[i] = currentFloat; - labelBuffer[i] = currentLabel; - currentFloat++; - currentLabel = !currentLabel; - } - - var buffer = Enumerable.Zip(featureBuffer, labelBuffer, (f, l) => new SingleFeatureWithBooleanLabel { Feature = new float[] { f }, Label = l }); - foreach (var item in buffer) - { - yield return item; - } - - length += bufferLength; - } - } - [LightGBMFact] public void LightGBMBinaryEstimator() {