diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 9d969b6be2..2b9bcb6cf9 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -33,6 +33,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TestFramework" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Predictor.Tests", "test\Microsoft.ML.Predictor.Tests\Microsoft.ML.Predictor.Tests.csproj", "{6B047E09-39C9-4583-96F3-685D84CA4117}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Functional.Tests", "test\Microsoft.ML.Functional.Tests\Microsoft.ML.Functional.Tests.csproj", "{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.ResultProcessor", "src\Microsoft.ML.ResultProcessor\Microsoft.ML.ResultProcessor.csproj", "{3769FCC3-9AFF-4C37-97E9-6854324681DF}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.FastTree", "src\Microsoft.ML.FastTree\Microsoft.ML.FastTree.csproj", "{B7B593C5-FB8C-4ADA-A638-5B53B47D087E}" @@ -928,6 +930,18 @@ Global {5E920CAC-5A28-42FB-936E-49C472130953}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU {5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU {5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.ActiveCfg = Release|Any CPU 
+ {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.Build.0 = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1011,6 +1025,7 @@ Global {85D0CAFD-2FE8-496A-88C7-585D35B94243} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {31D38B21-102B-41C0-9E0A-2FE0BF68D123} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {5E920CAC-5A28-42FB-936E-49C472130953} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/build/Dependencies.props b/build/Dependencies.props index 896ca68978..9d2174267b 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -43,6 +43,8 @@ 0.11.3 0.0.3-test + 0.0.7-test + 0.0.4-test diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index edaf2d55c5..d95d047faf 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -17,7 +17,12 @@ public static class DatasetUtils /// Downloads the housing dataset from the ML.NET repo. 
/// public static string DownloadHousingRegressionDataset() - => Download("https://raw.githubusercontent.com/dotnet/machinelearning/024bd4452e1d3660214c757237a19d6123f951ca/test/data/housing.txt", "housing.txt"); + { + var fileName = "housing.txt"; + if (!File.Exists(fileName)) + Download("https://raw.githubusercontent.com/dotnet/machinelearning/024bd4452e1d3660214c757237a19d6123f951ca/test/data/housing.txt", fileName); + return fileName; + } public static IDataView LoadHousingRegressionDataset(MLContext mlContext) { diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs new file mode 100644 index 0000000000..29088298d3 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -0,0 +1,24 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.SamplesUtils; +using Microsoft.ML.Trainers.HalLearners; +using Xunit; + +namespace Microsoft.ML.Functional.Tests +{ + internal static class Common + { + public static void CheckMetrics(RegressionMetrics metrics) + { + // Perform sanity checks on the metrics + Assert.True(metrics.Rms >= 0); + Assert.True(metrics.L1 >= 0); + Assert.True(metrics.L2 >= 0); + Assert.True(metrics.RSquared <= 1); + } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj new file mode 100644 index 0000000000..106db8f36c --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj @@ -0,0 +1,52 @@ + + + + + false + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs 
new file mode 100644 index 0000000000..7e0ff2eb44 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Xunit; + +namespace Microsoft.ML.Functional.Tests +{ + public class PredictionScenarios + { + /// + /// Reconfigurable predictions: The following should be possible: A user trains a binary classifier, + /// and through the test evaluator gets a PR curve, then based on the PR curve picks a new threshold + /// and configures the scorer (or more precisely instantiates a new scorer over the same model parameters) + /// with some threshold derived from that. + /// + [Fact] + public void ReconfigurablePrediction() + { + var mlContext = new MLContext(seed: 789); + + // Get the dataset, create a train and test split + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); + (var train, var test) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2); + + // Create a pipeline to train on the housing data + var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { + "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", + "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) + .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares()); + + var model = pipeline.Fit(train); + + var scoredTest = model.Transform(test); + var metrics = mlContext.Regression.Evaluate(scoredTest); + + Common.CheckMetrics(metrics); + + // Todo #2465: Allow the setting of threshold and 
thresholdColumn for scoring. + // This is no longer possible in the API + //var newModel = new BinaryPredictionTransformer>(ml, model.Model, trainData.Schema, model.FeatureColumn, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability); + //var newScoredTest = newModel.Transform(pipeline.Transform(testData)); + //var newMetrics = mlContext.BinaryClassification.Evaluate(scoredTest); + // And the Threshold and ThresholdColumn properties are not settable. + //var predictor = model.LastTransformer; + //predictor.Threshold = 0.01; // Not possible + } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs new file mode 100644 index 0000000000..b9bb617285 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers.HalLearners; +using Xunit; + +namespace Microsoft.ML.Functional.Tests +{ + public class ValidationScenarios + { + /// + /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with + /// a data source (optionally with stratification column), come up with an instantiable transform + /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate + /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of + /// metrics, trained pipelines, and scored test data for each fold. 
+ /// + [Fact] + void CrossValidation() + { + var mlContext = new MLContext(seed: 789); + + // Get the dataset + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); + + // Create a pipeline to train on the housing data + var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { + "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", + "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) + .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares()); + + // Compute the CV result + var cvResult = mlContext.Regression.CrossValidate(data, pipeline, numFolds: 5); + + // Check that the results are valid + Assert.IsType(cvResult[0].metrics); + Assert.IsType>>(cvResult[0].model); + Assert.True(cvResult[0].scoredTestData is IDataView); + Assert.Equal(5, cvResult.Length); + + // And validate the metrics + foreach (var result in cvResult) + Common.CheckMetrics(result.metrics); + } + } +} diff --git a/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj b/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj index a153655b27..a5dbaca9f8 100644 --- a/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj +++ b/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj @@ -11,7 +11,7 @@ - + diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 6d6ba61191..1bdfa5048b 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -158,7 +158,24 @@ public static class TestDatasets name = "housing", trainFilename = "housing.txt", testFilename = "housing.txt", - loaderSettings = "loader=Text{col=Label:0 
col=Features:~ header=+}" + loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}", + GetLoaderColumns = () => + { + return new[] { + new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), + new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), + new TextLoader.Column("PercentResidental", DataKind.R4, 2), + new TextLoader.Column("PercentNonRetail", DataKind.R4, 3), + new TextLoader.Column("CharlesRiver", DataKind.R4, 4), + new TextLoader.Column("NitricOxides", DataKind.R4, 5), + new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6), + new TextLoader.Column("PercentPre40s", DataKind.R4, 7), + new TextLoader.Column("EmploymentDistance", DataKind.R4, 8), + new TextLoader.Column("HighwayDistance", DataKind.R4, 9), + new TextLoader.Column("TaxRate", DataKind.R4, 10), + new TextLoader.Column("TeacherRatio", DataKind.R4, 11), + }; + } }; public static TestDataset generatedRegressionDatasetmacro = new TestDataset diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index 2d56666b7f..37f4b25c1e 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -46,7 +46,7 @@ - - + + diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs deleted file mode 100644 index a4e3afc2cc..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with - /// a data source (optionally with stratification column), come up with an instantiable transform - /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate - /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of - /// evaluations and optionally trained pipes. (People always want metrics out of xfold, - /// they sometimes want the actual models too.) - /// - [Fact] - void CrossValidation() - { - var ml = new MLContext(seed: 1, conc: 1); - - var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - - // Pipeline. - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaBinaryTrainer.Options { ConvergenceTolerance = 1f, NumThreads = 1, })); - - var cvResult = ml.BinaryClassification.CrossValidate(data, pipeline); - } - } -} diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs deleted file mode 100644 index 254dd73e45..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs +++ /dev/null @@ -1,47 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Reconfigurable predictions: The following should be possible: A user trains a binary classifier, - /// and through the test evaluator gets a PR curve, the based on the PR curve picks a new threshold - /// and configures the scorer (or more precisely instantiates a new scorer over the same predictor) - /// with some threshold derived from that. - /// - [Fact] - public void ReconfigurablePrediction() - { - var ml = new MLContext(seed: 1, conc: 1); - var dataReader = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - - var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true); - - // Pipeline. - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Fit(data); - - var trainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaBinaryTrainer.Options { NumThreads = 1 }); - - var trainData = ml.Data.Cache(pipeline.Transform(data)); // Cache the data right before the trainer to boost the training speed. - var model = trainer.Fit(trainData); - - var scoredTest = model.Transform(pipeline.Transform(testData)); - var metrics = ml.BinaryClassification.Evaluate(scoredTest); - - var newModel = new BinaryPredictionTransformer>(ml, model.Model, trainData.Schema, model.FeatureColumn, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability); - var newScoredTest = newModel.Transform(pipeline.Transform(testData)); - var newMetrics = ml.BinaryClassification.Evaluate(scoredTest); - } - } -}