From 2e4d4b071b82028230566e510d0c8eb04ceb7f40 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 19 Feb 2019 16:55:06 -0800 Subject: [PATCH 1/9] Adding functional tests for all training and evaluation tasks --- test/Microsoft.ML.Functional.Tests/Common.cs | 71 +++++ .../Datasets/Iris.cs | 82 ++++++ .../Datasets/MnistOneClass.cs | 24 ++ .../Datasets/Sentiment.cs | 24 ++ .../Datasets/TrivialMatrixFactorization.cs | 45 +++ .../Evaluation.cs | 278 ++++++++++++++++++ .../Prediction.cs | 3 +- .../Validation.cs | 6 +- test/Microsoft.ML.TestFramework/Datasets.cs | 12 + .../Scenarios/Api/Estimators/Evaluation.cs | 39 --- 10 files changed, 542 insertions(+), 42 deletions(-) create mode 100644 test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs create mode 100644 test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs create mode 100644 test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs create mode 100644 test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs create mode 100644 test/Microsoft.ML.Functional.Tests/Evaluation.cs delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index 5c80ab21a7..b097d7ddf1 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -7,6 +7,7 @@ using System.Linq; using Microsoft.Data.DataView; using Microsoft.ML.Data; +using Microsoft.ML.Data.Evaluators.Metrics; using Microsoft.ML.Functional.Tests.Datasets; using Xunit; @@ -160,6 +161,76 @@ public static void AssertEqual(TypeTestData testType1, TypeTestData testType2) Assert.True(testType1.Ug.Equals(testType2.Ug)); } + /// + /// Check that a object is valid. + /// + /// + /// TODO #2644: At times, AnomalyDetection.Evaluate will return a set of NaN metrics. + /// + /// The metrics object. + public static void CheckMetrics(AnomalyDetectionMetrics metrics) + { + // Perform sanity checks on the metrics. + Assert.InRange(metrics.Auc, 0, 1); + Assert.InRange(metrics.DrAtK, 0, 1); + } + + /// + /// Check that a object is valid. + /// + /// The metrics object. + public static void CheckMetrics(BinaryClassificationMetrics metrics) + { + // Perform sanity checks on the metrics. + Assert.InRange(metrics.Accuracy, 0, 1); + Assert.InRange(metrics.Auc, 0, 1); + Assert.InRange(metrics.Auprc, 0, 1); + Assert.InRange(metrics.F1Score, 0, 1); + Assert.InRange(metrics.NegativePrecision, 0, 1); + Assert.InRange(metrics.NegativeRecall, 0, 1); + Assert.InRange(metrics.PositivePrecision, 0, 1); + Assert.InRange(metrics.PositiveRecall, 0, 1); + } + + /// + /// Check that a object is valid. + /// + /// The metrics object. + public static void CheckMetrics(ClusteringMetrics metrics) + { + // Perform sanity checks on the metrics. + Assert.True(metrics.AvgMinScore >= 0); + Assert.True(metrics.Dbi >= 0); + if (!double.IsNaN(metrics.Nmi)) + Assert.True(metrics.Nmi >= 0 && metrics.Nmi <= 1); + } + + /// + /// Check that a object is valid. + /// + /// The metrics object. + public static void CheckMetrics(MultiClassClassifierMetrics metrics) + { + // Perform sanity checks on the metrics. + Assert.InRange(metrics.AccuracyMacro, 0, 1); + Assert.InRange(metrics.AccuracyMicro, 0, 1); + Assert.True(metrics.LogLoss >= 0); + Assert.InRange(metrics.TopKAccuracy, 0, 1); + } + + /// + /// Check that a object is valid. + /// + /// The metrics object. + public static void CheckMetrics(RankerMetrics metrics) + { + // Perform sanity checks on the metrics. + foreach (var dcg in metrics.Dcg) + Assert.True(dcg >= 0); + foreach (var ndcg in metrics.Ndcg) + Assert.InRange(ndcg, 0, 100); + } + /// /// Check that a object is valid. /// diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs new file mode 100644 index 0000000000..fde531bd6e --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs @@ -0,0 +1,82 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + +using System; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class for the Iris dataset. + /// + /// + /// This class has annotations for automatic deserialization from a file, and contains helper methods + /// for reading from a file and for generating a random dataset as an IEnumerable. + /// + internal sealed class Iris + { + [LoadColumn(0)] + public float Label { get; set; } + + [LoadColumn(1)] + public float SepalLength { get; set; } + + [LoadColumn(2)] + public float SepalWidth { get; set; } + + [LoadColumn(4)] + public float PetalLength { get; set; } + + [LoadColumn(5)] + public float PetalWidth { get; set; } + + /// + /// The list of columns commonly used as features. + /// + public static readonly string[] Features = new string[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" }; + + public static IDataView LoadAsRankingProblem(MLContext mlContext, string filePath, bool hasHeader, char separatorChar, int seed = 1) + { + // Load the Iris data. + var data = mlContext.Data.ReadFromTextFile(filePath, hasHeader: hasHeader, separatorChar: separatorChar); + + // Create a function that generates a random groupId. + var rng = new Random(seed); + Action generateGroupId = (input, output) => + { + output.Label = input.Label; + // The standard set used in tests has 150 rows + output.GroupId = (ushort)rng.Next(0, 30); + output.PetalLength = input.PetalLength; + output.PetalWidth = input.PetalWidth; + output.SepalLength = input.SepalLength; + output.SepalWidth = input.SepalWidth; + }; + + // Describe a pipeline that generates a groupId and converts it to a key. + var pipeline = mlContext.Transforms.CustomMapping(generateGroupId, null) + .Append(mlContext.Transforms.Conversion.MapValueToKey("GroupId")); + + // Transform the data + var transformedData = pipeline.Fit(data).Transform(data); + + return transformedData; + } + } + + /// + /// A class for the Iris dataset with a GroupId column. + /// + internal sealed class IrisWithGroup + { + public float Label { get; set; } + public ushort GroupId { get; set; } + public float SepalLength { get; set; } + public float SepalWidth { get; set; } + public float PetalLength { get; set; } + public float PetalWidth { get; set; } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs new file mode 100644 index 0000000000..a4ea599d42 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs @@ -0,0 +1,24 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class containing one property per . + /// + /// + /// This class has annotations for automatic deserialization from a file, and contains helper methods + /// for reading from a file and for generating a random dataset as an IEnumerable. + /// + internal sealed class MnistOneClass + { + [LoadColumn(0)] + public float Label { get; set; } + + [LoadColumn(1, 784), VectorType(784)] + public float[] Features { get; set; } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs new file mode 100644 index 0000000000..6886c6e094 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs @@ -0,0 +1,24 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class containing one property per . + /// + /// + /// This class has annotations for automatic deserialization from a file, and contains helper methods + /// for reading from a file and for generating a random dataset as an IEnumerable. + /// + internal sealed class TweetSentiment + { + [LoadColumn(0), ColumnName("Label")] + public bool Sentiment { get; set; } + + [LoadColumn(1)] + public string SentimentText { get; set; } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs new file mode 100644 index 0000000000..7d8c1b6398 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs @@ -0,0 +1,45 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + +using System; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class containing one property per . + /// + /// + /// This class has annotations for automatic deserialization from a file, and contains helper methods + /// for reading from a file and for generating a random dataset as an IEnumerable. + /// + internal sealed class TrivialMatrixFactorization + { + [LoadColumn(0)] + public float Label { get; set; } + + [LoadColumn(1)] + public uint MatrixColumnIndex { get; set; } + + [LoadColumn(2)] + public uint MatrixRowIndex { get; set; } + + public static IDataView LoadAndFeaturizeFromTextFile(MLContext mlContext, string filePath, bool hasHeader, char separatorChar) + { + // Load the data from a textfile. + var data = mlContext.Data.ReadFromTextFile(filePath, hasHeader: hasHeader, separatorChar: separatorChar); + + // Describe a pipeline to translate the uints to keys. + var pipeline = mlContext.Transforms.Conversion.MapValueToKey("MatrixColumnIndex") + .Append(mlContext.Transforms.Conversion.MapValueToKey("MatrixRowIndex")); + + // Transform the data. + var transformedData = pipeline.Fit(data).Transform(data); + + return transformedData; + } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs new file mode 100644 index 0000000000..0ada188bb0 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -0,0 +1,278 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Functional.Tests.Datasets; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Trainers.KMeans; +using Microsoft.ML.Trainers.PCA; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Functional.Tests +{ + public class Evaluation : BaseTestClass + { + public Evaluation(ITestOutputHelper output): base(output) + { + } + + /// + /// Train and Evaluate: Anomaly Detection. + /// + [Fact] + public void TrainAndEvaluateAnomalyDetection() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var trainData = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.mnistOneClass.trainFilename), + hasHeader: TestDatasets.mnistOneClass.fileHasHeader, + separatorChar: TestDatasets.mnistOneClass.fileSeparator); + var testData = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.mnistOneClass.testFilename), + hasHeader: TestDatasets.mnistOneClass.fileHasHeader, + separatorChar: TestDatasets.mnistOneClass.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(); + + // Train the model. + var model = pipeline.Fit(trainData); + + // Evaulate the model. + // Note Issue #2464: Using the train dataset will cause NaN metrics to be returned. + var scoredTest = model.Transform(testData); + var metrics = mlContext.AnomalyDetection.Evaluate(scoredTest); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Train and Evaluate: Binary Classification. + /// + [Fact] + public void TrainAndEvaluateBinaryClassification() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.LogisticRegression( + new LogisticRegression.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.BinaryClassification.Evaluate(scoredData); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Train and Evaluate: Clustering. + /// + [Fact] + public void TrainAndEvaluateClustering() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Clustering.Trainers.KMeans(new KMeansPlusPlusTrainer.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.Clustering.Evaluate(scoredData); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Train and Evaluate: Multiclass Classification. + /// + [Fact] + public void TrainAndEvaluateMulticlassClassification() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent( + new SdcaMultiClassTrainer.Options { NumThreads = 1})); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.MulticlassClassification.Evaluate(scoredData); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Train and Evaluate: Ranking. + /// + [Fact] + public void TrainAndEvaluateRanking() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = Iris.LoadAsRankingProblem(mlContext, + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(mlContext.Ranking.Trainers.FastTree(new FastTreeRankingTrainer.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.Ranking.Evaluate(scoredData, label: "Label", groupId: "GroupId"); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Train and Evaluate: Recommendation. + /// + [Fact] + public void TrainAndEvaluateRecommendation() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Get the dataset. + var data = TrivialMatrixFactorization.LoadAndFeaturizeFromTextFile( + mlContext, + GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename), + TestDatasets.trivialMatrixFactorization.fileHasHeader, + TestDatasets.trivialMatrixFactorization.fileSeparator); + + // Create a pipeline to train on the sentiment data. + var pipeline = mlContext.Recommendation().Trainers.MatrixFactorization( + new MatrixFactorizationTrainer.Options{ + MatrixColumnIndexColumnName = "MatrixColumnIndex", + MatrixRowIndexColumnName = "MatrixRowIndex", + LabelColumnName = "Label", + NumberOfIterations = 3, + NumberOfThreads = 1, + ApproximationRank = 4, + }); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.Recommendation().Evaluate(scoredData); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Train and Evaluate: Regression. + /// + [Fact] + public void TrainAndEvaluateRegression() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Get the dataset. + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) + .Read(GetDataPath(TestDatasets.housing.trainFilename)); + + // Create a pipeline to train on the sentiment data. + var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { + "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", + "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) + .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.Regression.Evaluate(scoredData); + + // Check that the metrics returned are valid. + Common.CheckMetrics(metrics); + } + + /// + /// Evaluate With Precision-Recall Curves + /// + /// + /// This is currently not possible using the APIs. + /// + [Fact] + public void TrainAndEvaluateWithPrecisionRecallCurves() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.LogisticRegression( + new LogisticRegression.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaulate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.BinaryClassification.Evaluate(scoredData); + + Common.CheckMetrics(metrics); + + // This scenario is not possible with the current set of APIs + // There could be two ways imaginable: + // 1. Getting a list of (P,R) from the Evaluator (as it has these anyways) + // Not possible. + // 2. Manually setting the classifier threshold and calling evaluate many times: + // Not currently possible: Todo #2465: Allow the setting of threshold and thresholdColumn for scoring. + // Technically, this is possible using custom mappers like so: + // 1. Get a list of all unique probability scores + // 2. For each value of probability: + // a. Write a custom mapper to produce PredictedLabel at that probability threshold + // b. Calculate Precision and Recall with these labels + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs index 24cc049e8f..4f37533ebb 100644 --- a/test/Microsoft.ML.Functional.Tests/Prediction.cs +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -22,7 +22,8 @@ public void ReconfigurablePrediction() var mlContext = new MLContext(seed: 789); // Get the dataset, create a train and test - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2); diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index a39bd14884..cc74ff2227 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -27,7 +27,8 @@ void CrossValidation() var mlContext = new MLContext(seed: 1, conc: 1); // Get the dataset. - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); // Create a pipeline to train on the sentiment data. @@ -60,7 +61,8 @@ public void TrainWithValidationSet() var mlContext = new MLContext(seed: 1, conc: 1); // Get the dataset. - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2); var trainData = dataSplit.TrainSet; diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 7197f1f64b..f654673bf2 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -14,6 +14,8 @@ public class TestDataset public string testFilename; public string validFilename; public string labelFilename; + public char fileSeparator; + public bool fileHasHeader; // REVIEW: Replace these with appropriate SubComponents! public string settings; @@ -158,6 +160,8 @@ public static class TestDatasets name = "housing", trainFilename = "housing.txt", testFilename = "housing.txt", + fileSeparator = '\t', + fileHasHeader = true, loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}", GetLoaderColumns = () => { @@ -206,6 +210,8 @@ public static class TestDatasets name = "sentiment", trainFilename = "wikipedia-detox-250-line-data.tsv", testFilename = "wikipedia-detox-250-line-test.tsv", + fileHasHeader = true, + fileSeparator = '\t', GetLoaderColumns = () => { return new[] @@ -447,6 +453,8 @@ public static class TestDatasets name = "iris", trainFilename = @"iris.txt", testFilename = @"iris.txt", + fileHasHeader = true, + fileSeparator = '\t' }; public static TestDataset irisMissing = new TestDataset() @@ -655,6 +663,8 @@ public static class TestDatasets name = "mnistOneClass", trainFilename = @"MNIST.Train.0-class.tiny.txt", testFilename = @"MNIST.Test.tiny.txt", + fileHasHeader = true, + fileSeparator = '\t', settings = "" }; @@ -704,6 +714,8 @@ public static class TestDatasets name = "trivialMatrixFactorization", trainFilename = @"trivial-train.tsv", testFilename = @"trivial-test.tsv", + fileHasHeader = true, + fileSeparator = '\t', loaderSettings = "loader=Text{col=Label:R4:0 col=User:U4[0-19]:1 col=Item:U4[0-39]:2 header+}" }; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs deleted file mode 100644 index 60fad2c0a3..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Evaluation: Similar to the simple train scenario, except instead of having some - /// predictive structure, be able to score another "test" data file, run the result - /// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot. - /// Getting metrics out of this shoudl be as straightforward and unannoying as possible. - /// - [Fact] - public void Evaluation() - { - var ml = new MLContext(seed: 1, conc: 1); - - // Pipeline. - var pipeline = ml.Data.CreateTextLoader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true) - .Append(ml.Transforms.Text.FeaturizeText("Features", "SentimentText")) - .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaBinaryTrainer.Options { NumThreads = 1 })); - - // Train. - var readerModel = pipeline.Fit(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename))); - - // Evaluate on the test set. - var dataEval = readerModel.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.testFilename))); - var metrics = ml.BinaryClassification.Evaluate(dataEval); - } - } -} From 0b02fb1b9375dda004492935e4c83314d1a07714 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 19 Feb 2019 17:00:31 -0800 Subject: [PATCH 2/9] Updating test summaries. --- test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs | 6 +----- .../Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs | 6 +----- test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs | 6 +----- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs index fde531bd6e..39f741d99a 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs @@ -10,12 +10,8 @@ namespace Microsoft.ML.Functional.Tests.Datasets { /// - /// A class for the Iris dataset. + /// A class for the Iris test dataset. /// - /// - /// This class has annotations for automatic deserialization from a file, and contains helper methods - /// for reading from a file and for generating a random dataset as an IEnumerable. - /// internal sealed class Iris { [LoadColumn(0)] diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs index a4ea599d42..163216bc64 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs @@ -7,12 +7,8 @@ namespace Microsoft.ML.Functional.Tests.Datasets { /// - /// A class containing one property per . + /// A class for reading in the MNIST One Class test dataset. /// - /// - /// This class has annotations for automatic deserialization from a file, and contains helper methods - /// for reading from a file and for generating a random dataset as an IEnumerable. - /// internal sealed class MnistOneClass { [LoadColumn(0)] diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs index 6886c6e094..2465e291b3 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs @@ -7,12 +7,8 @@ namespace Microsoft.ML.Functional.Tests.Datasets { /// - /// A class containing one property per . + /// A class for reading in the Sentiment test dataset. /// - /// - /// This class has annotations for automatic deserialization from a file, and contains helper methods - /// for reading from a file and for generating a random dataset as an IEnumerable. - /// internal sealed class TweetSentiment { [LoadColumn(0), ColumnName("Label")] From acd2f1a5010e47cdf649c50521f7799f05e6f644 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Thu, 21 Feb 2019 12:42:54 -0800 Subject: [PATCH 3/9] Updating comments to end in a period! --- .../Evaluation.cs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 0ada188bb0..80d4c789c4 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -233,7 +233,7 @@ public void TrainAndEvaluateRegression() } /// - /// Evaluate With Precision-Recall Curves + /// Evaluate With Precision-Recall Curves. /// /// /// This is currently not possible using the APIs. @@ -262,17 +262,19 @@ public void TrainAndEvaluateWithPrecisionRecallCurves() Common.CheckMetrics(metrics); - // This scenario is not possible with the current set of APIs + // This scenario is not possible with the current set of APIs. // There could be two ways imaginable: - // 1. Getting a list of (P,R) from the Evaluator (as it has these anyways) - // Not possible. + // 1. Getting a list of (P,R) from the Evaluator (as it calculates most of the information already). + // Not currently possible. // 2. Manually setting the classifier threshold and calling evaluate many times: // Not currently possible: Todo #2465: Allow the setting of threshold and thresholdColumn for scoring. - // Technically, this is possible using custom mappers like so: - // 1. Get a list of all unique probability scores + // Technically, this scenario is possible using custom mappers like so: + // 1. Get a list of all unique probability scores. + // e.g. By reading the IDataView as an IEnumerable, and keeping a hash of known probabilities up to some precision. // 2. For each value of probability: - // a. Write a custom mapper to produce PredictedLabel at that probability threshold - // b. Calculate Precision and Recall with these labels + // a. Write a custom mapper to produce PredictedLabel at that probability threshold. + // b. Calculate Precision and Recall with these labels. + // c. Append the Precision and Recall to an IList. } } } \ No newline at end of file From e9d5bad2107eee55c4869840d41e8b3d3ccf82fd Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Thu, 21 Feb 2019 13:48:20 -0800 Subject: [PATCH 4/9] Addressing PR comments. --- test/Microsoft.ML.Functional.Tests/Common.cs | 3 --- .../Evaluation.cs | 19 +++++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index b097d7ddf1..d5ab5ce056 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -164,9 +164,6 @@ public static void AssertEqual(TypeTestData testType1, TypeTestData testType2) /// /// Check that a object is valid. /// - /// - /// TODO #2644: At times, AnomalyDetection.Evaluate will return a set of NaN metrics. - /// /// The metrics object. public static void CheckMetrics(AnomalyDetectionMetrics metrics) { diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 80d4c789c4..ed6aba15dc 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers.KMeans; -using Microsoft.ML.Trainers.PCA; using Xunit; using Xunit.Abstractions; @@ -41,8 +40,8 @@ public void TrainAndEvaluateAnomalyDetection() // Train the model. var model = pipeline.Fit(trainData); - // Evaulate the model. - // Note Issue #2464: Using the train dataset will cause NaN metrics to be returned. + // Evaluate the model. + // TODO #2464: Using the train dataset will cause NaN metrics to be returned. var scoredTest = model.Transform(testData); var metrics = mlContext.AnomalyDetection.Evaluate(scoredTest); @@ -71,7 +70,7 @@ public void TrainAndEvaluateBinaryClassification() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.BinaryClassification.Evaluate(scoredData); @@ -99,7 +98,7 @@ public void TrainAndEvaluateClustering() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.Clustering.Evaluate(scoredData); @@ -128,7 +127,7 @@ public void TrainAndEvaluateMulticlassClassification() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.MulticlassClassification.Evaluate(scoredData); @@ -156,7 +155,7 @@ public void TrainAndEvaluateRanking() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.Ranking.Evaluate(scoredData, label: "Label", groupId: "GroupId"); @@ -193,7 +192,7 @@ public void TrainAndEvaluateRecommendation() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.Recommendation().Evaluate(scoredData); @@ -224,7 +223,7 @@ public void TrainAndEvaluateRegression() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.Regression.Evaluate(scoredData); @@ -256,7 +255,7 @@ public void TrainAndEvaluateWithPrecisionRecallCurves() // Train the model. var model = pipeline.Fit(data); - // Evaulate the model. + // Evaluate the model. var scoredData = model.Transform(data); var metrics = mlContext.BinaryClassification.Evaluate(scoredData); From 7012ea621829dd695b67ebeeb36be812a7148294 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 22 Feb 2019 12:11:55 -0800 Subject: [PATCH 5/9] Updating binary classification check to also check calibrated classifiers --- test/Microsoft.ML.Functional.Tests/Common.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index d5ab5ce056..ae6cb630ae 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -189,6 +189,18 @@ public static void CheckMetrics(BinaryClassificationMetrics metrics) Assert.InRange(metrics.PositiveRecall, 0, 1); } + /// + /// Check that a object is valid. + /// + /// The metrics object. + public static void CheckMetrics(CalibratedBinaryClassificationMetrics metrics) + { + Assert.InRange(metrics.Entropy, double.NegativeInfinity, 1); + Assert.InRange(metrics.LogLoss, double.NegativeInfinity, 1); + Assert.InRange(metrics.LogLossReduction, double.NegativeInfinity, 100); + CheckMetrics(metrics as BinaryClassificationMetrics); + } + /// /// Check that a object is valid. /// From 12d3dfe10d10ca40b9cfd9ee6ec8d4839941e0bb Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 22 Feb 2019 15:35:56 -0800 Subject: [PATCH 6/9] Normalizing names after merge with master. --- test/Microsoft.ML.Functional.Tests/Common.cs | 22 ++++++------------- .../Evaluation.cs | 16 +++++++------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index 61c0e06ba8..bcba3a8e27 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -165,9 +165,8 @@ public static void AssertEqual(TypeTestData testType1, TypeTestData testType2) /// Check that a object is valid. /// /// The metrics object. - public static void CheckMetrics(AnomalyDetectionMetrics metrics) + public static void AssertMetrics(AnomalyDetectionMetrics metrics) { - // Perform sanity checks on the metrics. Assert.InRange(metrics.Auc, 0, 1); Assert.InRange(metrics.DrAtK, 0, 1); } @@ -176,9 +175,8 @@ public static void CheckMetrics(AnomalyDetectionMetrics metrics) /// Check that a object is valid. /// /// The metrics object. - public static void CheckMetrics(BinaryClassificationMetrics metrics) + public static void AssertMetrics(BinaryClassificationMetrics metrics) { - // Perform sanity checks on the metrics. Assert.InRange(metrics.Accuracy, 0, 1); Assert.InRange(metrics.Auc, 0, 1); Assert.InRange(metrics.Auprc, 0, 1); @@ -193,21 +191,20 @@ public static void CheckMetrics(BinaryClassificationMetrics metrics) /// Check that a object is valid. /// /// The metrics object. - public static void CheckMetrics(CalibratedBinaryClassificationMetrics metrics) + public static void AssertMetrics(CalibratedBinaryClassificationMetrics metrics) { Assert.InRange(metrics.Entropy, double.NegativeInfinity, 1); Assert.InRange(metrics.LogLoss, double.NegativeInfinity, 1); Assert.InRange(metrics.LogLossReduction, double.NegativeInfinity, 100); - CheckMetrics(metrics as BinaryClassificationMetrics); + AssertMetrics(metrics as BinaryClassificationMetrics); } /// /// Check that a object is valid. /// /// The metrics object. - public static void CheckMetrics(ClusteringMetrics metrics) + public static void AssertMetrics(ClusteringMetrics metrics) { - // Perform sanity checks on the metrics. Assert.True(metrics.AvgMinScore >= 0); Assert.True(metrics.Dbi >= 0); if (!double.IsNaN(metrics.Nmi)) @@ -218,9 +215,8 @@ public static void CheckMetrics(ClusteringMetrics metrics) /// Check that a object is valid. /// /// The metrics object. - public static void CheckMetrics(MultiClassClassifierMetrics metrics) + public static void AssertMetrics(MultiClassClassifierMetrics metrics) { - // Perform sanity checks on the metrics. Assert.InRange(metrics.AccuracyMacro, 0, 1); Assert.InRange(metrics.AccuracyMicro, 0, 1); Assert.True(metrics.LogLoss >= 0); @@ -231,9 +227,8 @@ public static void CheckMetrics(MultiClassClassifierMetrics metrics) /// Check that a object is valid. /// /// The metrics object. - public static void CheckMetrics(RankerMetrics metrics) + public static void AssertMetrics(RankerMetrics metrics) { - // Perform sanity checks on the metrics. foreach (var dcg in metrics.Dcg) Assert.True(dcg >= 0); foreach (var ndcg in metrics.Ndcg) @@ -246,7 +241,6 @@ public static void CheckMetrics(RankerMetrics metrics) /// The metrics object. public static void AssertMetrics(RegressionMetrics metrics) { - // Perform sanity checks on the metrics. Assert.True(metrics.Rms >= 0); Assert.True(metrics.L1 >= 0); Assert.True(metrics.L2 >= 0); @@ -259,7 +253,6 @@ public static void AssertMetrics(RegressionMetrics metrics) /// The object. public static void AssertMetricStatistics(MetricStatistics metric) { - // Perform sanity checks on the metrics. Assert.True(metric.StandardDeviation >= 0); Assert.True(metric.StandardError >= 0); } @@ -270,7 +263,6 @@ public static void AssertMetricStatistics(MetricStatistics metric) /// The metrics object. public static void AssertMetricsStatistics(RegressionMetricsStatistics metrics) { - // The mean can be any float; the standard deviation and error must be >=0. AssertMetricStatistics(metrics.Rms); AssertMetricStatistics(metrics.L1); AssertMetricStatistics(metrics.L2); diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index ed6aba15dc..6435f0806e 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -46,7 +46,7 @@ public void TrainAndEvaluateAnomalyDetection() var metrics = mlContext.AnomalyDetection.Evaluate(scoredTest); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -75,7 +75,7 @@ public void TrainAndEvaluateBinaryClassification() var metrics = mlContext.BinaryClassification.Evaluate(scoredData); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -103,7 +103,7 @@ public void TrainAndEvaluateClustering() var metrics = mlContext.Clustering.Evaluate(scoredData); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -132,7 +132,7 @@ public void TrainAndEvaluateMulticlassClassification() var metrics = mlContext.MulticlassClassification.Evaluate(scoredData); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -160,7 +160,7 @@ public void TrainAndEvaluateRanking() var metrics = mlContext.Ranking.Evaluate(scoredData, label: "Label", groupId: "GroupId"); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -197,7 +197,7 @@ public void TrainAndEvaluateRecommendation() var metrics = mlContext.Recommendation().Evaluate(scoredData); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -228,7 +228,7 @@ public void TrainAndEvaluateRegression() var metrics = mlContext.Regression.Evaluate(scoredData); // Check that the metrics returned are valid. - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); } /// @@ -259,7 +259,7 @@ public void TrainAndEvaluateWithPrecisionRecallCurves() var scoredData = model.Transform(data); var metrics = mlContext.BinaryClassification.Evaluate(scoredData); - Common.CheckMetrics(metrics); + Common.AssertMetrics(metrics); // This scenario is not possible with the current set of APIs. // There could be two ways imaginable: From 42af510535cab35587d8082bba89f1f3d71ce163 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 22 Feb 2019 15:55:36 -0800 Subject: [PATCH 7/9] Addressing PR comments. --- .../Datasets/Iris.cs | 4 +-- .../Datasets/TrivialMatrixFactorization.cs | 6 +--- .../Evaluation.cs | 31 ++++++++++++++++++- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs index 39f741d99a..d1cbfa3fad 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs @@ -45,7 +45,7 @@ public static IDataView LoadAsRankingProblem(MLContext mlContext, string filePat { output.Label = input.Label; // The standard set used in tests has 150 rows - output.GroupId = (ushort)rng.Next(0, 30); + output.GroupId = rng.Next(0, 30); output.PetalLength = input.PetalLength; output.PetalWidth = input.PetalWidth; output.SepalLength = input.SepalLength; @@ -69,7 +69,7 @@ public static IDataView LoadAsRankingProblem(MLContext mlContext, string filePat internal sealed class IrisWithGroup { public float Label { get; set; } - public ushort GroupId { get; set; } + public int GroupId { get; set; } public float SepalLength { get; set; } public float SepalWidth { get; set; } public float PetalLength { get; set; } diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs index 7d8c1b6398..005fc98c72 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs @@ -10,12 +10,8 @@ namespace Microsoft.ML.Functional.Tests.Datasets { /// - /// A class containing one property per . + /// A class describing the TrivialMatrixFactorization test dataset. /// - /// - /// This class has annotations for automatic deserialization from a file, and contains helper methods - /// for reading from a file and for generating a random dataset as an IEnumerable. - /// internal sealed class TrivialMatrixFactorization { [LoadColumn(0)] diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 6435f0806e..da3c71d621 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -50,13 +50,42 @@ public void TrainAndEvaluateAnomalyDetection() } /// - /// Train and Evaluate: Binary Classification. + /// Train and Evaluate: Binary Classification with no calibration. /// [Fact] public void TrainAndEvaluateBinaryClassification() { var mlContext = new MLContext(seed: 1, conc: 1); + var data = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( + new SdcaNonCalibratedBinaryTrainer.Options { NumThreads = 1 })); + + // Train the model. + var model = pipeline.Fit(data); + + // Evaluate the model. + var scoredData = model.Transform(data); + var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(scoredData); + + // Check that the metrics returned are valid. + Common.AssertMetrics(metrics); + } + + /// + /// Train and Evaluate: Binary Classification with a calibrated predictor. + /// + [Fact] + public void TrainAndEvaluateBinaryClassificationWithCalibration() + { + var mlContext = new MLContext(seed: 1, conc: 1); + var data = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: TestDatasets.Sentiment.fileHasHeader, separatorChar: TestDatasets.Sentiment.fileSeparator); From 89e44cb04e8e5703f4ff6e052bddcf4a9cfd63e4 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 22 Feb 2019 16:47:25 -0800 Subject: [PATCH 8/9] Updating to fix master merge issues around test datasets. --- .../Datasets/MnistOneClass.cs | 19 ++++++++++++++----- .../Evaluation.cs | 12 ++++++------ test/Microsoft.ML.TestFramework/Datasets.cs | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs index 163216bc64..07b26d3d9c 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs @@ -6,15 +6,24 @@ namespace Microsoft.ML.Functional.Tests.Datasets { - /// - /// A class for reading in the MNIST One Class test dataset. - /// internal sealed class MnistOneClass { - [LoadColumn(0)] + private const int _featureLength = 783; + public float Label { get; set; } - [LoadColumn(1, 784), VectorType(784)] public float[] Features { get; set; } + + public static TextLoader GetTextLoader(MLContext mlContext, bool hasHeader, char separatorChar) + { + return mlContext.Data.CreateTextLoader( + new[] { + new TextLoader.Column("Label", DataKind.R4, 0), + new TextLoader.Column("Features", DataKind.R4, 1, 1 + _featureLength) + }, + separatorChar: separatorChar, + hasHeader: hasHeader, + allowSparse: true); + } } } diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index da3c71d621..a3f6d10b05 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -27,12 +27,12 @@ public void TrainAndEvaluateAnomalyDetection() { var mlContext = new MLContext(seed: 1, conc: 1); - var trainData = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.mnistOneClass.trainFilename), - hasHeader: TestDatasets.mnistOneClass.fileHasHeader, - separatorChar: TestDatasets.mnistOneClass.fileSeparator); - var testData = mlContext.Data.ReadFromTextFile(GetDataPath(TestDatasets.mnistOneClass.testFilename), - hasHeader: TestDatasets.mnistOneClass.fileHasHeader, - separatorChar: TestDatasets.mnistOneClass.fileSeparator); + var trainData = MnistOneClass.GetTextLoader(mlContext, + TestDatasets.mnistOneClass.fileHasHeader, TestDatasets.mnistOneClass.fileSeparator) + .Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename)); + var testData = MnistOneClass.GetTextLoader(mlContext, + TestDatasets.mnistOneClass.fileHasHeader, TestDatasets.mnistOneClass.fileSeparator) + .Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); // Create a training pipeline. var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(); diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index f654673bf2..abc9862049 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -663,7 +663,7 @@ public static class TestDatasets name = "mnistOneClass", trainFilename = @"MNIST.Train.0-class.tiny.txt", testFilename = @"MNIST.Test.tiny.txt", - fileHasHeader = true, + fileHasHeader = false, fileSeparator = '\t', settings = "" }; From 051ba0171969e3a02a18f722232adf3e9bba0599 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Sun, 24 Feb 2019 14:08:41 -0800 Subject: [PATCH 9/9] Marking matrix factorization test with the new attribute. --- test/Microsoft.ML.Functional.Tests/Evaluation.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index a3f6d10b05..6ffec01b32 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -5,6 +5,7 @@ using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; +using Microsoft.ML.TestFramework.Attributes; using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers.KMeans; @@ -195,7 +196,7 @@ public void TrainAndEvaluateRanking() /// /// Train and Evaluate: Recommendation. /// - [Fact] + [MatrixFactorizationFact] public void TrainAndEvaluateRecommendation() { var mlContext = new MLContext(seed: 1, conc: 1);