diff --git a/build.proj b/build.proj index b38293fda0..26a68f7b40 100644 --- a/build.proj +++ b/build.proj @@ -7,6 +7,7 @@ + @@ -75,10 +76,19 @@ Targets="Pack" /> - - + + + + + + + https://aka.ms/tlc-resources/benchmarks/%(Identity) + $(MSBuildThisFileDirectory)/test/data/external/%(Identity) + + + diff --git a/build/ExternalBenchmarkDataFiles.props b/build/ExternalBenchmarkDataFiles.props new file mode 100644 index 0000000000..ad3d350d60 --- /dev/null +++ b/build/ExternalBenchmarkDataFiles.props @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/test/Microsoft.ML.Benchmarks/Helpers.cs b/test/Microsoft.ML.Benchmarks/Helpers.cs new file mode 100644 index 0000000000..55832fa13f --- /dev/null +++ b/test/Microsoft.ML.Benchmarks/Helpers.cs @@ -0,0 +1,22 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.IO; +using System.Text; + +namespace Microsoft.ML.Benchmarks +{ + internal class Helpers + { + public static string DatasetNotFound = "Could not find {0} Please ensure you have run 'build.cmd -- /t:DownloadExternalTestFiles /p:IncludeBenchmarkData=true' from the root"; + } + + // Adding this class to not print anything to the console. + // This is required for the current version of BenchmarkDotNet + internal class EmptyWriter : TextWriter + { + internal static readonly EmptyWriter Instance = new EmptyWriter(); + public override Encoding Encoding => null; + } +} diff --git a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj index d78a5881fa..44eaec093e 100644 --- a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj +++ b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj @@ -1,4 +1,5 @@  + Exe 7.2 @@ -22,6 +23,7 @@ + @@ -34,9 +36,13 @@ PreserveNewest - + + + external\%(Identity) + + + PreserveNewest diff --git a/test/Microsoft.ML.Benchmarks/Numeric/Ranking.cs b/test/Microsoft.ML.Benchmarks/Numeric/Ranking.cs new file mode 100644 index 0000000000..a8c008ab04 --- /dev/null +++ b/test/Microsoft.ML.Benchmarks/Numeric/Ranking.cs @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using BenchmarkDotNet.Attributes; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.RunTests; +using Microsoft.ML.Runtime.Tools; +using System.IO; + +namespace Microsoft.ML.Benchmarks +{ + public class Ranking + { + private string _mslrWeb10k_Validate; + private string _mslrWeb10k_Train; + private string _mslrWeb10k_Test; + private string _modelPath_MSLR; + + [GlobalSetup(Targets = new string[] { + nameof(TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking), + nameof(TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_LightGBMRanking) })] + public void SetupTrainingSpeedTests() + { + _mslrWeb10k_Validate = Path.GetFullPath(TestDatasets.MSLRWeb.validFilename); + _mslrWeb10k_Train = Path.GetFullPath(TestDatasets.MSLRWeb.trainFilename); + + if (!File.Exists(_mslrWeb10k_Validate)) + throw new FileNotFoundException(string.Format(Helpers.DatasetNotFound, _mslrWeb10k_Validate)); + + if (!File.Exists(_mslrWeb10k_Train)) + throw new FileNotFoundException(string.Format(Helpers.DatasetNotFound, _mslrWeb10k_Train)); + } + + [GlobalSetup(Target = nameof(Test_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking))] + public void SetupScoringSpeedTests() + { + _mslrWeb10k_Test = Path.GetFullPath(TestDatasets.MSLRWeb.testFilename); + if (!File.Exists(_mslrWeb10k_Test)) + throw new FileNotFoundException(string.Format(Helpers.DatasetNotFound, _mslrWeb10k_Test)); + + SetupTrainingSpeedTests(); + _modelPath_MSLR = Path.Combine(Directory.GetCurrentDirectory(), @"FastTreeRankingModel.zip"); + + string cmd = @"TrainTest test=" + _mslrWeb10k_Validate + + " eval=RankingEvaluator{t=10}" + + " data=" + _mslrWeb10k_Train + + " loader=TextLoader{col=Label:R4:0 col=GroupId:TX:1 col=Features:R4:2-138}" + + " xf=HashTransform{col=GroupId}" + + " xf=NAHandleTransform{col=Features}" + + " tr=FastTreeRanking{}" + + " out={" + _modelPath_MSLR + "}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + { + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); + } + } + + [Benchmark] + public void TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking() + { + string cmd = @"TrainTest test=" + _mslrWeb10k_Validate + + " eval=RankingEvaluator{t=10}" + + " data=" + _mslrWeb10k_Train + + " loader=TextLoader{col=Label:R4:0 col=GroupId:TX:1 col=Features:R4:2-138}" + + " xf=HashTransform{col=GroupId} xf=NAHandleTransform{col=Features}" + + " tr=FastTreeRanking{}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + { + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); + } + } + + [Benchmark] + public void TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_LightGBMRanking() + { + string cmd = @"TrainTest test=" + _mslrWeb10k_Validate + + " eval=RankingEvaluator{t=10}" + + " data=" + _mslrWeb10k_Train + + " loader=TextLoader{col=Label:R4:0 col=GroupId:TX:1 col=Features:R4:2-138}" + + " xf=HashTransform{col=GroupId}" + + " xf=NAHandleTransform{col=Features}" + + " tr=LightGBMRanking{}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + { + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); + } + } + + [Benchmark] + public void Test_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking() + { + // This benchmark is profiling bulk scoring speed and not training speed. + string cmd = @"Test data=" + _mslrWeb10k_Test + " in="+ _modelPath_MSLR; + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + { + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); + } + } + } +} diff --git a/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs b/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs index 0a93b28c32..364a32f0bf 100644 --- a/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs +++ b/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs @@ -8,18 +8,9 @@ using Microsoft.ML.Runtime.RunTests; using Microsoft.ML.Runtime.Tools; using System.IO; -using System.Text; namespace Microsoft.ML.Benchmarks { - // Adding this class to not print anything to the console. - // This is required for the current version of BenchmarkDotNet - internal class EmptyWriter : TextWriter - { - internal static readonly EmptyWriter Instance = new EmptyWriter(); - public override Encoding Encoding => null; - } - public class MultiClassClassification { private string _dataPath_Wiki; @@ -35,9 +26,7 @@ public void SetupTrainingSpeedTests() _dataPath_Wiki = Path.GetFullPath(TestDatasets.WikiDetox.trainFilename); if (!File.Exists(_dataPath_Wiki)) - { - throw new FileNotFoundException($"Could not find {_dataPath_Wiki} Please ensure you have run 'build.cmd -- /t:DownloadExternalTestFiles /p:IncludeBenchmarkData=true' from the root"); - } + throw new FileNotFoundException(string.Format(Helpers.DatasetNotFound, _dataPath_Wiki)); } [GlobalSetup(Target = nameof(Test_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron))] @@ -45,30 +34,51 @@ public void SetupScoringSpeedTests() { SetupTrainingSpeedTests(); _modelPath_Wiki = Path.Combine(Directory.GetCurrentDirectory(), @"WikiModel.zip"); - string cmd = @"CV k=5 data=" + _dataPath_Wiki + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}} xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=OVA{p=AveragedPerceptron{iter=10}} out={" + _modelPath_Wiki + "}"; - using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + + string cmd = @"CV k=5 data=" + _dataPath_Wiki + + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4}" + + " xf=CategoricalTransform{col=ns}" + + " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" + + " xf=Concat{col=Features:FeaturesText,logged_in,ns}" + + " tr=OVA{p=AveragedPerceptron{iter=10}}" + + " out={" + _modelPath_Wiki + "}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { - Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); } } [Benchmark] public void CV_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron() { - string cmd = @"CV k=5 data=" + _dataPath_Wiki + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}} xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=OVA{p=AveragedPerceptron{iter=10}}"; - using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + string cmd = @"CV k=5 data=" + _dataPath_Wiki + + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" + + " xf=Convert{col=logged_in type=R4}" + + " xf=CategoricalTransform{col=ns}" + + " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" + + " xf=Concat{col=Features:FeaturesText,logged_in,ns}" + + " tr=OVA{p=AveragedPerceptron{iter=10}}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { - Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); } } [Benchmark] public void CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass() { - string cmd = @"CV k=5 data=" + _dataPath_Wiki + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}} xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=LightGBMMulticlass{}"; - using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + string cmd = @"CV k=5 data=" + _dataPath_Wiki + + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" + + " xf=Convert{col=logged_in type=R4}" + + " xf=CategoricalTransform{col=ns}" + + " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" + + " xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=LightGBMMulticlass{}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { - Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); } } @@ -78,29 +88,45 @@ public void Test_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron() // This benchmark is profiling bulk scoring speed and not training speed. string modelpath = Path.Combine(Directory.GetCurrentDirectory(), @"WikiModel.fold000.zip"); string cmd = @"Test data=" + _dataPath_Wiki + " in=" + modelpath; - using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { - Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); } } [Benchmark] public void CV_Multiclass_WikiDetox_WordEmbeddings_OVAAveragedPerceptron() { - string cmd = @"CV tr=OVA{p=AveragedPerceptron{iter=10}} k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor=NGramExtractorTransform{ngram=2}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesText,FeaturesWordEmbedding,logged_in,ns}"; - using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + string cmd = @"CV k=5 data=" + _dataPath_Wiki + + " tr=OVA{p=AveragedPerceptron{iter=10}}" + + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" + + " xf=Convert{col=logged_in type=R4}" + + " xf=CategoricalTransform{col=ns}" + + " xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor=NGramExtractorTransform{ngram=2}}" + + " xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D}" + + " xf=Concat{col=Features:FeaturesText,FeaturesWordEmbedding,logged_in,ns}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { - Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); } } [Benchmark] public void CV_Multiclass_WikiDetox_WordEmbeddings_SDCAMC() { - string cmd = @"CV tr=SDCAMC k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor={} charExtractor={}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesWordEmbedding,logged_in,ns}"; - using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + string cmd = @"CV k=5 data=" + _dataPath_Wiki + + " tr=SDCAMC" + + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" + + " xf=Convert{col=logged_in type=R4}" + + " xf=CategoricalTransform{col=ns}" + + " xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor={} charExtractor={}}" + + " xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D}" + + " xf=Concat{col=Features:FeaturesWordEmbedding,logged_in,ns}"; + + using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { - Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); } } } diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 888a06afce..b9d0cad9a5 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -162,6 +162,14 @@ public static class TestDatasets testFilename = "external/WikiDetoxAnnotated160kRows.tsv" }; + public static TestDataset MSLRWeb = new TestDataset + { + name = "MSLRWeb", + trainFilename = "external/MSLRWeb10KTrain720kRows.tsv", + validFilename = "external/MSLRWeb10KValidate240kRows.tsv", + testFilename = "external/MSLRWeb10KTest240kRows.tsv" + }; + public static TestDataset Sentiment = new TestDataset { name = "sentiment", diff --git a/test/data/README.md b/test/data/README.md index ea9133e33e..165c928ba7 100644 --- a/test/data/README.md +++ b/test/data/README.md @@ -46,6 +46,25 @@ Redistributing the dataset "taxi-fare-test.csv", "taxi-fare-train.csv" with attr > > The dataset is provided under terms provided by City of New York: https://opendata.cityofnewyork.us/overview/#termsofuse. +### MSLR-WEB10K, MSLR-WEB30K + +This dataset is originally from [Introducing LETOR 4.0 Datasets](https://arxiv.org/abs/1306.2597). +The dataset is under a CC-by 4.0 license. +``` +@article{DBLP:journals/corr/QinL13, + author = {Tao Qin and + Tie{-}Yan Liu}, + title = {Introducing {LETOR} 4.0 Datasets}, + journal = {CoRR}, + volume = {abs/1306.2597}, + year = {2013}, + url = {https://arxiv.org/abs/1306.2597}, + timestamp = {Mon, 01 Jul 2013 20:31:25 +0200}, + biburl = {https://dblp.uni-trier.de/rec/bib/journals/corr/QinL13}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + # Images ### Located in `images` folder