diff --git a/build.proj b/build.proj
index b38293fda0..26a68f7b40 100644
--- a/build.proj
+++ b/build.proj
@@ -7,6 +7,7 @@
+
@@ -75,10 +76,19 @@
Targets="Pack" />
-
-
+
+
+
+
+
+
+ https://aka.ms/tlc-resources/benchmarks/%(Identity)
+ $(MSBuildThisFileDirectory)/test/data/external/%(Identity)
+
+
+
diff --git a/build/ExternalBenchmarkDataFiles.props b/build/ExternalBenchmarkDataFiles.props
new file mode 100644
index 0000000000..ad3d350d60
--- /dev/null
+++ b/build/ExternalBenchmarkDataFiles.props
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/Microsoft.ML.Benchmarks/Helpers.cs b/test/Microsoft.ML.Benchmarks/Helpers.cs
new file mode 100644
index 0000000000..55832fa13f
--- /dev/null
+++ b/test/Microsoft.ML.Benchmarks/Helpers.cs
@@ -0,0 +1,31 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using System.Text;
+
+namespace Microsoft.ML.Benchmarks
+{
+    /// <summary>
+    /// Shared constants for the benchmark classes.
+    /// </summary>
+    internal static class Helpers
+    {
+        /// <summary>
+        /// Composite format string for the "dataset missing" error; <c>{0}</c> is the path that was looked up.
+        /// </summary>
+        public const string DatasetNotFound = "Could not find {0} Please ensure you have run 'build.cmd -- /t:DownloadExternalTestFiles /p:IncludeBenchmarkData=true' from the root";
+    }
+
+    /// <summary>
+    /// Writer handed to the benchmark environments as their output writer so that nothing is printed to the console.
+    /// This is required for the current version of BenchmarkDotNet.
+    /// </summary>
+    internal sealed class EmptyWriter : TextWriter
+    {
+        internal static readonly EmptyWriter Instance = new EmptyWriter();
+        // Encoding is the only member overridden here; it reports no encoding.
+        public override Encoding Encoding => null;
+    }
+}
diff --git a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
index d78a5881fa..44eaec093e 100644
--- a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
+++ b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
@@ -1,4 +1,5 @@
+
Exe
7.2
@@ -22,6 +23,7 @@
+
@@ -34,9 +36,13 @@
PreserveNewest
-
+
+
+ external\%(Identity)
+
+
+
PreserveNewest
diff --git a/test/Microsoft.ML.Benchmarks/Numeric/Ranking.cs b/test/Microsoft.ML.Benchmarks/Numeric/Ranking.cs
new file mode 100644
index 0000000000..a8c008ab04
--- /dev/null
+++ b/test/Microsoft.ML.Benchmarks/Numeric/Ranking.cs
@@ -0,0 +1,118 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using BenchmarkDotNet.Attributes;
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.RunTests;
+using Microsoft.ML.Runtime.Tools;
+using System.IO;
+
+namespace Microsoft.ML.Benchmarks
+{
+    /// <summary>
+    /// Benchmarks that run ranking TrainTest and Test commands through <c>Maml.MainCore</c>
+    /// against the MSLRWeb dataset files.
+    /// </summary>
+    public class Ranking
+    {
+        // Loader and transform arguments shared by every TrainTest command in this class.
+        private const string LoaderAndTransforms =
+            " loader=TextLoader{col=Label:R4:0 col=GroupId:TX:1 col=Features:R4:2-138}" +
+            " xf=HashTransform{col=GroupId}" +
+            " xf=NAHandleTransform{col=Features}";
+
+        private string _mslrWeb10k_Validate;
+        private string _mslrWeb10k_Train;
+        private string _mslrWeb10k_Test;
+        private string _modelPath_MSLR;
+
+        /// <summary>
+        /// Resolves the validation and training dataset paths for the training benchmarks.
+        /// </summary>
+        /// <exception cref="FileNotFoundException">A dataset file does not exist.</exception>
+        [GlobalSetup(Targets = new string[] {
+            nameof(TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking),
+            nameof(TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_LightGBMRanking) })]
+        public void SetupTrainingSpeedTests()
+        {
+            _mslrWeb10k_Validate = GetExistingDatasetPath(TestDatasets.MSLRWeb.validFilename);
+            _mslrWeb10k_Train = GetExistingDatasetPath(TestDatasets.MSLRWeb.trainFilename);
+        }
+
+        /// <summary>
+        /// Resolves the test dataset path, runs the training setup, and trains the FastTreeRanking
+        /// model file that the scoring benchmark loads.
+        /// </summary>
+        /// <exception cref="FileNotFoundException">A dataset file does not exist.</exception>
+        [GlobalSetup(Target = nameof(Test_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking))]
+        public void SetupScoringSpeedTests()
+        {
+            _mslrWeb10k_Test = GetExistingDatasetPath(TestDatasets.MSLRWeb.testFilename);
+
+            SetupTrainingSpeedTests();
+            _modelPath_MSLR = Path.Combine(Directory.GetCurrentDirectory(), @"FastTreeRankingModel.zip");
+
+            // Same command as the FastTree training benchmark, plus an output path for the model.
+            RunCommand(BuildTrainTestCommand("FastTreeRanking{}") + " out={" + _modelPath_MSLR + "}");
+        }
+
+        /// <summary>Runs TrainTest with the FastTreeRanking trainer.</summary>
+        [Benchmark]
+        public void TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking()
+        {
+            RunCommand(BuildTrainTestCommand("FastTreeRanking{}"));
+        }
+
+        /// <summary>Runs TrainTest with the LightGBMRanking trainer.</summary>
+        [Benchmark]
+        public void TrainTest_Ranking_MSLRWeb10K_RawNumericFeatures_LightGBMRanking()
+        {
+            RunCommand(BuildTrainTestCommand("LightGBMRanking{}"));
+        }
+
+        /// <summary>Runs the Test command on the test dataset with the model written during setup.</summary>
+        [Benchmark]
+        public void Test_Ranking_MSLRWeb10K_RawNumericFeatures_FastTreeRanking()
+        {
+            // This benchmark is profiling bulk scoring speed and not training speed.
+            RunCommand(@"Test data=" + _mslrWeb10k_Test + " in=" + _modelPath_MSLR);
+        }
+
+        /// <summary>
+        /// Returns the full path for <paramref name="relativePath"/>, throwing if no file exists there.
+        /// </summary>
+        private static string GetExistingDatasetPath(string relativePath)
+        {
+            string fullPath = Path.GetFullPath(relativePath);
+            if (!File.Exists(fullPath))
+                throw new FileNotFoundException(string.Format(Helpers.DatasetNotFound, fullPath), fullPath);
+
+            return fullPath;
+        }
+
+        /// <summary>
+        /// Builds the TrainTest command line for the given trainer argument, e.g. <c>FastTreeRanking{}</c>.
+        /// </summary>
+        private string BuildTrainTestCommand(string trainer)
+        {
+            return @"TrainTest test=" + _mslrWeb10k_Validate +
+                " eval=RankingEvaluator{t=10}" +
+                " data=" + _mslrWeb10k_Train +
+                LoaderAndTransforms +
+                " tr=" + trainer;
+        }
+
+        /// <summary>
+        /// Runs one command through <c>Maml.MainCore</c> in a non-verbose environment whose output goes to <see cref="EmptyWriter"/>.
+        /// </summary>
+        private static void RunCommand(string cmd)
+        {
+            using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+            {
+                Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
+            }
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs b/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs
index 0a93b28c32..364a32f0bf 100644
--- a/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs
+++ b/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs
@@ -8,18 +8,9 @@
using Microsoft.ML.Runtime.RunTests;
using Microsoft.ML.Runtime.Tools;
using System.IO;
-using System.Text;
namespace Microsoft.ML.Benchmarks
{
- // Adding this class to not print anything to the console.
- // This is required for the current version of BenchmarkDotNet
- internal class EmptyWriter : TextWriter
- {
- internal static readonly EmptyWriter Instance = new EmptyWriter();
- public override Encoding Encoding => null;
- }
-
public class MultiClassClassification
{
private string _dataPath_Wiki;
@@ -35,9 +26,7 @@ public void SetupTrainingSpeedTests()
_dataPath_Wiki = Path.GetFullPath(TestDatasets.WikiDetox.trainFilename);
if (!File.Exists(_dataPath_Wiki))
- {
- throw new FileNotFoundException($"Could not find {_dataPath_Wiki} Please ensure you have run 'build.cmd -- /t:DownloadExternalTestFiles /p:IncludeBenchmarkData=true' from the root");
- }
+ throw new FileNotFoundException(string.Format(Helpers.DatasetNotFound, _dataPath_Wiki));
}
[GlobalSetup(Target = nameof(Test_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron))]
@@ -45,30 +34,51 @@ public void SetupScoringSpeedTests()
{
SetupTrainingSpeedTests();
_modelPath_Wiki = Path.Combine(Directory.GetCurrentDirectory(), @"WikiModel.zip");
- string cmd = @"CV k=5 data=" + _dataPath_Wiki + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}} xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=OVA{p=AveragedPerceptron{iter=10}} out={" + _modelPath_Wiki + "}";
- using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+
+ string cmd = @"CV k=5 data=" + _dataPath_Wiki +
+ " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4}" +
+ " xf=CategoricalTransform{col=ns}" +
+ " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" +
+ " xf=Concat{col=Features:FeaturesText,logged_in,ns}" +
+ " tr=OVA{p=AveragedPerceptron{iter=10}}" +
+ " out={" + _modelPath_Wiki + "}";
+
+ using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
{
- Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
+ Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
}
}
[Benchmark]
public void CV_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron()
{
- string cmd = @"CV k=5 data=" + _dataPath_Wiki + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}} xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=OVA{p=AveragedPerceptron{iter=10}}";
- using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+ string cmd = @"CV k=5 data=" + _dataPath_Wiki +
+ " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" +
+ " xf=Convert{col=logged_in type=R4}" +
+ " xf=CategoricalTransform{col=ns}" +
+ " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" +
+ " xf=Concat{col=Features:FeaturesText,logged_in,ns}" +
+ " tr=OVA{p=AveragedPerceptron{iter=10}}";
+
+ using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
{
- Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
+ Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
}
}
[Benchmark]
public void CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass()
{
- string cmd = @"CV k=5 data=" + _dataPath_Wiki + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}} xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=LightGBMMulticlass{}";
- using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+ string cmd = @"CV k=5 data=" + _dataPath_Wiki +
+ " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" +
+ " xf=Convert{col=logged_in type=R4}" +
+ " xf=CategoricalTransform{col=ns}" +
+ " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" +
+ " xf=Concat{col=Features:FeaturesText,logged_in,ns} tr=LightGBMMulticlass{}";
+
+ using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
{
- Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
+ Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
}
}
@@ -78,29 +88,45 @@ public void Test_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron()
// This benchmark is profiling bulk scoring speed and not training speed.
string modelpath = Path.Combine(Directory.GetCurrentDirectory(), @"WikiModel.fold000.zip");
string cmd = @"Test data=" + _dataPath_Wiki + " in=" + modelpath;
- using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+ using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
{
- Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
+ Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
}
}
[Benchmark]
public void CV_Multiclass_WikiDetox_WordEmbeddings_OVAAveragedPerceptron()
{
- string cmd = @"CV tr=OVA{p=AveragedPerceptron{iter=10}} k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor=NGramExtractorTransform{ngram=2}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesText,FeaturesWordEmbedding,logged_in,ns}";
- using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+ string cmd = @"CV k=5 data=" + _dataPath_Wiki +
+ " tr=OVA{p=AveragedPerceptron{iter=10}}" +
+ " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" +
+ " xf=Convert{col=logged_in type=R4}" +
+ " xf=CategoricalTransform{col=ns}" +
+ " xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor=NGramExtractorTransform{ngram=2}}" +
+ " xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D}" +
+ " xf=Concat{col=Features:FeaturesText,FeaturesWordEmbedding,logged_in,ns}";
+
+ using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
{
- Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
+ Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
}
}
[Benchmark]
public void CV_Multiclass_WikiDetox_WordEmbeddings_SDCAMC()
{
- string cmd = @"CV tr=SDCAMC k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor={} charExtractor={}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesWordEmbedding,logged_in,ns}";
- using (var tlc = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
+ string cmd = @"CV k=5 data=" + _dataPath_Wiki +
+ " tr=SDCAMC" +
+ " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" +
+ " xf=Convert{col=logged_in type=R4}" +
+ " xf=CategoricalTransform{col=ns}" +
+ " xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor={} charExtractor={}}" +
+ " xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D}" +
+ " xf=Concat{col=Features:FeaturesWordEmbedding,logged_in,ns}";
+
+ using (var environment = new ConsoleEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
{
- Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
+ Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false);
}
}
}
diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs
index 888a06afce..b9d0cad9a5 100644
--- a/test/Microsoft.ML.TestFramework/Datasets.cs
+++ b/test/Microsoft.ML.TestFramework/Datasets.cs
@@ -162,6 +162,14 @@ public static class TestDatasets
testFilename = "external/WikiDetoxAnnotated160kRows.tsv"
};
+ public static TestDataset MSLRWeb = new TestDataset
+ {
+ name = "MSLRWeb",
+ trainFilename = "external/MSLRWeb10KTrain720kRows.tsv",
+ validFilename = "external/MSLRWeb10KValidate240kRows.tsv",
+ testFilename = "external/MSLRWeb10KTest240kRows.tsv"
+ };
+
public static TestDataset Sentiment = new TestDataset
{
name = "sentiment",
diff --git a/test/data/README.md b/test/data/README.md
index ea9133e33e..165c928ba7 100644
--- a/test/data/README.md
+++ b/test/data/README.md
@@ -46,6 +46,25 @@ Redistributing the dataset "taxi-fare-test.csv", "taxi-fare-train.csv" with attr
>
> The dataset is provided under terms provided by City of New York: https://opendata.cityofnewyork.us/overview/#termsofuse.
+### MSLR-WEB10K, MSLR-WEB30K
+
+These datasets are originally from [Introducing LETOR 4.0 Datasets](https://arxiv.org/abs/1306.2597).
+They are provided under the CC BY 4.0 license.
+```
+@article{DBLP:journals/corr/QinL13,
+ author = {Tao Qin and
+ Tie{-}Yan Liu},
+ title = {Introducing {LETOR} 4.0 Datasets},
+ journal = {CoRR},
+ volume = {abs/1306.2597},
+ year = {2013},
+ url = {https://arxiv.org/abs/1306.2597},
+ timestamp = {Mon, 01 Jul 2013 20:31:25 +0200},
+ biburl = {https://dblp.uni-trier.de/rec/bib/journals/corr/QinL13},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
+
# Images
### Located in `images` folder