Skip to content

Commit 17c9155

Browse files
authored
Reformatting ModelOperations and DataOperations samples to width 85 (#3923)
* Added a comment. * reformatted ModelOperations samples to width 85 * Fixed commented-on parts of MachineOperations & reformatted DataOperations * Update docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/Cache.cs Co-Authored-By: Justin Ormont <[email protected]> * Update Program.cs Got rid of test comment * Update DataViewEnumerable.tt Fixed extra whitespace * Update DataViewEnumerable.cs Fixed extra whitespace * Update DataViewEnumerable.tt Fixed extra whitespace * Update FilterRowsByColumn.tt Fixed whitespace * Update ShuffleRows.tt Fixed whitespace * Update TakeRows.tt Fixed whitespace * Update TakeRows.cs Fixed whitespace * Update SkipRows.cs Fixed whitespace * Update SkipRows.tt Fixed whitespace * Update ShuffleRows.cs Fixed whitespace * Update ShuffleRows.cs Fixed whitespace * Update ShuffleRows.cs * Update ShuffleRows.tt * Update SkipRows.tt * Update SkipRows.cs * Update FilterRowsByColumn.cs Fixed whitespace * Update FilterRowsByColumn.cs Fixed whitespace * Update DataViewEnumerable.cs * Update FilterRowsByColumn.cs Fixed extra carriage returns * Update FilterRowsByColumn.tt Fixed extra carriage returns * Update FilterRowsByColumn.cs * Update FilterRowsByColumn.tt
1 parent 7aa6513 commit 17c9155

22 files changed

+504
-224
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ public static class BootstrapSample
77
{
88
public static void Example()
99
{
10-
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
11-
// as a catalog of available operations and as the source of randomness.
10+
// Create a new context for ML.NET operations. It can be used for
11+
// exception tracking and logging, as a catalog of available operations
12+
// and as the source of randomness.
1213
var mlContext = new MLContext();
1314

1415
// Get a small dataset as an IEnumerable.
@@ -23,20 +24,27 @@ public static void Example()
2324

2425
var data = mlContext.Data.LoadFromEnumerable(rawData);
2526

26-
// Now take a bootstrap sample of this dataset to create a new dataset. The bootstrap is a resampling technique that
27-
// creates a training set of the same size by picking with replacement from the original dataset. With the bootstrap,
28-
// we expect that the resampled dataset will have about 63% of the rows of the original dataset (i.e. 1-e^-1), with some
29-
// rows represented more than once.
30-
// BootstrapSample is a streaming implementation of the boostrap that enables sampling from a dataset too large to hold in memory.
31-
// To enable streaming, BootstrapSample approximates the bootstrap by sampling each row according to a Poisson(1) distribution.
32-
// Note that this streaming approximation treats each row independently, thus the resampled dataset is not guaranteed to be the
33-
// same length as the input dataset.
34-
// Let's take a look at the behavior of the BootstrapSample by examining a few draws:
27+
// Now take a bootstrap sample of this dataset to create a new dataset.
28+
// The bootstrap is a resampling technique that creates a training set
29+
// of the same size by picking with replacement from the original
30+
// dataset. With the bootstrap, we expect that the resampled dataset
31+
// will have about 63% of the rows of the original dataset
32+
// (i.e. 1-e^-1), with some rows represented more than once.
33+
// BootstrapSample is a streaming implementation of the bootstrap that
34+
// enables sampling from a dataset too large to hold in memory. To
35+
// enable streaming, BootstrapSample approximates the bootstrap by
36+
// sampling each row according to a Poisson(1) distribution. Note that
37+
// this streaming approximation treats each row independently, thus the
38+
// resampled dataset is not guaranteed to be the same length as the
39+
// input dataset. Let's take a look at the behavior of the
40+
// BootstrapSample by examining a few draws:
3541
for (int i = 0; i < 3; i++)
3642
{
3743
var resample = mlContext.Data.BootstrapSample(data, seed: i);
3844

39-
var enumerable = mlContext.Data.CreateEnumerable<DataPoint>(resample, reuseRowObject: false);
45+
var enumerable = mlContext.Data
46+
.CreateEnumerable<DataPoint>(resample, reuseRowObject: false);
47+
4048
Console.WriteLine($"Label\tFeature");
4149
foreach (var row in enumerable)
4250
{

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/Cache.cs

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,56 +8,78 @@ public static class Cache
88
{
99
public static void Example()
1010
{
11-
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
12-
// as a catalog of available operations and as the source of randomness.
11+
// Create a new context for ML.NET operations. It can be used for
12+
// exception tracking and logging, as a catalog of available operations
13+
// and as the source of randomness.
1314
var mlContext = new MLContext();
1415

1516
var data = DatasetUtils.LoadHousingRegressionDataset(mlContext);
1617

1718
// Time how long it takes to page through the records if we don't cache.
18-
(int lines, double columnAverage, double elapsedSeconds) = TimeToScanIDataView(mlContext, data);
19-
Console.WriteLine($"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds} seconds.");
19+
(int lines, double columnAverage, double elapsedSeconds) =
20+
TimeToScanIDataView(mlContext, data);
21+
22+
Console.WriteLine($"Lines={lines}," +
23+
$"averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds}" +
24+
$"seconds.");
2025
// Expected output (time is approximate):
2126
// Lines=506, averageOfColumn0=564.17 and took 0.314 seconds.
2227

2328
// Now create a cached view of the data.
2429
var cachedData = mlContext.Data.Cache(data);
2530

26-
// Time how long it takes to page through the records the first time they're accessed after a cache is applied.
27-
// This iteration will be longer than subsequent calls, as the dataset is being accessed and stored for later.
28-
// Note that this operation may be relatively quick, as the system may have cached the file.
29-
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext, cachedData);
30-
Console.WriteLine($"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds} seconds.");
31+
// Time how long it takes to page through the records the first time
32+
// they're accessed after a cache is applied. This iteration will be
33+
// longer than subsequent calls, as the dataset is being accessed and
34+
// stored for later. Note that this operation may be relatively quick,
35+
// as the system may have cached the file.
36+
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext,
37+
cachedData);
38+
39+
Console.WriteLine($"Lines={lines}," +
40+
$"averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds}" +
41+
$"seconds.");
3142
// Expected output (time is approximate):
3243
// Lines=506, averageOfColumn0=564.17 and took 0.056 seconds.
3344

34-
// Time how long it takes to page through the records now that the data is cached. After the first iteration that caches the IDataView,
35-
// future iterations, like this one, are faster because they are pulling from data cached in memory.
36-
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext, cachedData);
37-
Console.WriteLine($"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds} seconds.");
45+
// Time how long it takes to page through the records now that the data
46+
// is cached. After the first iteration that caches the IDataView,
47+
// future iterations, like this one, are faster because they are pulling
48+
// from data cached in memory.
49+
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext,
50+
cachedData);
51+
52+
Console.WriteLine(
53+
$"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took " +
54+
$"{elapsedSeconds} seconds.");
3855
// Expected output (time is approximate):
3956
// Lines=506, averageOfColumn0=564.17 and took 0.006 seconds.
4057
}
4158

42-
private static (int lines, double columnAverage, double elapsedSeconds) TimeToScanIDataView(MLContext mlContext, IDataView data)
59+
private static (int lines, double columnAverage, double elapsedSeconds)
60+
TimeToScanIDataView(MLContext mlContext, IDataView data)
4361
{
4462
int lines = 0;
4563
double columnAverage = 0.0;
46-
var enumerable = mlContext.Data.CreateEnumerable<HousingRegression>(data, reuseRowObject: true);
64+
var enumerable = mlContext.Data
65+
.CreateEnumerable<HousingRegression>(data, reuseRowObject: true);
66+
4767
var watch = System.Diagnostics.Stopwatch.StartNew();
4868
foreach (var row in enumerable)
4969
{
5070
lines++;
51-
columnAverage += row.MedianHomeValue + row.CrimesPerCapita + row.PercentResidental + row.PercentNonRetail + row.CharlesRiver
52-
+ row.NitricOxides + row.RoomsPerDwelling + row.PercentPre40s + row.EmploymentDistance
53-
+ row.HighwayDistance + row.TaxRate + row.TeacherRatio;
71+
columnAverage += row.MedianHomeValue + row.CrimesPerCapita +
72+
row.PercentResidental + row.PercentNonRetail + row.CharlesRiver
73+
+ row.NitricOxides + row.RoomsPerDwelling + row.PercentPre40s +
74+
row.EmploymentDistance + row.HighwayDistance + row.TaxRate +
75+
row.TeacherRatio;
5476
}
5577
watch.Stop();
5678
columnAverage /= lines;
5779
var elapsed = watch.Elapsed;
5880

5981
return (lines, columnAverage, elapsed.Seconds);
60-
}
82+
}
6183

6284
/// <summary>
6385
/// A class to hold the raw housing regression rows.

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/CrossValidationSplit.cs

Lines changed: 67 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,28 @@ public static void Example()
1717
// Generate some data points.
1818
var examples = GenerateRandomDataPoints(10);
1919

20-
// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
20+
// Convert the examples list to an IDataView object, which is consumable
21+
// by ML.NET API.
2122
var dataview = mlContext.Data.LoadFromEnumerable(examples);
2223

23-
// Cross validation splits your data randomly into set of "folds", and creates groups of Train and Test sets,
24-
// where for each group, one fold is the Test and the rest of the folds the Train.
25-
// So below, we specify Group column as the column containing the sampling keys.
26-
// If we pass that column to cross validation it would be used to break data into certain chunks.
27-
var folds = mlContext.Data.CrossValidationSplit(dataview, numberOfFolds: 3, samplingKeyColumnName: "Group");
28-
var trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TrainSet, reuseRowObject: false);
29-
var testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TestSet, reuseRowObject: false);
24+
// Cross validation splits your data randomly into a set of "folds", and
25+
// creates groups of Train and Test sets, where for each group, one fold
26+
// is the Test and the rest of the folds the Train. So below, we specify
27+
// Group column as the column containing the sampling keys. If we pass
28+
// that column to cross validation it would be used to break data into
29+
// certain chunks.
30+
var folds = mlContext.Data
31+
.CrossValidationSplit(dataview, numberOfFolds:3,
32+
samplingKeyColumnName: "Group");
33+
34+
var trainSet = mlContext.Data
35+
.CreateEnumerable<DataPoint>(folds[0].TrainSet,
36+
reuseRowObject: false);
37+
38+
var testSet = mlContext.Data
39+
.CreateEnumerable<DataPoint>(folds[0].TestSet,
40+
reuseRowObject: false);
41+
3042
PrintPreviewRows(trainSet, testSet);
3143

3244
// The data in the Train split.
@@ -43,8 +55,14 @@ public static void Example()
4355
// [Group, 0], [Features, 0.9060271]
4456
// [Group, 0], [Features, 0.2737045]
4557

46-
trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TrainSet, reuseRowObject: false);
47-
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TestSet, reuseRowObject: false);
58+
trainSet = mlContext.Data
59+
.CreateEnumerable<DataPoint>(folds[1].TrainSet,
60+
reuseRowObject: false);
61+
62+
testSet = mlContext.Data
63+
.CreateEnumerable<DataPoint>(folds[1].TestSet,
64+
reuseRowObject: false);
65+
4866
PrintPreviewRows(trainSet, testSet);
4967
// The data in the Train split.
5068
// [Group, 0], [Features, 0.7262433]
@@ -60,8 +78,14 @@ public static void Example()
6078
// [Group, 1], [Features, 0.2060332]
6179
// [Group, 1], [Features, 0.4421779]
6280

63-
trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TrainSet, reuseRowObject: false);
64-
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TestSet, reuseRowObject: false);
81+
trainSet = mlContext.Data
82+
.CreateEnumerable<DataPoint>(folds[2].TrainSet,
83+
reuseRowObject: false);
84+
85+
testSet = mlContext.Data
86+
.CreateEnumerable<DataPoint>(folds[2].TestSet,
87+
reuseRowObject: false);
88+
6589
PrintPreviewRows(trainSet, testSet);
6690
// The data in the Train split.
6791
// [Group, 0], [Features, 0.7262433]
@@ -79,8 +103,14 @@ public static void Example()
79103

80104
// Example of a split without specifying a sampling key column.
81105
folds = mlContext.Data.CrossValidationSplit(dataview, numberOfFolds: 3);
82-
trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TrainSet, reuseRowObject: false);
83-
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TestSet, reuseRowObject: false);
106+
trainSet = mlContext.Data
107+
.CreateEnumerable<DataPoint>(folds[0].TrainSet,
108+
reuseRowObject: false);
109+
110+
testSet = mlContext.Data
111+
.CreateEnumerable<DataPoint>(folds[0].TestSet,
112+
reuseRowObject: false);
113+
84114
PrintPreviewRows(trainSet, testSet);
85115
// The data in the Train split.
86116
// [Group, 0], [Features, 0.7262433]
@@ -96,8 +126,14 @@ public static void Example()
96126
// [Group, 2], [Features, 0.5588848]
97127
// [Group, 0], [Features, 0.9060271]
98128

99-
trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TrainSet, reuseRowObject: false);
100-
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TestSet, reuseRowObject: false);
129+
trainSet = mlContext.Data
130+
.CreateEnumerable<DataPoint>(folds[1].TrainSet,
131+
reuseRowObject: false);
132+
133+
testSet = mlContext.Data
134+
.CreateEnumerable<DataPoint>(folds[1].TestSet,
135+
reuseRowObject: false);
136+
101137
PrintPreviewRows(trainSet, testSet);
102138
// The data in the Train split.
103139
// [Group, 2], [Features, 0.7680227]
@@ -113,8 +149,13 @@ public static void Example()
113149
// [Group, 2], [Features, 0.9775497]
114150
// [Group, 0], [Features, 0.2737045]
115151

116-
trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TrainSet, reuseRowObject: false);
117-
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TestSet, reuseRowObject: false);
152+
trainSet = mlContext.Data
153+
.CreateEnumerable<DataPoint>(folds[2].TrainSet,
154+
reuseRowObject: false);
155+
156+
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TestSet,
157+
reuseRowObject: false);
158+
118159
PrintPreviewRows(trainSet, testSet);
119160
// The data in the Train split.
120161
// [Group, 0], [Features, 0.7262433]
@@ -131,7 +172,9 @@ public static void Example()
131172
// [Group, 1], [Features, 0.4421779]
132173
}
133174

134-
private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
175+
private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count,
176+
int seed = 0)
177+
135178
{
136179
var random = new Random(seed);
137180
for (int i = 0; i < count; i++)
@@ -146,7 +189,8 @@ private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int se
146189
}
147190
}
148191

149-
// Example with features and group column. A data set is a collection of such examples.
192+
// Example with features and group column. A data set is a collection of
193+
// such examples.
150194
private class DataPoint
151195
{
152196
public float Group { get; set; }
@@ -155,7 +199,9 @@ private class DataPoint
155199
}
156200

157201
// print helper
158-
private static void PrintPreviewRows(IEnumerable<DataPoint> trainSet, IEnumerable<DataPoint> testSet)
202+
private static void PrintPreviewRows(IEnumerable<DataPoint> trainSet,
203+
IEnumerable<DataPoint> testSet)
204+
159205
{
160206

161207
Console.WriteLine($"The data in the Train split.");

0 commit comments

Comments
 (0)