From 9b11cb787117c6b84814cd1eeba01d9dfe2393db Mon Sep 17 00:00:00 2001 From: emeikleham Date: Mon, 13 Nov 2023 21:23:54 -0500 Subject: [PATCH 1/3] Add sample standard deviation flag --- .../Transforms/NormalizeColumnDbl.cs | 50 ++++++++++++++----- .../Transforms/NormalizeColumnSng.cs | 50 ++++++++++++++----- .../Transforms/Normalizer.cs | 9 +++- 3 files changed, 81 insertions(+), 28 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs index 26b6b35c05..172d66155f 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs @@ -437,15 +437,25 @@ public Double[] Mean get { return _mean; } } - public Double[] StdDev + public Double[] StdDevPopulation { get { return _m2.Select((m2, i) => Math.Sqrt(m2 / _cnz[i])).ToArray(); } } + public Double[] StdDevSample + { + get { return _m2.Select((m2, i) => Math.Sqrt(m2 / Math.Max(0, _cnz[i] - 1))).ToArray(); } + } + public Double[] MeanSquareError { get { return _m2.Select((m2, i) => m2 / _cnz[i]).ToArray(); } } + public Double[] SampleVariance + { + get { return _m2.Select((m2, i) => m2 / Math.Max(0, _cnz[i] - 1)).ToArray(); } + } + public Double[] M2 { @@ -1637,15 +1647,17 @@ public sealed class MeanVarOneColumnFunctionBuilder : OneColumnFunctionBuilderBa private readonly bool _useLog; private readonly bool _useCdf; private readonly bool _fix; + private readonly bool _useSampleVariance; private readonly MeanVarDblAggregator _aggregator; private VBuffer _buffer; - private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter getSrc, bool useLog, bool useCdf) + private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter getSrc, bool useLog, bool useCdf, bool useSampleVariance) : base(host, lim, getSrc) { _useLog = useLog; _useCdf = useCdf; _fix = fix; + _useSampleVariance = useSampleVariance; _aggregator = new MeanVarDblAggregator(1, useLog); _buffer = new VBuffer(1, new TFloat[1]); } @@ -1654,7 +1666,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceCol ValueGetter getter) { host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); - return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf); + return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance); } public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, DataViewType srcType, @@ -1662,7 +1674,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance { var lim = column.MaximumExampleCount; host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); - return new MeanVarOneColumnFunctionBuilder(host, lim, false, getter, true, column.UseCdf); + return new MeanVarOneColumnFunctionBuilder(host, lim, false, getter, true, column.UseCdf, column.UseSampleVariance); } protected override bool ProcessValue(in TFloat origVal) @@ -1689,10 +1701,13 @@ private IColumnFunction CreateAffineColumnFunction() return AffineColumnFunction.Create(Host, (TFloat)0, (TFloat)0); TFloat scale; TFloat offset; + var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0]; + var variance = _useSampleVariance ? _aggregator.SampleVariance[0] : _aggregator.MeanSquareError[0]; + if (_fix) - MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], _aggregator.MeanSquareError[0], out scale, out offset); + MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], variance, out scale, out offset); else - MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], _aggregator.StdDev[0], out scale, out offset); + MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], stdDev, out scale, out offset); return AffineColumnFunction.Create(Host, scale, offset); } @@ -1703,7 +1718,9 @@ private IColumnFunction CreateCdfColumnFunction() if (_aggregator.M2[0] == 0 || _aggregator.Counts[0] == 0) return CdfColumnFunction.Create(Host, (TFloat)0, (TFloat)0, _useLog); - return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)_aggregator.StdDev[0], _useLog); + var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0]; + + return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)stdDev, _useLog); } } @@ -1712,16 +1729,18 @@ public sealed class MeanVarVecColumnFunctionBuilder : VecColumnFunctionBuilderBa private readonly bool _fix; private readonly bool _useLog; private readonly bool _useCdf; + private readonly bool _useSampleVariance; private readonly MeanVarDblAggregator _aggregator; private MeanVarVecColumnFunctionBuilder(IHost host, int cv, long lim, bool fix, - ValueGetter> getSrc, bool useLog, bool useCdf) + ValueGetter> getSrc, bool useLog, bool useCdf, bool useSampleVariance) : base(host, lim, getSrc) { _aggregator = new MeanVarDblAggregator(cv, useLog); _fix = fix; _useLog = useLog; _useCdf = useCdf; + _useSampleVariance = useSampleVariance; } public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType, @@ -1729,7 +1748,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceCol { host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); var cv = srcType.Size; - return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf); + return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance); } public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType, @@ -1738,7 +1757,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance var lim = column.MaximumExampleCount; host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); var cv = srcType.Size; - return new MeanVarVecColumnFunctionBuilder(host, cv, lim, false, getter, true, column.UseCdf); + return new MeanVarVecColumnFunctionBuilder(host, cv, lim, false, getter, true, column.UseCdf, column.UseSampleVariance); } protected override bool ProcessValue(in VBuffer buffer) @@ -1776,10 +1795,14 @@ private IColumnFunction CreateAffineColumnFunction() scale[i] = offset[i] = 0; continue; } + + var stdDev = _useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i]; + var variance = _useSampleVariance ? _aggregator.SampleVariance[i] : _aggregator.MeanSquareError[i]; + if (_fix) - MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], _aggregator.MeanSquareError[i], out scale[i], out offset[i]); + MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], variance, out scale[i], out offset[i]); else - MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], _aggregator.StdDev[i], out scale[i], out offset[i]); + MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], stdDev, out scale[i], out offset[i]); if (offset[i] != 0 && nz.Count < lim) nz.Add(i); } @@ -1819,7 +1842,8 @@ private IColumnFunction CreateCdfColumnFunction() continue; } mean[i] = (TFloat)_aggregator.Mean[i]; - stddev[i] = (TFloat)_aggregator.StdDev[i]; + stddev[i] = (TFloat)(_useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i]); + } return CdfColumnFunction.Create(Host, mean, stddev, _useLog); diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs index 861cfd7368..7296db0d98 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs @@ -438,15 +438,25 @@ public Double[] Mean get { return _mean; } } - public Double[] StdDev + public Double[] StdDevPopulation { get { return _m2.Select((m2, i) => Math.Sqrt(m2 / _cnz[i])).ToArray(); } } + public Double[] StdDevSample + { + get { return _m2.Select((m2, i) => Math.Sqrt(m2 / Math.Max(0, _cnz[i] - 1))).ToArray(); } + } + public Double[] MeanSquareError { get { return _m2.Select((m2, i) => m2 / _cnz[i]).ToArray(); } } + public Double[] SampleVariance + { + get { return _m2.Select((m2, i) => m2 / Math.Max(0, _cnz[i] - 1)).ToArray(); } + } + public Double[] M2 { @@ -1800,15 +1810,17 @@ public sealed class MeanVarOneColumnFunctionBuilder : OneColumnFunctionBuilderBa private readonly bool _useLog; private readonly bool _useCdf; private readonly bool _fix; + private readonly bool _useSampleVariance; private readonly MeanVarSngAggregator _aggregator; private VBuffer _buffer; - private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter getSrc, bool useLog, bool useCdf) + private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter getSrc, bool useLog, bool useCdf, bool useSampleVariance) : base(host, lim, getSrc) { _useLog = useLog; _useCdf = useCdf; _fix = fix; + _useSampleVariance = useSampleVariance; _aggregator = new MeanVarSngAggregator(1, useLog); _buffer = new VBuffer(1, new TFloat[1]); } @@ -1817,7 +1829,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceCol ValueGetter getter) { host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); - return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf); + return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance); } public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, DataViewType srcType, @@ -1825,7 +1837,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance { var lim = column.MaximumExampleCount; host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); - return new MeanVarOneColumnFunctionBuilder(host, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf); + return new MeanVarOneColumnFunctionBuilder(host, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf, column.UseSampleVariance); } protected override bool ProcessValue(in TFloat origVal) @@ -1852,10 +1864,13 @@ private IColumnFunction CreateAffineColumnFunction() return AffineColumnFunction.Create(Host, (TFloat)0, (TFloat)0); TFloat scale; TFloat offset; + var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0]; + var variance = _useSampleVariance ? _aggregator.SampleVariance[0] : _aggregator.MeanSquareError[0]; + if (_fix) - MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], _aggregator.MeanSquareError[0], out scale, out offset); + MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], variance, out scale, out offset); else - MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], _aggregator.StdDev[0], out scale, out offset); + MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], stdDev, out scale, out offset); return AffineColumnFunction.Create(Host, scale, offset); } @@ -1866,7 +1881,9 @@ private IColumnFunction CreateCdfColumnFunction() if (_aggregator.M2[0] == 0 || _aggregator.Counts[0] == 0) return CdfColumnFunction.Create(Host, (TFloat)0, (TFloat)0, _useLog); - return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)_aggregator.StdDev[0], _useLog); + var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0]; + + return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)stdDev, _useLog); } } @@ -1875,16 +1892,18 @@ public sealed class MeanVarVecColumnFunctionBuilder : VecColumnFunctionBuilderBa private readonly bool _fix; private readonly bool _useLog; private readonly bool _useCdf; + private readonly bool _useSampleVariance; private readonly MeanVarSngAggregator _aggregator; private MeanVarVecColumnFunctionBuilder(IHost host, int cv, long lim, bool fix, - ValueGetter> getSrc, bool useLog, bool useCdf) + ValueGetter> getSrc, bool useLog, bool useCdf, bool useSampleVariance) : base(host, lim, getSrc) { _aggregator = new MeanVarSngAggregator(cv, useLog); _fix = fix; _useLog = useLog; _useCdf = useCdf; + _useSampleVariance = useSampleVariance; } public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType, @@ -1892,7 +1911,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceCol { host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); var cv = srcType.Size; - return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf); + return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance); } public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType, @@ -1901,7 +1920,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance var lim = column.MaximumExampleCount; host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1"); var cv = srcType.Size; - return new MeanVarVecColumnFunctionBuilder(host, cv, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf); + return new MeanVarVecColumnFunctionBuilder(host, cv, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf, column.UseSampleVariance); } protected override bool ProcessValue(in VBuffer buffer) @@ -1939,10 +1958,14 @@ private IColumnFunction CreateAffineColumnFunction() scale[i] = offset[i] = 0; continue; } + + var stdDev = _useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i]; + var variance = _useSampleVariance ? _aggregator.SampleVariance[i] : _aggregator.MeanSquareError[i]; + if (_fix) - MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], _aggregator.MeanSquareError[i], out scale[i], out offset[i]); + MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], variance, out scale[i], out offset[i]); else - MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], _aggregator.StdDev[i], out scale[i], out offset[i]); + MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], stdDev, out scale[i], out offset[i]); if (offset[i] != 0 && nz.Count < lim) nz.Add(i); } @@ -1982,7 +2005,8 @@ private IColumnFunction CreateCdfColumnFunction() continue; } mean[i] = (TFloat)_aggregator.Mean[i]; - stddev[i] = (TFloat)_aggregator.StdDev[i]; + stddev[i] = (TFloat)(_useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i]); + } return CdfColumnFunction.Create(Host, mean, stddev, _useLog); diff --git a/src/Microsoft.ML.Data/Transforms/Normalizer.cs b/src/Microsoft.ML.Data/Transforms/Normalizer.cs index 325fb6519c..731e3d3ba8 100644 --- a/src/Microsoft.ML.Data/Transforms/Normalizer.cs +++ b/src/Microsoft.ML.Data/Transforms/Normalizer.cs @@ -91,6 +91,7 @@ internal static class Defaults public const bool CenterData = true; public const uint QuantileMin = 25; public const uint QuantileMax = 75; + public const bool UseSampleVariance = false; } [BestFriend] @@ -191,12 +192,14 @@ internal override IColumnFunctionBuilder MakeBuilder(IHost host, int srcIndex, D internal sealed class MeanVarianceColumnOptions : ControlZeroColumnOptionsBase { public readonly bool UseCdf; + public readonly bool UseSampleVariance; public MeanVarianceColumnOptions(string outputColumnName, string inputColumnName = null, - long maximumExampleCount = Defaults.MaximumExampleCount, bool fixZero = Defaults.EnsureZeroUntouched, bool useCdf = Defaults.MeanVarCdf) + long maximumExampleCount = Defaults.MaximumExampleCount, bool fixZero = Defaults.EnsureZeroUntouched, bool useCdf = Defaults.MeanVarCdf, bool useSampleVariance = Defaults.UseSampleVariance) : base(outputColumnName, inputColumnName ?? outputColumnName, maximumExampleCount, fixZero) { UseCdf = useCdf; + UseSampleVariance = useSampleVariance; } internal override IColumnFunctionBuilder MakeBuilder(IHost host, int srcIndex, DataViewType srcType, DataViewRowCursor cursor) @@ -207,12 +210,14 @@ internal override IColumnFunctionBuilder MakeBuilder(IHost host, int srcIndex, D internal sealed class LogMeanVarianceColumnOptions : ControlZeroColumnOptionsBase { public readonly bool UseCdf; + public readonly bool UseSampleVariance; public LogMeanVarianceColumnOptions(string outputColumnName, string inputColumnName = null, - long maximumExampleCount = Defaults.MaximumExampleCount, bool useCdf = Defaults.LogMeanVarCdf, bool fixZero = Defaults.EnsureZeroUntouched) + long maximumExampleCount = Defaults.MaximumExampleCount, bool useCdf = Defaults.LogMeanVarCdf, bool fixZero = Defaults.EnsureZeroUntouched, bool useSampleVariance = Defaults.UseSampleVariance) : base(outputColumnName, inputColumnName ?? outputColumnName, maximumExampleCount, fixZero) { UseCdf = useCdf; + UseSampleVariance = useSampleVariance; } internal override IColumnFunctionBuilder MakeBuilder(IHost host, int srcIndex, DataViewType srcType, DataViewRowCursor cursor) From 07b273f34fe35a8b11959635562ec218e7ff7a36 Mon Sep 17 00:00:00 2001 From: Michael Sharp <51342856+michaelgsharp@users.noreply.github.com> Date: Thu, 7 Dec 2023 13:18:42 -0700 Subject: [PATCH 2/3] Update NormalizeColumnDbl.cs --- src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs index 172d66155f..c19b23b92a 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs @@ -451,12 +451,12 @@ public Double[] MeanSquareError { get { return _m2.Select((m2, i) => m2 / _cnz[i]).ToArray(); } } + public Double[] SampleVariance { get { return _m2.Select((m2, i) => m2 / Math.Max(0, _cnz[i] - 1)).ToArray(); } } - public Double[] M2 { get { return _m2; } From 006965daf3dd4066de85b946f1684bbf39f6df32 Mon Sep 17 00:00:00 2001 From: Michael Sharp <51342856+michaelgsharp@users.noreply.github.com> Date: Thu, 7 Dec 2023 13:19:28 -0700 Subject: [PATCH 3/3] Update NormalizeColumnSng.cs --- src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs index 7296db0d98..69a0d1b88d 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs @@ -452,12 +452,12 @@ public Double[] MeanSquareError { get { return _m2.Select((m2, i) => m2 / _cnz[i]).ToArray(); } } + public Double[] SampleVariance { get { return _m2.Select((m2, i) => m2 / Math.Max(0, _cnz[i] - 1)).ToArray(); } } - public Double[] M2 { get { return _m2; }