Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 37 additions & 13 deletions src/Microsoft.ML.Data/Transforms/NormalizeColumnDbl.cs
Original file line number Diff line number Diff line change
Expand Up @@ -437,16 +437,26 @@ public Double[] Mean
get { return _mean; }
}

public Double[] StdDev
/// <summary>
/// Per-index standard deviation using the full count as divisor:
/// element i is <c>Math.Sqrt(_m2[i] / _cnz[i])</c>.
/// A new array is allocated on every access. A zero count is not guarded
/// against here, so the corresponding element is a non-finite value.
/// </summary>
public Double[] StdDevPopulation
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
            result[i] = Math.Sqrt(_m2[i] / _cnz[i]);
        return result;
    }
}

/// <summary>
/// Per-index standard deviation using (count - 1) as divisor:
/// element i is <c>Math.Sqrt(_m2[i] / Math.Max(0, _cnz[i] - 1))</c>.
/// A new array is allocated on every access. The divisor is clamped at zero,
/// not one, so a count of 0 or 1 divides by zero and yields a non-finite value.
/// </summary>
public Double[] StdDevSample
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
        {
            // Clamp keeps the divisor from going negative; it may still be zero.
            var divisor = Math.Max(0, _cnz[i] - 1);
            result[i] = Math.Sqrt(_m2[i] / divisor);
        }
        return result;
    }
}

/// <summary>
/// Per-index value of <c>_m2[i] / _cnz[i]</c> (the quantity under the square
/// root in the population standard deviation).
/// A new array is allocated on every access; a zero count is not guarded against.
/// </summary>
public Double[] MeanSquareError
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
            result[i] = _m2[i] / _cnz[i];
        return result;
    }
}

/// <summary>
/// Per-index value of <c>_m2[i] / Math.Max(0, _cnz[i] - 1)</c> (the quantity
/// under the square root in the sample standard deviation).
/// A new array is allocated on every access. The divisor is clamped at zero,
/// not one, so a count of 0 or 1 divides by zero and yields a non-finite value.
/// </summary>
public Double[] SampleVariance
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
        {
            // Clamp keeps the divisor from going negative; it may still be zero.
            var divisor = Math.Max(0, _cnz[i] - 1);
            result[i] = _m2[i] / divisor;
        }
        return result;
    }
}

public Double[] M2
{
get { return _m2; }
Expand Down Expand Up @@ -1637,15 +1647,17 @@ public sealed class MeanVarOneColumnFunctionBuilder : OneColumnFunctionBuilderBa
private readonly bool _useLog;
private readonly bool _useCdf;
private readonly bool _fix;
private readonly bool _useSampleVariance;
private readonly MeanVarDblAggregator _aggregator;
private VBuffer<TFloat> _buffer;

private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter<TFloat> getSrc, bool useLog, bool useCdf)
/// <summary>
/// Private constructor used by the static Create overloads. Forwards
/// <paramref name="host"/>, <paramref name="lim"/> and <paramref name="getSrc"/>
/// to the base builder and records the option flags for later use.
/// </summary>
/// <param name="host">Host passed through to the base class.</param>
/// <param name="lim">Limit passed through to the base class (the Create overloads supply MaximumExampleCount).</param>
/// <param name="fix">Stored in <c>_fix</c>; selects the "fix zero" scale/offset computation.</param>
/// <param name="getSrc">Source value getter passed through to the base class.</param>
/// <param name="useLog">Stored in <c>_useLog</c> and also handed to the aggregator.</param>
/// <param name="useCdf">Stored in <c>_useCdf</c>.</param>
/// <param name="useSampleVariance">Stored in <c>_useSampleVariance</c>; selects sample instead of population statistics.</param>
private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter<TFloat> getSrc, bool useLog, bool useCdf, bool useSampleVariance)
    : base(host, lim, getSrc)
{
    // Working state: a length-1 buffer and an aggregator constructed with 1 as its first argument.
    _buffer = new VBuffer<TFloat>(1, new TFloat[1]);
    _aggregator = new MeanVarDblAggregator(1, useLog);

    // Option flags consulted when the column function is created.
    _fix = fix;
    _useCdf = useCdf;
    _useLog = useLog;
    _useSampleVariance = useSampleVariance;
}
Expand All @@ -1654,15 +1666,15 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceCol
ValueGetter<TFloat> getter)
{
host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf);
return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance);
}

public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, DataViewType srcType,
ValueGetter<TFloat> getter)
{
var lim = column.MaximumExampleCount;
host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
return new MeanVarOneColumnFunctionBuilder(host, lim, false, getter, true, column.UseCdf);
return new MeanVarOneColumnFunctionBuilder(host, lim, false, getter, true, column.UseCdf, column.UseSampleVariance);
}

protected override bool ProcessValue(in TFloat origVal)
Expand All @@ -1689,10 +1701,13 @@ private IColumnFunction CreateAffineColumnFunction()
return AffineColumnFunction.Create(Host, (TFloat)0, (TFloat)0);
TFloat scale;
TFloat offset;
var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0];
var variance = _useSampleVariance ? _aggregator.SampleVariance[0] : _aggregator.MeanSquareError[0];

if (_fix)
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], _aggregator.MeanSquareError[0], out scale, out offset);
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], variance, out scale, out offset);
else
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], _aggregator.StdDev[0], out scale, out offset);
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], stdDev, out scale, out offset);

return AffineColumnFunction.Create(Host, scale, offset);
}
Expand All @@ -1703,7 +1718,9 @@ private IColumnFunction CreateCdfColumnFunction()
if (_aggregator.M2[0] == 0 || _aggregator.Counts[0] == 0)
return CdfColumnFunction.Create(Host, (TFloat)0, (TFloat)0, _useLog);

return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)_aggregator.StdDev[0], _useLog);
var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0];

return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)stdDev, _useLog);
}
}

Expand All @@ -1712,24 +1729,26 @@ public sealed class MeanVarVecColumnFunctionBuilder : VecColumnFunctionBuilderBa
private readonly bool _fix;
private readonly bool _useLog;
private readonly bool _useCdf;
private readonly bool _useSampleVariance;
private readonly MeanVarDblAggregator _aggregator;

private MeanVarVecColumnFunctionBuilder(IHost host, int cv, long lim, bool fix,
ValueGetter<VBuffer<TFloat>> getSrc, bool useLog, bool useCdf)
ValueGetter<VBuffer<TFloat>> getSrc, bool useLog, bool useCdf, bool useSampleVariance)
: base(host, lim, getSrc)
{
_aggregator = new MeanVarDblAggregator(cv, useLog);
_fix = fix;
_useLog = useLog;
_useCdf = useCdf;
_useSampleVariance = useSampleVariance;
}

public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType,
ValueGetter<VBuffer<TFloat>> getter)
{
host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
var cv = srcType.Size;
return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf);
return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance);
}

public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType,
Expand All @@ -1738,7 +1757,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance
var lim = column.MaximumExampleCount;
host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
var cv = srcType.Size;
return new MeanVarVecColumnFunctionBuilder(host, cv, lim, false, getter, true, column.UseCdf);
return new MeanVarVecColumnFunctionBuilder(host, cv, lim, false, getter, true, column.UseCdf, column.UseSampleVariance);
}

protected override bool ProcessValue(in VBuffer<TFloat> buffer)
Expand Down Expand Up @@ -1776,10 +1795,14 @@ private IColumnFunction CreateAffineColumnFunction()
scale[i] = offset[i] = 0;
continue;
}

var stdDev = _useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i];
var variance = _useSampleVariance ? _aggregator.SampleVariance[i] : _aggregator.MeanSquareError[i];

if (_fix)
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], _aggregator.MeanSquareError[i], out scale[i], out offset[i]);
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], variance, out scale[i], out offset[i]);
else
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], _aggregator.StdDev[i], out scale[i], out offset[i]);
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], stdDev, out scale[i], out offset[i]);
if (offset[i] != 0 && nz.Count < lim)
nz.Add(i);
}
Expand Down Expand Up @@ -1819,7 +1842,8 @@ private IColumnFunction CreateCdfColumnFunction()
continue;
}
mean[i] = (TFloat)_aggregator.Mean[i];
stddev[i] = (TFloat)_aggregator.StdDev[i];
stddev[i] = (TFloat)(_useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i]);

}

return CdfColumnFunction.Create(Host, mean, stddev, _useLog);
Expand Down
50 changes: 37 additions & 13 deletions src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs
Original file line number Diff line number Diff line change
Expand Up @@ -438,16 +438,26 @@ public Double[] Mean
get { return _mean; }
}

public Double[] StdDev
/// <summary>
/// Per-index standard deviation using the full count as divisor:
/// element i is <c>Math.Sqrt(_m2[i] / _cnz[i])</c>.
/// A new array is allocated on every access. A zero count is not guarded
/// against here, so the corresponding element is a non-finite value.
/// </summary>
public Double[] StdDevPopulation
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
            result[i] = Math.Sqrt(_m2[i] / _cnz[i]);
        return result;
    }
}

/// <summary>
/// Per-index standard deviation using (count - 1) as divisor:
/// element i is <c>Math.Sqrt(_m2[i] / Math.Max(0, _cnz[i] - 1))</c>.
/// A new array is allocated on every access. The divisor is clamped at zero,
/// not one, so a count of 0 or 1 divides by zero and yields a non-finite value.
/// </summary>
public Double[] StdDevSample
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
        {
            // Clamp keeps the divisor from going negative; it may still be zero.
            var divisor = Math.Max(0, _cnz[i] - 1);
            result[i] = Math.Sqrt(_m2[i] / divisor);
        }
        return result;
    }
}

/// <summary>
/// Per-index value of <c>_m2[i] / _cnz[i]</c> (the quantity under the square
/// root in the population standard deviation).
/// A new array is allocated on every access; a zero count is not guarded against.
/// </summary>
public Double[] MeanSquareError
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
            result[i] = _m2[i] / _cnz[i];
        return result;
    }
}

/// <summary>
/// Per-index value of <c>_m2[i] / Math.Max(0, _cnz[i] - 1)</c> (the quantity
/// under the square root in the sample standard deviation).
/// A new array is allocated on every access. The divisor is clamped at zero,
/// not one, so a count of 0 or 1 divides by zero and yields a non-finite value.
/// </summary>
public Double[] SampleVariance
{
    get
    {
        var result = new Double[_m2.Length];
        for (int i = 0; i < result.Length; i++)
        {
            // Clamp keeps the divisor from going negative; it may still be zero.
            var divisor = Math.Max(0, _cnz[i] - 1);
            result[i] = _m2[i] / divisor;
        }
        return result;
    }
}

public Double[] M2
{
get { return _m2; }
Expand Down Expand Up @@ -1800,15 +1810,17 @@ public sealed class MeanVarOneColumnFunctionBuilder : OneColumnFunctionBuilderBa
private readonly bool _useLog;
private readonly bool _useCdf;
private readonly bool _fix;
private readonly bool _useSampleVariance;
private readonly MeanVarSngAggregator _aggregator;
private VBuffer<TFloat> _buffer;

private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter<TFloat> getSrc, bool useLog, bool useCdf)
/// <summary>
/// Private constructor used by the static Create overloads. Forwards
/// <paramref name="host"/>, <paramref name="lim"/> and <paramref name="getSrc"/>
/// to the base builder and records the option flags for later use.
/// </summary>
/// <param name="host">Host passed through to the base class.</param>
/// <param name="lim">Limit passed through to the base class (the Create overloads supply MaximumExampleCount).</param>
/// <param name="fix">Stored in <c>_fix</c>; selects the "fix zero" scale/offset computation.</param>
/// <param name="getSrc">Source value getter passed through to the base class.</param>
/// <param name="useLog">Stored in <c>_useLog</c> and also handed to the aggregator.</param>
/// <param name="useCdf">Stored in <c>_useCdf</c>.</param>
/// <param name="useSampleVariance">Stored in <c>_useSampleVariance</c>; selects sample instead of population statistics.</param>
private MeanVarOneColumnFunctionBuilder(IHost host, long lim, bool fix, ValueGetter<TFloat> getSrc, bool useLog, bool useCdf, bool useSampleVariance)
    : base(host, lim, getSrc)
{
    // Working state: a length-1 buffer and an aggregator constructed with 1 as its first argument.
    _buffer = new VBuffer<TFloat>(1, new TFloat[1]);
    _aggregator = new MeanVarSngAggregator(1, useLog);

    // Option flags consulted when the column function is created.
    _fix = fix;
    _useCdf = useCdf;
    _useLog = useLog;
    _useSampleVariance = useSampleVariance;
}
Expand All @@ -1817,15 +1829,15 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceCol
ValueGetter<TFloat> getter)
{
host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf);
return new MeanVarOneColumnFunctionBuilder(host, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance);
}

public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, DataViewType srcType,
ValueGetter<TFloat> getter)
{
var lim = column.MaximumExampleCount;
host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
return new MeanVarOneColumnFunctionBuilder(host, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf);
return new MeanVarOneColumnFunctionBuilder(host, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf, column.UseSampleVariance);
}

protected override bool ProcessValue(in TFloat origVal)
Expand All @@ -1852,10 +1864,13 @@ private IColumnFunction CreateAffineColumnFunction()
return AffineColumnFunction.Create(Host, (TFloat)0, (TFloat)0);
TFloat scale;
TFloat offset;
var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0];
var variance = _useSampleVariance ? _aggregator.SampleVariance[0] : _aggregator.MeanSquareError[0];

if (_fix)
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], _aggregator.MeanSquareError[0], out scale, out offset);
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[0], variance, out scale, out offset);
else
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], _aggregator.StdDev[0], out scale, out offset);
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[0], stdDev, out scale, out offset);

return AffineColumnFunction.Create(Host, scale, offset);
}
Expand All @@ -1866,7 +1881,9 @@ private IColumnFunction CreateCdfColumnFunction()
if (_aggregator.M2[0] == 0 || _aggregator.Counts[0] == 0)
return CdfColumnFunction.Create(Host, (TFloat)0, (TFloat)0, _useLog);

return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)_aggregator.StdDev[0], _useLog);
var stdDev = _useSampleVariance ? _aggregator.StdDevSample[0] : _aggregator.StdDevPopulation[0];

return CdfColumnFunction.Create(Host, (TFloat)_aggregator.Mean[0], (TFloat)stdDev, _useLog);
}
}

Expand All @@ -1875,24 +1892,26 @@ public sealed class MeanVarVecColumnFunctionBuilder : VecColumnFunctionBuilderBa
private readonly bool _fix;
private readonly bool _useLog;
private readonly bool _useCdf;
private readonly bool _useSampleVariance;
private readonly MeanVarSngAggregator _aggregator;

private MeanVarVecColumnFunctionBuilder(IHost host, int cv, long lim, bool fix,
ValueGetter<VBuffer<TFloat>> getSrc, bool useLog, bool useCdf)
ValueGetter<VBuffer<TFloat>> getSrc, bool useLog, bool useCdf, bool useSampleVariance)
: base(host, lim, getSrc)
{
_aggregator = new MeanVarSngAggregator(cv, useLog);
_fix = fix;
_useLog = useLog;
_useCdf = useCdf;
_useSampleVariance = useSampleVariance;
}

public static IColumnFunctionBuilder Create(NormalizingEstimator.MeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType,
ValueGetter<VBuffer<TFloat>> getter)
{
host.CheckUserArg(column.MaximumExampleCount > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
var cv = srcType.Size;
return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf);
return new MeanVarVecColumnFunctionBuilder(host, cv, column.MaximumExampleCount, column.EnsureZeroUntouched, getter, false, column.UseCdf, column.UseSampleVariance);
}

public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVarianceColumnOptions column, IHost host, VectorDataViewType srcType,
Expand All @@ -1901,7 +1920,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance
var lim = column.MaximumExampleCount;
host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
var cv = srcType.Size;
return new MeanVarVecColumnFunctionBuilder(host, cv, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf);
return new MeanVarVecColumnFunctionBuilder(host, cv, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf, column.UseSampleVariance);
}

protected override bool ProcessValue(in VBuffer<TFloat> buffer)
Expand Down Expand Up @@ -1939,10 +1958,14 @@ private IColumnFunction CreateAffineColumnFunction()
scale[i] = offset[i] = 0;
continue;
}

var stdDev = _useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i];
var variance = _useSampleVariance ? _aggregator.SampleVariance[i] : _aggregator.MeanSquareError[i];

if (_fix)
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], _aggregator.MeanSquareError[i], out scale[i], out offset[i]);
MeanVarUtils.ComputeScaleAndOffsetFixZero(_aggregator.Mean[i], variance, out scale[i], out offset[i]);
else
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], _aggregator.StdDev[i], out scale[i], out offset[i]);
MeanVarUtils.ComputeScaleAndOffset(_aggregator.Mean[i], stdDev, out scale[i], out offset[i]);
if (offset[i] != 0 && nz.Count < lim)
nz.Add(i);
}
Expand Down Expand Up @@ -1982,7 +2005,8 @@ private IColumnFunction CreateCdfColumnFunction()
continue;
}
mean[i] = (TFloat)_aggregator.Mean[i];
stddev[i] = (TFloat)_aggregator.StdDev[i];
stddev[i] = (TFloat)(_useSampleVariance ? _aggregator.StdDevSample[i] : _aggregator.StdDevPopulation[i]);

}

return CdfColumnFunction.Create(Host, mean, stddev, _useLog);
Expand Down
Loading