Skip to content

Commit 6fd563f

Browse files
Merge pull request #975 from ChengYen-Tang/histogram
Numpy histogram
2 parents 62482ad + 992002c commit 6fd563f

File tree

5 files changed

+534
-6
lines changed

5 files changed

+534
-6
lines changed

THIRD-PARTY-NOTICES.TXT

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,4 +557,38 @@ Apache License
557557

558558
See full copyright notice and license text in
559559

560-
src/cpu/jit_utils/jitprofiling/LICENSE.BSD
560+
src/cpu/jit_utils/jitprofiling/LICENSE.BSD
561+
562+
==================
563+
Numpy
564+
565+
Copyright (c) 2005-2023, NumPy Developers.
566+
All rights reserved.
567+
568+
Redistribution and use in source and binary forms, with or without
569+
modification, are permitted provided that the following conditions are
570+
met:
571+
572+
* Redistributions of source code must retain the above copyright
573+
notice, this list of conditions and the following disclaimer.
574+
575+
* Redistributions in binary form must reproduce the above
576+
copyright notice, this list of conditions and the following
577+
disclaimer in the documentation and/or other materials provided
578+
with the distribution.
579+
580+
* Neither the name of the NumPy Developers nor the names of any
581+
contributors may be used to endorse or promote products derived
582+
from this software without specific prior written permission.
583+
584+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
585+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
586+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
587+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
588+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
589+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
590+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
591+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
592+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
593+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
594+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
2+
3+
namespace TorchSharp
4+
{
5+
public enum HistogramBinSelector
6+
{
7+
Doane,
8+
Rice,
9+
Scott,
10+
Sqrt,
11+
Stone,
12+
Sturges
13+
}
14+
}

src/TorchSharp/Tensor/torch.ComparisonOps.cs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,21 @@ public static Tensor searchsorted(Tensor sorted_sequence, Scalar values, bool ou
276276
return new Tensor(res);
277277
}
278278

279+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L679
280+
/// <summary>
281+
/// Computes a histogram of the values in a tensor.
282+
/// bins can be an integer or a 1D tensor.
283+
/// If bins is an int, it specifies the number of equal-width bins. By default, the lower and upper range of the bins is determined by the minimum and maximum elements of the input tensor. The range argument can be provided to specify a range for the bins.
284+
/// If bins is a 1D tensor, it specifies the sequence of bin edges including the rightmost edge. It should contain at least 2 elements and its elements should be increasing.
285+
/// </summary>
286+
/// <param name="input"> the input tensor. </param>
287+
/// <param name="bins"> int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, defines the sequence of bin edges including the rightmost edge. </param>
288+
/// <param name="range"> Defines the range of the bins. </param>
289+
/// <param name="density"> If False, the result will contain the count (or total weight) in each bin. If True, the result is the value of the probability density function over the bins, normalized such that the integral over the range of the bins is 1. </param>
290+
/// <returns></returns>
291+
public static (Tensor hist, Tensor bin_edges) histogram(Tensor input, HistogramBinSelector bins, (double min, double max)? range = null, bool density = false)
292+
=> Utils.Histogram.histogram(input, bins, range, density);
293+
279294
// https://pytorch.org/docs/stable/generated/torch.histogram.html
280295
/// <summary>
281296
/// Computes a histogram of the values in a tensor.
@@ -305,7 +320,7 @@ public static (Tensor hist, Tensor bin_edges) histogram(Tensor input, Tensor bin
305320
/// </summary>
306321
/// <param name="input"> the input tensor. </param>
307322
/// <param name="bins"> int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, defines the sequence of bin edges including the rightmost edge. </param>
308-
/// /// <param name="range"> Defines the range of the bins. </param>
323+
/// <param name="range"> Defines the range of the bins. </param>
309324
/// <param name="weight"> If provided, weight should have the same shape as input. Each value in input contributes its associated weight towards its bin’s result. </param>
310325
/// <param name="density"> If False, the result will contain the count (or total weight) in each bin. If True, the result is the value of the probability density function over the bins, normalized such that the integral over the range of the bins is 1. </param>
311326
/// <returns></returns>

src/TorchSharp/Utils/Histogram.cs

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
2+
3+
using System;
4+
using System.Collections.Generic;
5+
using System.Linq;
6+
using static TorchSharp.torch;
7+
8+
namespace TorchSharp.Utils
9+
{
10+
// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py
11+
internal static class Histogram
12+
{
13+
public static (Tensor hist, Tensor bin_edges) histogram(Tensor input, HistogramBinSelector bins, (double min, double max)? range, bool density = false)
14+
{
15+
input = RavelAndCheckWeights(input.cpu());
16+
(Tensor bin_edges, (double, double, int) uniform_bins) = GetBinEdges(input, bins, range);
17+
ScalarType ntype = ScalarType.Int32;
18+
int block = 65536;
19+
20+
21+
(double first_edge, double last_edge, int n_equal_bins) = uniform_bins;
22+
Tensor n = zeros(n_equal_bins, ntype);
23+
Tensor norm = n_equal_bins / subtract(last_edge, first_edge);
24+
25+
for (int i = 0; i < input.shape[0]; i += block) {
26+
Tensor tmp_a = input[TensorIndex.Slice(i, i + block)];
27+
Tensor keep = (tmp_a >= first_edge);
28+
keep &= (tmp_a <= last_edge);
29+
if (keep.sum().item<long>() != tmp_a.numel())
30+
tmp_a = tmp_a.masked_select(keep);
31+
32+
tmp_a = tmp_a.to_type(bin_edges.dtype);
33+
Tensor f_indices = subtract(tmp_a, first_edge) * norm;
34+
Tensor indices = f_indices.to_type(ScalarType.Int64);
35+
indices[indices == n_equal_bins] -= 1;
36+
37+
Tensor decrement = tmp_a < bin_edges[indices];
38+
indices[decrement] -= 1;
39+
Tensor increment = ((tmp_a >= bin_edges[indices + 1]) & (indices != n_equal_bins - 1));
40+
indices[increment] += 1;
41+
n += bincount(indices, minlength: n_equal_bins).to_type(ntype);
42+
}
43+
44+
if (density) {
45+
Tensor db = diff(bin_edges).to_type(ScalarType.Float32);
46+
return (n / db / n.sum(), bin_edges);
47+
}
48+
return (n, bin_edges);
49+
}
50+
51+
/// <summary>
52+
/// Computes the bins used internally by `histogram`.
53+
/// </summary>
54+
/// <param name="a"> Ravelled data array </param>
55+
/// <param name="bins"> Forwarded arguments from `histogram`. </param>
56+
/// <param name="range"> Ravelled weights array, or None </param>
57+
/// <returns></returns>
58+
private static (Tensor, (double, double, int)) GetBinEdges(Tensor a, HistogramBinSelector bins, (double min, double max)? range)
59+
{
60+
(double first_edge, double last_edge) = GetOuterEdges(a, range);
61+
if (range is not null) {
62+
Tensor keep = (a >= first_edge);
63+
keep &= (a <= last_edge);
64+
if (keep.sum().item<long>() != a.numel())
65+
a = a.masked_select(keep);
66+
}
67+
68+
int n_equal_bins;
69+
if (a.numel() == 0)
70+
n_equal_bins = 1;
71+
else {
72+
if (a.dtype != ScalarType.Float64)
73+
a = a.to_type(ScalarType.Float64);
74+
Tensor width = histBinSelectors[bins](a, range);
75+
if ((width > 0).item<bool>())
76+
n_equal_bins = ceil(subtract(last_edge, first_edge) / width).to_type(ScalarType.Int32).item<int>();
77+
else
78+
n_equal_bins = 1;
79+
}
80+
81+
Tensor bin_edges = linspace(first_edge, last_edge, n_equal_bins + 1, ScalarType.Float64, requires_grad: true);
82+
return (bin_edges, (first_edge, last_edge, n_equal_bins));
83+
}
84+
85+
/// <summary>
86+
/// Determine the outer bin edges to use, from either the data or the range argument
87+
/// </summary>
88+
/// <param name="a"></param>
89+
/// <param name="range"></param>
90+
/// <returns></returns>
91+
/// <exception cref="ArgumentException"></exception>
92+
private static (double, double) GetOuterEdges(Tensor a, (double min, double max)? range)
93+
{
94+
double first_edge, last_edge;
95+
if (range is not null) {
96+
(first_edge, last_edge) = range.Value;
97+
if (first_edge > last_edge)
98+
throw new ArgumentException("max must be larger than min in range parameter.");
99+
if (double.IsInfinity(first_edge) || double.IsNaN(first_edge) || double.IsInfinity(last_edge) || double.IsNaN(last_edge))
100+
throw new ArgumentException($"supplied range of [{first_edge}, {last_edge}] is not finite");
101+
} else if (a.numel() == 0) {
102+
(first_edge, last_edge) = (0, 1);
103+
} else {
104+
(first_edge, last_edge) = (a.min().to_type(ScalarType.Float64).item<double>(), a.max().to_type(ScalarType.Float64).item<double>());
105+
if (double.IsInfinity(first_edge) || double.IsNaN(first_edge) || double.IsInfinity(last_edge) || double.IsNaN(last_edge))
106+
throw new ArgumentException($"autodetected range of [{first_edge}, {last_edge}] is not finite");
107+
}
108+
109+
if (first_edge == last_edge)
110+
(first_edge, last_edge) = (first_edge - 0.5, last_edge + 0.5);
111+
return (first_edge, last_edge);
112+
}
113+
114+
/// <summary>
115+
/// Check a and weights have matching shapes, and ravel both
116+
///
117+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L283
118+
/// </summary>
119+
/// <param name="input"></param>
120+
/// <returns></returns>
121+
private static Tensor RavelAndCheckWeights(Tensor input)
122+
{
123+
if (input.dtype == ScalarType.Bool)
124+
input = input.to_type(ScalarType.Int8);
125+
input = input.ravel();
126+
127+
return input;
128+
}
129+
130+
#region hist_bin
131+
private static Dictionary<HistogramBinSelector, Func<Tensor, (double min, double max)?, Tensor>> histBinSelectors
132+
= new Dictionary<HistogramBinSelector, Func<Tensor, (double min, double max)?, Tensor>>()
133+
{
134+
{ HistogramBinSelector.Stone, HistBinStone },
135+
{ HistogramBinSelector.Doane, HistBinDoane },
136+
{ HistogramBinSelector.Rice, HistBinRice },
137+
{ HistogramBinSelector.Scott, HistBinScott },
138+
{ HistogramBinSelector.Sqrt, HistBinSqrt },
139+
{ HistogramBinSelector.Sturges, HistBinSturges },
140+
};
141+
142+
/// <summary>
143+
/// Square root histogram bin estimator.
144+
///
145+
/// Bin width is inversely proportional to the data size. Used by many
146+
/// programs for its simplicity.
147+
///
148+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L32
149+
/// </summary>
150+
/// <param name="x"> Input data that is to be histogrammed, trimmed to range. May not be empty. </param>
151+
/// <param name="_"></param>
152+
/// <returns> An estimate of the optimal bin width for the given data. </returns>
153+
private static Tensor HistBinSqrt(Tensor x, (double min, double max)? _)
154+
=> Ptp(x) / sqrt(x.numel());
155+
156+
/// <summary>
157+
/// Sturges histogram bin estimator.
158+
///
159+
/// A very simplistic estimator based on the assumption of normality of
160+
/// the data.This estimator has poor performance for non-normal data,
161+
/// which becomes especially obvious for large data sets.The estimate
162+
/// depends only on size of the data.
163+
///
164+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L53
165+
/// </summary>
166+
/// <param name="x"> Input data that is to be histogrammed, trimmed to range. May not be empty. </param>
167+
/// <param name="_"></param>
168+
/// <returns> An estimate of the optimal bin width for the given data. </returns>
169+
private static Tensor HistBinSturges(Tensor x, (double min, double max)? _)
170+
=> Ptp(x) / (log2(x.numel()) + 1);
171+
172+
/// <summary>
173+
/// Rice histogram bin estimator.
174+
///
175+
/// Another simple estimator with no normality assumption. It has better
176+
/// performance for large data than Sturges, but tends to overestimate
177+
/// the number of bins. The number of bins is proportional to the cube
178+
/// root of data size (asymptotically optimal). The estimate depends
179+
/// only on size of the data.
180+
///
181+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L76
182+
/// </summary>
183+
/// <param name="x"> Input data that is to be histogrammed, trimmed to range. May not be empty. </param>
184+
/// <param name="_"></param>
185+
/// <returns> An estimate of the optimal bin width for the given data. </returns>
186+
private static Tensor HistBinRice(Tensor x, (double min, double max)? _)
187+
=> Ptp(x) / (2 * pow(x.numel(), 1.0 / 3.0));
188+
189+
/// <summary>
190+
/// Scott histogram bin estimator.
191+
///
192+
/// The binwidth is proportional to the standard deviation of the data
193+
/// and inversely proportional to the cube root of data size
194+
/// (asymptotically optimal).
195+
///
196+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L100
197+
/// </summary>
198+
/// <param name="x"> Input data that is to be histogrammed, trimmed to range. May not be empty. </param>
199+
/// <param name="_"></param>
200+
/// <returns> An estimate of the optimal bin width for the given data. </returns>
201+
private static Tensor HistBinScott(Tensor x, (double min, double max)? _)
202+
=> Math.Pow(24.0 * Math.Pow(Math.PI, 0.5) / x.numel(), 1.0 / 3.0) * std(x, false);
203+
204+
/// <summary>
205+
/// Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
206+
///
207+
/// The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
208+
/// The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
209+
/// https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
210+
///
211+
/// This paper by Stone appears to be the origination of this rule.
212+
/// http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
213+
///
214+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L122
215+
/// </summary>
216+
/// <param name="x"> Input data that is to be histogrammed, trimmed to range. May not be empty. </param>
217+
/// <param name="range"> The lower and upper range of the bins. </param>
218+
/// <returns> An estimate of the optimal bin width for the given data. </returns>
219+
private static Tensor HistBinStone(Tensor x, (double min, double max)? range)
220+
{
221+
long n = x.numel();
222+
Tensor ptp_x = Ptp(x);
223+
if (n <= 1 || (ptp_x == 0).item<bool>())
224+
return 0;
225+
226+
double Jhat(int nbins)
227+
{
228+
Tensor hh = ptp_x / nbins;
229+
Tensor pk = torch.histogram(x, bins: nbins, range: range).hist / n;
230+
return ((2 - (n + 1) * pk.dot(pk)) / hh).to_type(ScalarType.Float64).item<double>();
231+
}
232+
233+
int nbinsUpperBound = Math.Max(100, Convert.ToInt32(Math.Sqrt(n)));
234+
int nbins = 0;
235+
double jhatTemp = double.PositiveInfinity;
236+
foreach (int item in Enumerable.Range(1, nbinsUpperBound + 1)) {
237+
double jhat = Jhat(item);
238+
if (jhat < jhatTemp) {
239+
jhatTemp = jhat;
240+
nbins = item;
241+
}
242+
}
243+
return ptp_x / nbins;
244+
}
245+
246+
/// <summary>
247+
/// Doane's histogram bin estimator.
248+
///
249+
/// Improved version of Sturges' formula which works better for
250+
/// non-normal data. See
251+
/// stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
252+
///
253+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L164
254+
/// </summary>
255+
/// <param name="x"> Input data that is to be histogrammed, trimmed to range. May not be empty. </param>
256+
/// <param name="_"></param>
257+
/// <returns> An estimate of the optimal bin width for the given data. </returns>
258+
private static Tensor HistBinDoane(Tensor x, (double min, double max)? _)
259+
{
260+
long size = x.numel();
261+
if (size > 2) {
262+
Tensor sg1 = sqrt(6.0 * (size - 2) / ((size + 1.0) * (size + 3)));
263+
Tensor sigma = x.std();
264+
if ((sigma > 0.0).item<bool>()) {
265+
Tensor temp = x - x.mean();
266+
temp = true_divide(temp, sigma);
267+
temp = float_power(temp, 3);
268+
Tensor g1 = temp.mean();
269+
return Ptp(x) / (1.0 + log2(size) + log2(1.0 + absolute(g1) / sg1));
270+
}
271+
}
272+
return 0.0;
273+
}
274+
#endregion
275+
276+
/// <summary>
277+
/// This implementation avoids the problem of signed integer arrays having a
278+
/// peak-to-peak value that cannot be represented with the array's data type.
279+
/// This function returns an value for signed integer arrays.
280+
///
281+
/// https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/histograms.py#L22
282+
/// </summary>
283+
/// <param name="input"></param>
284+
/// <returns></returns>
285+
private static Tensor Ptp(Tensor input)
286+
=> subtract(input.max(), input.min());
287+
}
288+
}

0 commit comments

Comments
 (0)