JayBeams  0.1
Another project to have fun coding.
histogram.hpp
Go to the documentation of this file.
1 #ifndef jb_histogram_hpp
2 #define jb_histogram_hpp
3 
5 #include <cmath>
6 #include <cstddef>
7 #include <cstdint>
8 #include <limits>
9 #include <stdexcept>
10 #include <type_traits>
11 #include <vector>
12 
13 namespace jb {
14 
15 /**
16  * A histogram class with controllable binning and range strategy.
17  *
18  * We are interested in capturing histograms of latency, and of rate
19  * measurements, and other things. The challenge is that we need good
20  * precision, but we also need to keep thousands of these histograms
21  * in memory. To increase precision in histograms one must increase
22  * the number of bins, but that increases the memory requirements.
23  *
24  * Consider, for example, a typical latency measurement that typically
25  * ranges from 1 to 400 microseconds, but sometimes can peak to
26  * seconds. Capturing the full range at microsecond precision would
27  * require MiB of memory. While this is not a problem for a single
28  * histogram, what if we want to keep this histogram per symbol?
29  *
30  * As an alternative, we can keep this histogram at full resolution
31  * between 0 and 1,000 microseconds, drop to 10 microsecond bins from
32  * there to 10,000 microseconds, then increase to 100 microsecond bins
33  * and so forth. For values higher than a prescribed limit we simply
34  * record "overflow".
35  *
36  * This class implements such a histogram, with a user-provided
37  * mapping strategy to define the bins.
38  *
39  * @tparam binning_strategy_t Please see @ref jb::binning_strategy_concept.
40  * @tparam counter_type The type used for the counters. In most
41  * applications using int is sufficient, but if you are counting
42  * really large numbers you might consider using std::uint64_t, or
43  * if you are memory constrained consider using std::int16_t.
44  */
45 template <typename binning_strategy_t, typename counter_type_t = unsigned int>
46 class histogram {
47  struct check_constraints;
48 
49 public:
50  //@{
51  /**
52  * @name type traits
53  */
54  typedef binning_strategy_t binning_strategy;
55  typedef typename binning_strategy::sample_type sample_type;
56  typedef counter_type_t counter_type;
57  //@}
58 
59  /**
60  * Construct a histogram given a mapping strategy.
61  *
62  * Most mapping strategies are likely to be stateless, so the
63  * default constructor is adequate.
64  *
65  * @param mapping Define how samples are mapped into bins.
66  */
67  explicit histogram(binning_strategy const& mapping = binning_strategy())
68  : binning_(mapping)
69  , underflow_count_(0)
70  , overflow_count_(0)
71  , observed_min_(binning_.theoretical_max())
72  , observed_max_(binning_.theoretical_min())
73  , nsamples_(0)
74  , bins_(nbins()) {
75  check_constraints checker;
76  }
77 
78  /// Return the number of samples observed to this point.
79  std::uint64_t nsamples() const {
80  return nsamples_;
81  }
82 
83  /// Return the smallest sample value observed to this point.
84  sample_type observed_min() const {
85  return observed_min_;
86  }
87 
88  /// Return the largest sample value observed to this point.
89  sample_type observed_max() const {
90  return observed_max_;
91  }
92 
93  /**
94  * Estimate the mean of the sample distribution.
95  *
96  * Estimating the mean is O(N) where N is the number of bins.
97  */
98  sample_type estimated_mean() const {
99  if (nsamples_ == 0) {
100  throw std::invalid_argument("Cannot estimate mean on an empty histogram");
101  }
102  sample_type acc = sample_type();
103  // ... use the midpoint of the underflow bin to estimate the
104  // contribution of those samples ...
105  if (underflow_count()) {
106  acc +=
107  (midpoint(observed_min(), binning_.histogram_min()) *
108  underflow_count());
109  }
110  sample_type a = binning_.histogram_min();
111  for (std::size_t i = 0; i != bins_.size(); ++i) {
112  sample_type b = binning_.bin2sample(i + 1);
113  acc += midpoint(a, b) * bins_[i];
114  a = b;
115  }
116  if (overflow_count()) {
117  acc +=
118  (midpoint(binning_.histogram_max(), observed_max()) *
119  overflow_count());
120  }
121  return acc / nsamples_;
122  }
123 
124  /**
125  * Estimate a quantile of the sample distribution.
126  *
127  * The inverse of the accumulated density function. Find the
128  * smallest value Q such that at most q * nsamples of the samples
129  * are smaller than Q. If you prefer to think in percentiles make
130  * @code
131  * q = percentile / 100.0
132  * @endcode
133  *
134  * Estimating quantiles is O(N) where N is the number of bins.
135  */
136  sample_type estimated_quantile(double q) const {
137  if (nsamples_ == 0) {
138  throw std::invalid_argument(
139  "Cannot estimate quantile for empty histogram");
140  }
141  if (q < 0 or q > 1.0) {
142  throw std::invalid_argument("Quantile value outside 0 <= q <= 1 range");
143  }
144  std::uint64_t cum_samples = 0;
145  std::uint64_t bin_samples = underflow_count();
146  if (bin_samples and q <= double(cum_samples + bin_samples) / nsamples()) {
147  double s = double(bin_samples) / nsamples();
148  double y_a = double(cum_samples) / nsamples();
149  return binning_.interpolate(
150  observed_min(), binning_.histogram_min(), y_a, s, q);
151  }
152  for (std::size_t i = 0; i != bins_.size(); ++i) {
153  cum_samples += bin_samples;
154  bin_samples = bins_[i];
155  if (bin_samples and q <= double(cum_samples + bin_samples) / nsamples()) {
156  double s = double(bin_samples) / nsamples();
157  double y_a = double(cum_samples) / nsamples();
158  return binning_.interpolate(
159  binning_.bin2sample(i), binning_.bin2sample(i + 1), y_a, s, q);
160  }
161  }
162  cum_samples += bin_samples;
163  bin_samples = overflow_count();
164  if (bin_samples and q < double(cum_samples + bin_samples) / nsamples()) {
165  double s = double(bin_samples) / nsamples();
166  double y_a = double(cum_samples) / nsamples();
167  return binning_.interpolate(
168  binning_.histogram_max(), observed_max(), y_a, s, q);
169  }
170 
171  return observed_max();
172  }
173 
174  /// Return the number of samples smaller than the histogram range.
175  std::uint64_t underflow_count() const {
176  return underflow_count_;
177  }
178 
179  /// Return the number of samples larger than the histogram range.
180  std::uint64_t overflow_count() const {
181  return overflow_count_;
182  }
183 
184  /// Return a simple summary
186  if (nsamples() == 0) {
187  return histogram_summary{0, 0, 0, 0, 0, 0, 0, 0};
188  }
189  return histogram_summary{
190  double(observed_min()), double(estimated_quantile(0.25)),
191  double(estimated_quantile(0.50)), double(estimated_quantile(0.75)),
192  double(estimated_quantile(0.90)), double(estimated_quantile(0.99)),
193  double(observed_max()), nsamples()};
194  }
195 
196  /// Record a new sample.
197  void sample(sample_type const& t) {
198  weighted_sample(t, 1);
199  }
200 
201  /// Record a new sample.
202  void weighted_sample(sample_type const& t, counter_type weight) {
203  if (weight == 0) {
204  return;
205  }
206  nsamples_ += weight;
207  if (observed_min_ > t) {
208  observed_min_ = t;
209  }
210  if (observed_max_ < t) {
211  observed_max_ = t;
212  }
213  if (binning_.histogram_min() <= t and t < binning_.histogram_max()) {
214  auto i = binning_.sample2bin(t);
215  bins_[i] += weight;
216  } else if (t < binning_.histogram_min()) {
217  underflow_count_ += weight;
218  } else {
219  overflow_count_ += weight;
220  }
221  }
222 
223  /// Reset all counters
224  void reset() {
225  histogram fresh(binning_);
226  *this = std::move(fresh);
227  }
228 
229  /// The type used to store the bins.
230  typedef std::vector<counter_type> counters;
231 
232 private:
233  /// Compute the maximum number of bins that might be needed.
234  std::size_t nbins() const {
235  std::size_t max = binning_.sample2bin(binning_.histogram_max());
236  std::size_t min = binning_.sample2bin(binning_.histogram_min());
237  return max - min;
238  }
239 
240  /// Estimate a midpoint
241  sample_type midpoint(sample_type const& a, sample_type const& b) const {
242  return (a + b) / 2;
243  }
244 
245 private:
246  binning_strategy binning_;
247  std::uint64_t underflow_count_;
248  std::uint64_t overflow_count_;
249  sample_type observed_min_;
250  sample_type observed_max_;
251  std::uint64_t nsamples_;
252  counters bins_;
253 };
254 
255 /**
256  * Verify the constraints on the histogram template class template
257  * parameters, and generate better error messages when they are not
258  * met.
259  */
260 template <typename binning_strategy, typename counter_type>
261 struct histogram<binning_strategy, counter_type>::check_constraints {
263  typedef typename histo::sample_type sample_type;
264 
266  static_assert(
267  std::is_integral<counter_type>::value,
268  "The counter_type must be an integral type");
269 
270  static_assert(
271  std::is_convertible<
272  decltype(histogram_min_return_type()), sample_type>::value,
273  "The binning_strategy must provide a min() function, "
274  "and it must return a type compatible with sample_type.");
275  static_assert(
276  std::is_convertible<
277  decltype(histogram_max_return_type()), sample_type>::value,
278  "The binning_strategy must provide a max() function, "
279  "and it must return a type compatible with sample_type.");
280 
281  static_assert(
282  std::is_convertible<
283  decltype(theoretical_min_return_type()), sample_type>::value,
284  "The binning_strategy must provide a theoretical_min() function, "
285  "and it must return a type compatible with sample_type.");
286  static_assert(
287  std::is_convertible<
288  decltype(theoretical_max_return_type()), sample_type>::value,
289  "The binning_strategy must provide a theoretical_max() function, "
290  "and it must return a type compatible with sample_type.");
291 
292  static_assert(
293  std::is_convertible<
294  decltype(interpolate_return_type()), sample_type>::value,
295  "The binning_strategy must provide a interpolate() function, "
296  "and it must return a type compatible with sample_type.");
297 
298  static_assert(
299  std::is_convertible<
300  decltype(sample2bin_return_type()), std::size_t>::value,
301  "The binning_strategy must provide a sample2bin() function, "
302  "and it must return a type compatible with std::size_t.");
303  static_assert(
304  std::is_convertible<
305  decltype(bin2sample_return_type()), sample_type>::value,
306  "The binning_strategy must provide a bin2sample() function, "
307  "and it must return a type compatible with sample_type.");
308 
309  static_assert(
310  1 == sizeof(decltype(has_less_than(std::declval<sample_type>()))),
311  "The sample_type must have a < operator.");
312  static_assert(
313  1 == sizeof(
314  decltype(has_less_than_or_equal(std::declval<sample_type>()))),
315  "The sample_type must have a <= operator.");
316  }
317 
318  auto histogram_min_return_type()
319  -> decltype(std::declval<const binning_strategy>().histogram_min());
320  auto histogram_max_return_type()
321  -> decltype(std::declval<const binning_strategy>().histogram_max());
322 
323  auto theoretical_min_return_type()
324  -> decltype(std::declval<const binning_strategy>().theoretical_min());
325  auto theoretical_max_return_type()
326  -> decltype(std::declval<const binning_strategy>().theoretical_max());
327 
328  auto sample2bin_return_type()
329  -> decltype(std::declval<const binning_strategy>().sample2bin(
330  std::declval<const sample_type>()));
331  auto bin2sample_return_type() -> decltype(
332  std::declval<const binning_strategy>().bin2sample(std::size_t(0)));
333 
334  auto interpolate_return_type()
335  -> decltype(std::declval<const binning_strategy>().interpolate(
336  std::declval<const sample_type>(), std::declval<const sample_type>(),
337  double(1.0), double(1.0), double(0.5)));
338 
339  /// An object to create a SFINAE condition.
340  struct error {
341  char fill[2];
342  };
343 
344  // Use SFINAE to discover if the variable type can be compared as we
345  // need to.
346  error has_less_than(...);
347  auto has_less_than(sample_type const& t)
348  -> decltype(static_cast<void>(t < t), char(0));
349  error has_less_than_or_equal(...);
350  auto has_less_than_or_equal(sample_type const& t)
351  -> decltype(static_cast<void>(t <= t), char(0));
352 };
353 
354 } // namespace jb
355 
356 #endif // jb_histogram_hpp
counter_type_t counter_type
Definition: histogram.hpp:56
void sample(sample_type const &t)
Record a new sample.
Definition: histogram.hpp:197
std::uint64_t underflow_count_
Definition: histogram.hpp:247
std::vector< counter_type > counters
The type used to store the bins.
Definition: histogram.hpp:230
binning_strategy::sample_type sample_type
Definition: histogram.hpp:55
void reset()
Reset all counters.
Definition: histogram.hpp:224
binning_strategy binning_
Definition: histogram.hpp:246
sample_type observed_max_
Definition: histogram.hpp:250
std::size_t nbins() const
Compute the maximum number of bins that might be needed.
Definition: histogram.hpp:234
A histogram class with controllable binning and range strategy.
Definition: histogram.hpp:46
A simple class to capture summary information about a histogram.
histogram< binning_strategy > histo
Definition: histogram.hpp:262
std::uint64_t overflow_count_
Definition: histogram.hpp:248
counters bins_
Definition: histogram.hpp:252
std::uint64_t nsamples_
Definition: histogram.hpp:251
sample_type observed_min_
Definition: histogram.hpp:249
sample_type estimated_quantile(double q) const
Estimate a quantile of the sample distribution.
Definition: histogram.hpp:136
sample_type observed_max() const
Return the largest sample value observed to this point.
Definition: histogram.hpp:89
std::uint64_t underflow_count() const
Return the number of samples smaller than the histogram range.
Definition: histogram.hpp:175
binning_strategy_t binning_strategy
Definition: histogram.hpp:54
histo::sample_type sample_type
Definition: histogram.hpp:263
sample_type midpoint(sample_type const &a, sample_type const &b) const
Estimate a midpoint.
Definition: histogram.hpp:241
histogram_summary summary() const
Return a simple summary.
Definition: histogram.hpp:185
An object to create a SFINAE condition.
Definition: histogram.hpp:340
sample_type observed_min() const
Return the smallest sample value observed to this point.
Definition: histogram.hpp:84
sample_type estimated_mean() const
Estimate the mean of the sample distribution.
Definition: histogram.hpp:98
Verify the constraints on the histogram template class template parameters, and generate better error...
Definition: histogram.hpp:261
std::uint64_t nsamples() const
Return the number of samples observed to this point.
Definition: histogram.hpp:79
void weighted_sample(sample_type const &t, counter_type weight)
Record a new sample.
Definition: histogram.hpp:202
std::uint64_t overflow_count() const
Return the number of samples larger than the histogram range.
Definition: histogram.hpp:180
histogram(binning_strategy const &mapping=binning_strategy())
Construct a histogram given a mapping strategy.
Definition: histogram.hpp:67
The top-level namespace for the JayBeams library.
Definition: as_hhmmss.hpp:7