JayBeams 0.1
bm_generic_reduce.cpp
// Headers for the jb::opencl and jb::testing components used below; the
// exact paths are assumed.
#include <jb/opencl/device_selector.hpp>
#include <jb/opencl/generic_reduce.hpp>
#include <jb/opencl/microbenchmark_config.hpp>
#include <jb/testing/initialize_mersenne_twister.hpp>
#include <jb/testing/microbenchmark.hpp>
#include <jb/testing/microbenchmark_group_main.hpp>
#include <jb/complex_traits.hpp>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

#include <algorithm>
#include <iostream>
#include <random>
#include <sstream>
#include <vector>

/// Functions and types to benchmark the generic reduction functions.
namespace {
#ifndef JB_OPENCL_bm_generic_reduce_minimum_size
#define JB_OPENCL_bm_generic_reduce_minimum_size 16
#endif // JB_OPENCL_bm_generic_reduce_minimum_size
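// Lower bound for the randomized iteration size used in
// base_fixture<T>::iteration_setup(); override it at compile time to change
// the smallest size exercised by the benchmark.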

/**
 * The configuration class for this benchmark.
 */
class config : public jb::opencl::microbenchmark_config {
public:
  config();
  config_object_constructors(config);

  // Benchmark-specific attributes; the declarations are inferred from the
  // constructor definition further below.
  jb::config_attribute<config, bool> randomize_size;
  jb::config_attribute<config, bool> copy_data;
};

/// Return a table with all the testcases.
jb::testing::microbenchmark_group<config> create_testcases();
} // anonymous namespace

int main(int argc, char* argv[]) {
  auto testcases = create_testcases();
  return jb::testing::microbenchmark_group_main<config>(argc, argv, testcases);
}

namespace {
std::string randomize_size_help() {
  std::ostringstream os;
  os << "If true, the size is randomized in each iteration."
     << " This is useful when trying to build regression models,"
     << " but not when trying to fine tune algorithms."
     << " The random size is distributed uniformly between "
     << JB_OPENCL_bm_generic_reduce_minimum_size
     << " and the configured size of the test.";
  return os.str();
}

config::config()
    : jb::opencl::microbenchmark_config()
    , randomize_size(
          desc("randomize-size").help(randomize_size_help()), this, true)
    , copy_data(
          desc("copy-data")
              .help("If set, the test copies fresh data to the OpenCL device"
                    " on each iteration. Effectively that tests copy +"
                    " reduction."
                    " Disabling this flag tests reduction assuming the data is"
                    " already on the device."),
          this, true) {
}

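/**
 * Map the value type to the prefix of the OpenCL limit macros, so that
 * reduce_min<T> below can initialize its accumulator to FLT_MAX or DBL_MAX
 * in the generated OpenCL code.
 */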
template <typename T>
struct opencl_type_traits {};

template <>
struct opencl_type_traits<double> {
  static char const* macro_prefix() {
    return "DBL_";
  }
};

template <>
struct opencl_type_traits<float> {
  static char const* macro_prefix() {
    return "FLT_";
  }
};

/**
 * A reducer to drive the test: it finds the minimum value in the range.
 */
template <typename T>
class reduce_min : public jb::opencl::generic_reduce<reduce_min<T>, T, T> {
public:
  reduce_min(std::size_t size, boost::compute::command_queue const& queue)
      : jb::opencl::generic_reduce<reduce_min<T>, T, T>(size, queue) {
  }

  /// @returns the body of the initialization function
  static std::string initialize_body(char const* lhs) {
    return std::string("*") + lhs + " = " +
           opencl_type_traits<T>::macro_prefix() + "MAX;";
  }

  /// @returns the body of the transform function
  static std::string
  transform_body(char const* lhs, char const* value, char const*) {
    return std::string("*") + lhs + " = *" + value + ";";
  }

  /// @returns the body of the combine function
  static std::string combine_body(char const* accumulated, char const* value) {
    return std::string("*") + accumulated + " = min(*" + accumulated + ", *" +
           value + ");";
  }
};
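// For T == float the three bodies above expand, illustratively, to the
// following OpenCL C statements, so the reduction computes the minimum:
//   initialize: *lhs = FLT_MAX;
//   transform:  *lhs = *value;
//   combine:    *accumulated = min(*accumulated, *value);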

/**
 * Base fixture to benchmark OpenCL reductions.
 */
template <typename T>
class base_fixture {
public:
  /// Constructor for default size
  base_fixture(
      config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : base_fixture(1024, cfg, context, q) {
    // 1024 is an assumed default, matching the derived fixtures below.
  }

  /// Constructor with known size
  base_fixture(
      int size, config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : host_(size)
      , device_(size, context)
      , queue_(q)
      , generator_(jb::testing::initialize_mersenne_twister<std::mt19937_64>(
            // (seed arguments assumed)
            0, jb::testing::default_initialization_marker))
      , iteration_size_(0)
      , avoid_optimization_(0)
      , cfg_(cfg) {
    // ... fill the host data with size, size - 1, ..., 1 and copy it to the
    // device so both sides start with identical contents ...
    int counter = 0;
    for (auto& i : host_) {
      i = size + 1 - ++counter;
    }
    boost::compute::copy(host_.begin(), host_.end(), device_.begin(), queue_);
    queue_.finish();
  }

  /// Pick a random size for each iteration when size randomization is enabled
  void iteration_setup() {
    if (cfg_.randomize_size()) {
      // ... pick a random size to execute the test at ...
      iteration_size_ = std::uniform_int_distribution<>(
          JB_OPENCL_bm_generic_reduce_minimum_size,
          host_.size() - 1)(generator_);
    }
  }

  /// Return the value accumulated through all iterations
  T avoid_optimization() const {
    return avoid_optimization_;
  }

protected:
  std::vector<T> host_;
  boost::compute::vector<T> device_;
  boost::compute::command_queue queue_;
  std::mt19937_64 generator_;
  int iteration_size_;
  T avoid_optimization_;
  config cfg_;
};

/**
 * A fixture to benchmark the boost::compute::reduce function.
 */
template <typename T>
class boost_fixture : public base_fixture<T> {
public:
  /// Constructor for default size
  boost_fixture(
      config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : boost_fixture(1024, cfg, context, q) {
  }

  /// Constructor with known size
  boost_fixture(
      int size, config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : base_fixture<T>(size, cfg, context, q) {
  }

  int run() {
    if (this->cfg_.copy_data()) {
      // ... copy the first iteration_size_ elements to the device ...
      (void)boost::compute::copy(
          this->host_.begin(), this->host_.begin() + this->iteration_size_,
          this->device_.begin(), this->queue_);
    }
    // ... run a reduction on the device to pick the minimum value
    // across those elements ...
    T result = 0;
    boost::compute::reduce(
        this->device_.begin(), this->device_.begin() + this->iteration_size_,
        &result, boost::compute::min<T>(), this->queue_);
    this->queue_.finish();
    this->avoid_optimization_ += result;
    // ... return the iteration size ...
    return this->iteration_size_;
  }
};

/**
 * A fixture to benchmark the boost::compute::reduce function when the data
 * is copied to the device asynchronously.
 */
template <typename T>
class boost_async_fixture : public base_fixture<T> {
public:
  /// Constructor for default size
  boost_async_fixture(
      config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : boost_async_fixture(1024, cfg, context, q) {
  }

  /// Constructor with known size
  boost_async_fixture(
      int size, config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : base_fixture<T>(size, cfg, context, q) {
  }

  int run() {
    // ... copy the first iteration_size_ elements to the device ...
    if (this->cfg_.copy_data()) {
      auto end = boost::compute::copy_async(
          this->host_.begin(), this->host_.begin() + this->iteration_size_,
          this->device_.begin(), this->queue_);
      // ... enqueue a barrier to only start the reduction once the copy
      // has completed ...
      this->queue_.enqueue_barrier();
    }
    // ... run a reduction on the device to pick the minimum value
    // across those elements ...
    T result = 0;
    boost::compute::reduce(
        this->device_.begin(), this->device_.begin() + this->iteration_size_,
        &result, boost::compute::min<T>(), this->queue_);
    this->queue_.finish();
    this->avoid_optimization_ += result;
    // ... return the iteration size ...
    return this->iteration_size_;
  }
};

/**
 * A fixture to benchmark jb::opencl::generic_reduce, using the reduce_min<T>
 * reducer defined above.
 */
template <typename T>
class generic_reduce_fixture : public base_fixture<T> {
public:
  /// Constructor for default size
  generic_reduce_fixture(
      config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : generic_reduce_fixture(1024, cfg, context, q) {
  }

  /// Constructor with known size
  generic_reduce_fixture(
      int size, config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : base_fixture<T>(size, cfg, context, q)
      , reducer_(size, q) {
  }

  int run() {
    boost::compute::wait_list wl;
    if (this->cfg_.copy_data()) {
      // ... copy the first iteration_size_ elements to the device ...
      auto end = boost::compute::copy_async(
          this->host_.begin(), this->host_.begin() + this->iteration_size_,
          this->device_.begin(), this->queue_);
      wl = boost::compute::wait_list(end.get_event());
    }
    // ... run a reduction on the device to pick the minimum value
    // across those elements ...
    auto result = reducer_.execute(
        this->device_.begin(), this->device_.begin() + this->iteration_size_,
        wl);
    result.wait();
    this->avoid_optimization_ += *result.get();
    // ... return the iteration size ...
    return this->iteration_size_;
  }

private:
  reduce_min<T> reducer_;
};

/**
 * A fixture to benchmark std::min_element() on the host, as a baseline for
 * the OpenCL variants.
 */
template <typename T>
class std_fixture : public base_fixture<T> {
public:
  /// Constructor for default size
  std_fixture(
      config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : std_fixture(1024, cfg, context, q) {
  }

  /// Constructor with known size
  std_fixture(
      int size, config const& cfg, boost::compute::context& context,
      boost::compute::command_queue& q)
      : base_fixture<T>(size, cfg, context, q) {
  }

  int run() {
    auto iterator = std::min_element(
        this->host_.begin(), this->host_.begin() + this->iteration_size_);
    this->avoid_optimization_ += *iterator;
    return this->iteration_size_;
  }
};
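// Taken together the fixtures compare four strategies for the same
// reduction: a blocking copy plus boost::compute::reduce (boost_fixture),
// an asynchronous copy synchronized with a queue barrier plus
// boost::compute::reduce (boost_async_fixture), an asynchronous copy handed
// to jb::opencl::generic_reduce via a wait_list (generic_reduce_fixture),
// and a host-only std::min_element baseline (std_fixture).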

/**
 * Create a microbenchmark test-case.
 */
template <typename fixture_type>
std::function<void(config const&)> test_case() {
  return [](config const& cfg) {
    boost::compute::device device = jb::opencl::device_selector(cfg.opencl());
    boost::compute::context context(device);
    boost::compute::command_queue queue(context, device);

    std::cerr << "device=" << device.name() << std::endl;

    using benchmark = jb::testing::microbenchmark<fixture_type>;
    benchmark bm(cfg.microbenchmark());

    auto r = bm.run(cfg, context, queue);
    bm.typical_output(r);
  };
}
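// Each fixture used with test_case<>() exposes the same interface: the two
// constructors shown above, iteration_setup() (inherited from
// base_fixture<T>), and run() returning the number of elements processed.
// Presumably jb::testing::microbenchmark invokes iteration_setup() before
// timing each call to run().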

/// A table with all the microbenchmark cases
jb::testing::microbenchmark_group<config> create_testcases() {
  return jb::testing::microbenchmark_group<config>{
      {"boost:float", test_case<boost_fixture<float>>()},
      {"boost:double", test_case<boost_fixture<double>>()},
      {"boost_async:float", test_case<boost_async_fixture<float>>()},
      {"boost_async:double", test_case<boost_async_fixture<double>>()},
      {"generic_reduce:float", test_case<generic_reduce_fixture<float>>()},
      {"generic_reduce:double", test_case<generic_reduce_fixture<double>>()},
      {"std:float", test_case<std_fixture<float>>()},
      {"std:double", test_case<std_fixture<double>>()},
  };
}
} // anonymous namespace
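The reduce_min<T> class above exercises every member that jb::opencl::generic_reduce appears to require from a reducer: a constructor taking the size and a command queue, plus the three static *_body() functions returning OpenCL C statements. As an illustration only, a hypothetical reduce_sum<T> (not part of this file) could reuse the same interface to compute the sum of the elements instead:

template <typename T>
class reduce_sum : public jb::opencl::generic_reduce<reduce_sum<T>, T, T> {
public:
  reduce_sum(std::size_t size, boost::compute::command_queue const& queue)
      : jb::opencl::generic_reduce<reduce_sum<T>, T, T>(size, queue) {
  }

  /// @returns the body of the initialization function: start the sum at zero
  static std::string initialize_body(char const* lhs) {
    return std::string("*") + lhs + " = 0;";
  }

  /// @returns the body of the transform function: copy the input value
  static std::string
  transform_body(char const* lhs, char const* value, char const*) {
    return std::string("*") + lhs + " = *" + value + ";";
  }

  /// @returns the body of the combine function: add the partial results
  static std::string combine_body(char const* accumulated, char const* value) {
    return std::string("*") + accumulated + " += *" + value + ";";
  }
};

Such a reducer would be driven exactly like reduce_min<T> in generic_reduce_fixture<T>::run(), i.e. through execute() with an optional wait_list.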
Referenced declarations:
  jb::opencl::generic_reduce: Implement a generic reducer for OpenCL.
  boost::compute::device device_selector(config const& cfg): Select an OpenCL device matching the current configuration.
  #define JB_OPENCL_bm_generic_reduce_minimum_size
  jb::opencl::microbenchmark_config: The configuration shared by all OpenCL microbenchmarks.
  results run(Args&&... args): Run the microbenchmark.
  std::map<std::string, std::function<void(config const& cfg)>> microbenchmark_group: Define a representation for a group of microbenchmarks.
  jb::config_attribute: Helper class to easily define configuration attributes.
  int main(int argc, char* argv[])
  char const default_initialization_marker[]
  config_object_constructors(microbenchmark_config)
  static attribute_descriptor desc(std::string const& name): Convenience function to create attribute descriptors with less typing.
  jb::testing::microbenchmark: Run a micro-benchmark on a given class.