1 #ifndef jb_opencl_generic_reduce_hpp 2 #define jb_opencl_generic_reduce_hpp 9 #include <boost/compute/command_queue.hpp> 10 #include <boost/compute/container/vector.hpp> 11 #include <boost/compute/memory/local_buffer.hpp> 36 template <
typename reducer,
typename input_type_t,
typename output_type_t>
// NOTE(review): fragment of the generic_reduce constructor — surrounding
// lines are missing from this view, so comments are limited to what the
// visible code shows.
// Query the device; local_memory_size() bounds how much per-workgroup
// scratch the reduction kernels may use.
82 boost::compute::device device =
queue_.get_device();
83 std::size_t local_mem_size = device.local_memory_size();
// Run the tiny 'scratch_element_size' kernel once so the *device* reports
// the size of one scratch element (host and device layouts can differ).
87 boost::compute::kernel sizer(
program_,
"scratch_element_size");
// One-element device buffer to receive the answer.
88 boost::compute::vector<boost::compute::ulong_> dev(1,
queue_.get_context());
89 sizer.set_arg(0, dev);
// Single work-item launch; wait() because the value is needed immediately.
90 queue_.enqueue_1d_range_kernel(sizer, 0, 1, 1).wait();
// Copy the one-element result back to the host.
91 std::vector<boost::compute::ulong_> host(1);
92 boost::compute::copy(dev.begin(), dev.end(), host.begin(),
queue_);
// NOTE(review): fragment of execute() — several original lines are missing
// from this view; comments describe only the visible code.
/// Schedule the execution of a reduction over [begin, end).
/// Launches an initial per-workgroup reduction kernel, then repeated
/// intermediate passes until a single value remains; returns a future
/// for an iterator to the result.
125 template <
typename InputIterator>
126 boost::compute::future<vector_iterator>
execute(
127 InputIterator begin, InputIterator end,
128 boost::compute::wait_list
const& wait = boost::compute::wait_list()) {
129 auto size = std::distance(begin, end);
134 auto workgroups =
size / workgroup_size;
135 if (workgroups == 0) {
// VPT = values per thread: ceil(size / (workgroups * workgroup_size)),
// computed via quotient plus a remainder carry.
145 static_cast<long long>(
size),
146 static_cast<long long>(workgroups * workgroup_size));
148 auto VPT = div.quot + (div.rem != 0);
154 initial_.set_arg(arg++, boost::compute::ulong_(VPT));
155 initial_.set_arg(arg++, boost::compute::ulong_(workgroup_size));
157 initial_.set_arg(arg++, begin.get_buffer());
// One local-memory scratch slot per work item in the group.
159 arg++, boost::compute::local_buffer<output_type>(workgroup_size));
161 auto event =
queue_.enqueue_1d_range_kernel(
162 initial_, 0, workgroups * workgroup_size, workgroup_size, wait);
// Repeatedly reduce the per-workgroup partial results; each pass's
// output (workgroups values) is the next pass's input, stopping when
// only one value remains.
167 for (
auto pass_output_size = workgroups; pass_output_size > 1;
168 pass_output_size = workgroups) {
173 if (pass_output_size < workgroup_size) {
// Shrink the workgroup so it does not exceed the remaining data:
// half the next power of two >= pass_output_size.
181 workgroup_size =
jb::p2ceil(pass_output_size) / 2;
183 workgroups = pass_output_size / workgroup_size;
// Same VPT (values-per-thread) computation as the initial pass.
190 static_cast<long long>(pass_output_size),
191 static_cast<long long>(workgroups * workgroup_size));
192 auto VPT = div.quot + (div.rem != 0);
199 arg++, boost::compute::ulong_(workgroup_size) );
200 intermediate_.set_arg(arg++, boost::compute::ulong_(pass_output_size));
203 arg++, boost::compute::local_buffer<output_type>(workgroup_size));
// Chain each pass on the previous pass's completion event instead of
// blocking the host.
206 event =
queue_.enqueue_1d_range_kernel(
207 intermediate_, 0, workgroups * workgroup_size, workgroup_size,
208 boost::compute::wait_list(event));
// The reduced value lives at the start of ping_; the caller waits on
// the returned future/event before reading it.
216 return boost::compute::make_future(
ping_.begin(), event);
/// Schedule a reduction over a full device vector.
/// Convenience overload: forwards to the iterator-based execute().
231 boost::compute::future<vector_iterator>
execute(
232 boost::compute::vector<input_type>
const& src,
233 boost::compute::wait_list
const& wait = boost::compute::wait_list()) {
234 return execute(src.begin(), src.end(), wait);
// NOTE(review): fragment of create_program() — some lines are missing from
// this view; comments describe only the visible code.
/// Generate and compile the OpenCL program for this reduction: a preamble
/// of typedefs plus three inline helpers whose bodies are supplied by the
/// 'reducer' policy type.
237 static boost::compute::program
239 std::ostringstream os;
240 auto device = queue.get_device();
// Enable double precision only when the device advertises cl_khr_fp64.
241 if (device.supports_extension(
"cl_khr_fp64")) {
242 os <<
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n";
// Map the host input/output types to the typedef names the kernel
// source refers to.
244 os <<
"typedef " << boost::compute::type_name<input_type_t>()
245 <<
" reduce_input_t;\n";
246 os <<
"typedef " << boost::compute::type_name<output_type_t>()
247 <<
" reduce_output_t;\n";
// reduce_initialize(): body provided by the reducer policy.
248 os <<
"inline void reduce_initialize(reduce_output_t* lhs) {\n";
249 os <<
" " << reducer::initialize_body(
"lhs") <<
"\n";
// reduce_transform(): folds one input value (at 'offset') into lhs.
251 os <<
"inline void reduce_transform(\n";
252 os <<
" reduce_output_t* lhs, reduce_input_t const* value,\n";
253 os <<
" unsigned long offset) {\n";
254 os <<
" " << reducer::transform_body(
"lhs",
"value",
"offset") <<
"\n";
// reduce_combine(): merges two partial results.
256 os <<
"inline void reduce_combine(\n";
257 os <<
" reduce_output_t* accumulated, reduce_output_t* value) {\n";
258 os <<
" " << reducer::combine_body(
"accumulated",
"value") <<
"\n";
// Compile the generated source for this queue's context.
262 auto program = boost::compute::program::create_with_source(
263 os.str(), queue.get_context());
266 }
catch (boost::compute::opencl_error
const& ex) {
// On build failure log both the compiler output and the generated
// source, to make kernel errors diagnosable.
267 JB_LOG(
error) <<
"errors building program: " << ex.what() <<
"\n" 268 << program.build_log() <<
"\n";
269 JB_LOG(
error) <<
"Program body\n================\n" 270 << os.str() <<
"\n================\n";
// Device buffers for intermediate per-workgroup partial results; the names
// suggest ping/pong double-buffering between passes (TODO confirm — only
// ping_ is visibly read, via ping_.begin() in execute()).
286 boost::compute::vector<output_type>
ping_;
287 boost::compute::vector<output_type>
pong_;
293 #endif // jb_opencl_generic_reduce_hpp
input_type_t input_type
The host type used to represent the input into the reduction.
boost::compute::kernel initial_
boost::compute::future< vector_iterator > execute(InputIterator begin, InputIterator end, boost::compute::wait_list const &wait=boost::compute::wait_list())
Schedule the execution of a reduction.
std::size_t effective_workgroup_size_
boost::compute::vector< output_type > pong_
Implement a generic reducer for OpenCL.
boost::compute::command_queue queue_
typename boost::compute::vector< input_type >::iterator vector_iterator
The iterator type over the vector used to store the results.
char const generic_reduce_program_source[]
Contains the OpenCL source code for the kernels used in computing the generic reduction.
static boost::compute::program create_program(boost::compute::command_queue const &queue)
boost::compute::kernel intermediate_
#define JB_ASSERT_THROW(PRED)
output_type_t output_type
The host type representing the output of the reduction.
constexpr std::uint64_t p2ceil(std::uint64_t n)
Find the smallest power of 2 larger than n for a 64-bit integer.
boost::compute::vector< output_type > ping_
std::size_t scratch_size_
std::size_t sizeof_output_type_
boost::compute::future< vector_iterator > execute(boost::compute::vector< input_type > const &src, boost::compute::wait_list const &wait=boost::compute::wait_list())
Schedule a reduction for a full vector.
generic_reduce(std::size_t size, boost::compute::command_queue const &queue)
Constructor.
boost::compute::program program_
std::size_t max_workgroup_size_
The top-level namespace for the JayBeams library.