3 // typedef ... reduce_input_t;
4 // typedef ... reduce_output_t;
6 // void reduce_initialize(reduce_output_t* dst);
7 // void reduce_transform(
8 // reduce_output_t* dst, reduce_input_t const* src, unsigned long offset);
9 // void reduce_combine(
10 // reduce_output_t* accumulated, reduce_output_t const* value);
12 inline void reduce_combine_local2local(
13 __local reduce_output_t* accumulated, __local reduce_output_t* value) {
14 reduce_output_t tmp = *value;
15 reduce_output_t acc = *accumulated;
16 reduce_combine(&acc, &tmp);
20 inline void reduce_transform_global(
22 __global reduce_input_t const* src, unsigned long offset) {
23 reduce_input_t val = src[offset];
24 reduce_transform(lhs, &val, offset);
27 void reduce_scratch(__local reduce_output_t* scratch, uint size);
29 // First phase of a generic transform-reduce. Need to transform
30 __kernel void generic_transform_reduce_initial(
31 __global reduce_output_t* dst,
32 unsigned long VPT, unsigned long TPB,
33 unsigned long const N , __global reduce_input_t const* src,
34 __local reduce_output_t* scratch) {
35 uint const offset = get_group_id(0) * TPB;
36 uint const lid = get_local_id(0);
39 reduce_initialize(&p);
40 for (uint i = 0; i < VPT; ++i) {
41 ulong loc = offset + lid + i * get_global_size(0);
44 reduce_transform_global(&tmp, src, loc);
45 reduce_combine(&p, &tmp);
49 reduce_scratch(scratch, TPB);
51 dst[get_group_id(0)] = scratch[0];
55 __kernel void generic_transform_reduce_intermediate(
56 __global reduce_output_t* dst,
57 unsigned long VPT, unsigned long TPB,
58 unsigned long const N, __global reduce_output_t const *src,
59 __local reduce_output_t* scratch) {
60 uint const offset = get_group_id(0) * TPB;
61 uint const lid = get_local_id(0);
64 reduce_initialize(&p);
65 for (uint i = 0; i < VPT; ++i) {
66 uint loc = offset + lid + i * get_global_size(0);
68 reduce_output_t tmp = src[loc];
69 reduce_combine(&p, &tmp);
73 reduce_scratch(scratch, TPB);
75 dst[get_group_id(0)] = scratch[0];
79 void reduce_scratch(__local reduce_output_t* scratch, uint size) {
80 uint const lid = get_local_id(0);
81 for (int i = size / 2; i >= 1; i >>= 1) {
82 barrier(CLK_LOCAL_MEM_FENCE);
83 // keep the processing threads consolidated ...
85 reduce_combine_local2local(scratch + lid, scratch + lid + i);
88 barrier(CLK_LOCAL_MEM_FENCE);
91 __kernel void scratch_element_size(__global ulong* result) {
92 uint const lid = get_local_id(0);
93 uint const gid = get_local_id(0);
94 if (gid == 0 && lid == 0) {
95 *result = sizeof(reduce_output_t);