JayBeams  0.1
Another project to have fun coding.
generic_reduce_program.cl
Go to the documentation of this file.
1 /* -*- c -*- */
2 
3 // typedef ... reduce_input_t;
4 // typedef ... reduce_output_t;
5 
6 // void reduce_initialize(reduce_output_t* dst);
7 // void reduce_transform(
8 // reduce_output_t* dst, reduce_input_t const* src, unsigned long offset);
9 // void reduce_combine(
10 // reduce_output_t* accumulated, reduce_output_t const* value);
11 
12 inline void reduce_combine_local2local(
13  __local reduce_output_t* accumulated, __local reduce_output_t* value) {
14  reduce_output_t tmp = *value;
15  reduce_output_t acc = *accumulated;
16  reduce_combine(&acc, &tmp);
17  *accumulated = acc;
18 }
19 
20 inline void reduce_transform_global(
21  reduce_output_t* lhs,
22  __global reduce_input_t const* src, unsigned long offset) {
23  reduce_input_t val = src[offset];
24  reduce_transform(lhs, &val, offset);
25 }
26 
27 void reduce_scratch(__local reduce_output_t* scratch, uint size);
28 
29 // First phase of a generic transform-reduce. Need to transform
30 __kernel void generic_transform_reduce_initial(
31  __global reduce_output_t* dst,
32  unsigned long VPT, unsigned long TPB,
33  unsigned long const N , __global reduce_input_t const* src,
34  __local reduce_output_t* scratch) {
35  uint const offset = get_group_id(0) * TPB;
36  uint const lid = get_local_id(0);
37 
38  reduce_output_t p;
39  reduce_initialize(&p);
40  for (uint i = 0; i < VPT; ++i) {
41  ulong loc = offset + lid + i * get_global_size(0);
42  if (loc < N) {
43  reduce_output_t tmp;
44  reduce_transform_global(&tmp, src, loc);
45  reduce_combine(&p, &tmp);
46  }
47  }
48  scratch[lid] = p;
49  reduce_scratch(scratch, TPB);
50  if (lid == 0) {
51  dst[get_group_id(0)] = scratch[0];
52  }
53 }
54 
55 __kernel void generic_transform_reduce_intermediate(
56  __global reduce_output_t* dst,
57  unsigned long VPT, unsigned long TPB,
58  unsigned long const N, __global reduce_output_t const *src,
59  __local reduce_output_t* scratch) {
60  uint const offset = get_group_id(0) * TPB;
61  uint const lid = get_local_id(0);
62 
63  reduce_output_t p;
64  reduce_initialize(&p);
65  for (uint i = 0; i < VPT; ++i) {
66  uint loc = offset + lid + i * get_global_size(0);
67  if (loc < N) {
68  reduce_output_t tmp = src[loc];
69  reduce_combine(&p, &tmp);
70  }
71  }
72  scratch[lid] = p;
73  reduce_scratch(scratch, TPB);
74  if (lid == 0) {
75  dst[get_group_id(0)] = scratch[0];
76  }
77 }
78 
79 void reduce_scratch(__local reduce_output_t* scratch, uint size) {
80  uint const lid = get_local_id(0);
81  for (int i = size / 2; i >= 1; i >>= 1) {
82  barrier(CLK_LOCAL_MEM_FENCE);
83  // keep the processing threads consolidated ...
84  if (lid < i) {
85  reduce_combine_local2local(scratch + lid, scratch + lid + i);
86  }
87  }
88  barrier(CLK_LOCAL_MEM_FENCE);
89 }
90 
91 __kernel void scratch_element_size(__global ulong* result) {
92  uint const lid = get_local_id(0);
93  uint const gid = get_local_id(0);
94  if (gid == 0 && lid == 0) {
95  *result = sizeof(reduce_output_t);
96  }
97 }