QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
launch_kernel.cuh
Go to the documentation of this file.
1 #define LAUNCH_KERNEL(kernel, tp, stream, arg, ...) \
2  switch (tp.block.x) { \
3  case 32: \
4  kernel<32,__VA_ARGS__> \
5  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
6  break; \
7  case 64: \
8  kernel<64,__VA_ARGS__> \
9  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
10  break; \
11  case 96: \
12  kernel<96,__VA_ARGS__> \
13  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
14  break; \
15  case 128: \
16  kernel<128,__VA_ARGS__> \
17  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
18  break; \
19  case 160: \
20  kernel<160,__VA_ARGS__> \
21  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
22  break; \
23  case 192: \
24  kernel<192,__VA_ARGS__> \
25  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
26  break; \
27  case 224: \
28  kernel<224,__VA_ARGS__> \
29  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
30  break; \
31  case 256: \
32  kernel<256,__VA_ARGS__> \
33  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
34  break; \
35  case 288: \
36  kernel<288,__VA_ARGS__> \
37  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
38  break; \
39  case 320: \
40  kernel<320,__VA_ARGS__> \
41  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
42  break; \
43  case 352: \
44  kernel<352,__VA_ARGS__> \
45  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
46  break; \
47  case 384: \
48  kernel<384,__VA_ARGS__> \
49  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
50  break; \
51  case 416: \
52  kernel<416,__VA_ARGS__> \
53  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
54  break; \
55  case 448: \
56  kernel<448,__VA_ARGS__> \
57  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
58  break; \
59  case 480: \
60  kernel<480,__VA_ARGS__> \
61  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
62  break; \
63  case 512: \
64  kernel<512,__VA_ARGS__> \
65  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
66  break; \
67  case 544: \
68  kernel<544,__VA_ARGS__> \
69  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
70  break; \
71  case 576: \
72  kernel<576,__VA_ARGS__> \
73  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
74  break; \
75  case 608: \
76  kernel<608,__VA_ARGS__> \
77  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
78  break; \
79  case 640: \
80  kernel<640,__VA_ARGS__> \
81  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
82  break; \
83  case 672: \
84  kernel<672,__VA_ARGS__> \
85  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
86  break; \
87  case 704: \
88  kernel<704,__VA_ARGS__> \
89  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
90  break; \
91  case 736: \
92  kernel<736,__VA_ARGS__> \
93  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
94  break; \
95  case 768: \
96  kernel<768,__VA_ARGS__> \
97  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
98  break; \
99  case 800: \
100  kernel<800,__VA_ARGS__> \
101  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
102  break; \
103  case 832: \
104  kernel<832,__VA_ARGS__> \
105  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
106  break; \
107  case 864: \
108  kernel<864,__VA_ARGS__> \
109  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
110  break; \
111  case 896: \
112  kernel<896,__VA_ARGS__> \
113  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
114  break; \
115  case 928: \
116  kernel<928,__VA_ARGS__> \
117  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
118  break; \
119  case 960: \
120  kernel<960,__VA_ARGS__> \
121  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
122  break; \
123  case 992: \
124  kernel<992,__VA_ARGS__> \
125  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
126  break; \
127  case 1024: \
128  kernel<1024,__VA_ARGS__> \
129  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
130  break; \
131  default: \
132  errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
133  }
134 
135 #define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg, ...) \
136  switch (tp.block.x) { \
137  case 32: \
138  kernel<32,__VA_ARGS__> \
139  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
140  break; \
141  case 64: \
142  kernel<64,__VA_ARGS__> \
143  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
144  break; \
145  case 96: \
146  kernel<96,__VA_ARGS__> \
147  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
148  break; \
149  case 128: \
150  kernel<128,__VA_ARGS__> \
151  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
152  break; \
153  case 160: \
154  kernel<160,__VA_ARGS__> \
155  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
156  break; \
157  case 192: \
158  kernel<192,__VA_ARGS__> \
159  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
160  break; \
161  case 224: \
162  kernel<224,__VA_ARGS__> \
163  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
164  break; \
165  case 256: \
166  kernel<256,__VA_ARGS__> \
167  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
168  break; \
169  case 288: \
170  kernel<288,__VA_ARGS__> \
171  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
172  break; \
173  case 320: \
174  kernel<320,__VA_ARGS__> \
175  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
176  break; \
177  case 352: \
178  kernel<352,__VA_ARGS__> \
179  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
180  break; \
181  case 384: \
182  kernel<384,__VA_ARGS__> \
183  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
184  break; \
185  case 416: \
186  kernel<416,__VA_ARGS__> \
187  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
188  break; \
189  case 448: \
190  kernel<448,__VA_ARGS__> \
191  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
192  break; \
193  case 480: \
194  kernel<480,__VA_ARGS__> \
195  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
196  break; \
197  case 512: \
198  kernel<512,__VA_ARGS__> \
199  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \
200  break; \
201  default: \
202  errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
203  }
204 
205 #define LAUNCH_KERNEL_MG_BLOCK_SIZE(kernel, tp, stream, arg, ...) \
206  switch (tp.block.x) { \
207  case 4: kernel<4, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
208  case 8: kernel<8, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
209  case 12: kernel<12, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
210  case 16: kernel<16, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
211  case 27: kernel<27, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
212  case 32: kernel<32, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
213  case 36: kernel<36, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
214  case 54: kernel<54, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
215  case 64: kernel<64, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
216  case 72: kernel<72, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
217  case 81: kernel<81, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
218  case 96: kernel<96, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
219  case 100: kernel<100, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
220  case 108: kernel<108, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
221  case 128: kernel<128, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
222  case 144: kernel<144, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
223  case 192: kernel<192, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
224  case 200: kernel<200, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
225  case 256: kernel<256, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
226  case 288: kernel<288, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
227  case 432: kernel<432, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
228  case 500: kernel<500, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
229  case 512: kernel<512, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \
230  default: errorQuda("%s block size %d not instantiated", #kernel, tp.block.x); \
231  }