Go to the documentation of this file. 1 #define LAUNCH_KERNEL(kernel, tp, stream, arg, ...) \ 2 switch (tp.block.x) { \ 4 kernel<32,__VA_ARGS__> \ 5 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 8 kernel<64,__VA_ARGS__> \ 9 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 12 kernel<96,__VA_ARGS__> \ 13 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 16 kernel<128,__VA_ARGS__> \ 17 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 20 kernel<160,__VA_ARGS__> \ 21 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 24 kernel<192,__VA_ARGS__> \ 25 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 28 kernel<224,__VA_ARGS__> \ 29 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 32 kernel<256,__VA_ARGS__> \ 33 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 36 kernel<288,__VA_ARGS__> \ 37 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 40 kernel<320,__VA_ARGS__> \ 41 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 44 kernel<352,__VA_ARGS__> \ 45 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 48 kernel<384,__VA_ARGS__> \ 49 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 52 kernel<416,__VA_ARGS__> \ 53 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 56 kernel<448,__VA_ARGS__> \ 57 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 60 kernel<480,__VA_ARGS__> \ 61 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 64 kernel<512,__VA_ARGS__> \ 65 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 68 kernel<544,__VA_ARGS__> \ 69 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 72 kernel<576,__VA_ARGS__> \ 73 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 76 kernel<608,__VA_ARGS__> \ 77 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 80 kernel<640,__VA_ARGS__> \ 81 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 84 kernel<672,__VA_ARGS__> \ 85 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 88 kernel<704,__VA_ARGS__> \ 89 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 92 kernel<736,__VA_ARGS__> \ 93 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 96 kernel<768,__VA_ARGS__> \ 97 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 100 kernel<800,__VA_ARGS__> \ 101 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 104 kernel<832,__VA_ARGS__> \ 105 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 108 kernel<864,__VA_ARGS__> \ 109 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 112 kernel<896,__VA_ARGS__> \ 113 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 116 kernel<928,__VA_ARGS__> \ 117 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 120 kernel<960,__VA_ARGS__> \ 121 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 124 kernel<992,__VA_ARGS__> \ 125 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 128 kernel<1024,__VA_ARGS__> \ 129 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 132 errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \ 135 #define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg, ...) \ 136 switch (tp.block.x) { \ 138 kernel<32,__VA_ARGS__> \ 139 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 142 kernel<64,__VA_ARGS__> \ 143 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 146 kernel<96,__VA_ARGS__> \ 147 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 150 kernel<128,__VA_ARGS__> \ 151 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 154 kernel<160,__VA_ARGS__> \ 155 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 158 kernel<192,__VA_ARGS__> \ 159 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 162 kernel<224,__VA_ARGS__> \ 163 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 166 kernel<256,__VA_ARGS__> \ 167 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 170 kernel<288,__VA_ARGS__> \ 171 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 174 kernel<320,__VA_ARGS__> \ 175 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 178 kernel<352,__VA_ARGS__> \ 179 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 182 kernel<384,__VA_ARGS__> \ 183 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 186 kernel<416,__VA_ARGS__> \ 187 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 190 kernel<448,__VA_ARGS__> \ 191 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 194 kernel<480,__VA_ARGS__> \ 195 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 198 kernel<512,__VA_ARGS__> \ 199 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 202 errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \ 205 #define LAUNCH_KERNEL_MG_BLOCK_SIZE(kernel, tp, stream, arg, ...) \ 206 switch (tp.block.x) { \ 207 case 4: kernel<4, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 208 case 8: kernel<8, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 209 case 12: kernel<12, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 210 case 16: kernel<16, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 211 case 27: kernel<27, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 212 case 32: kernel<32, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 213 case 36: kernel<36, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 214 case 54: kernel<54, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 215 case 64: kernel<64, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 216 case 72: kernel<72, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 217 case 81: kernel<81, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 218 case 96: kernel<96, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 219 case 100: kernel<100, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 220 case 108: kernel<108, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 221 case 128: kernel<128, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 222 case 144: kernel<144, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 223 case 192: kernel<192, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 224 case 200: kernel<200, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 225 case 256: kernel<256, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 226 case 288: kernel<288, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 227 case 432: kernel<432, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 228 case 500: kernel<500, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 229 case 512: kernel<512, __VA_ARGS__><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break; \ 230 default: errorQuda("%s block size %d not instantiated", #kernel, tp.block.x); \