Go to the documentation of this file. 1 #define LAUNCH_KERNEL(kernel, tp, stream, arg, ...) \ 2 switch (tp.block.x) { \ 4 kernel<32,__VA_ARGS__> \ 5 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 8 kernel<64,__VA_ARGS__> \ 9 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 12 kernel<96,__VA_ARGS__> \ 13 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 16 kernel<128,__VA_ARGS__> \ 17 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 20 kernel<160,__VA_ARGS__> \ 21 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 24 kernel<192,__VA_ARGS__> \ 25 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 28 kernel<224,__VA_ARGS__> \ 29 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 32 kernel<256,__VA_ARGS__> \ 33 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 36 kernel<288,__VA_ARGS__> \ 37 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 40 kernel<320,__VA_ARGS__> \ 41 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 44 kernel<352,__VA_ARGS__> \ 45 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 48 kernel<384,__VA_ARGS__> \ 49 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 52 kernel<416,__VA_ARGS__> \ 53 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 56 kernel<448,__VA_ARGS__> \ 57 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 60 kernel<480,__VA_ARGS__> \ 61 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 64 kernel<512,__VA_ARGS__> \ 65 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 68 kernel<544,__VA_ARGS__> \ 69 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 72 kernel<576,__VA_ARGS__> \ 73 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 76 kernel<608,__VA_ARGS__> \ 77 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 80 kernel<640,__VA_ARGS__> \ 81 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 84 kernel<672,__VA_ARGS__> \ 85 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 88 kernel<704,__VA_ARGS__> \ 89 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 92 kernel<736,__VA_ARGS__> \ 93 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 96 kernel<768,__VA_ARGS__> \ 97 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 100 kernel<800,__VA_ARGS__> \ 101 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 104 kernel<832,__VA_ARGS__> \ 105 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 108 kernel<864,__VA_ARGS__> \ 109 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 112 kernel<896,__VA_ARGS__> \ 113 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 116 kernel<928,__VA_ARGS__> \ 117 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 120 kernel<960,__VA_ARGS__> \ 121 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 124 kernel<992,__VA_ARGS__> \ 125 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 128 kernel<1024,__VA_ARGS__> \ 129 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 132 errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \ 135 #define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg, ...) \ 136 switch (tp.block.x) { \ 138 kernel<32,__VA_ARGS__> \ 139 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 142 kernel<64,__VA_ARGS__> \ 143 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 146 kernel<96,__VA_ARGS__> \ 147 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 150 kernel<128,__VA_ARGS__> \ 151 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 154 kernel<160,__VA_ARGS__> \ 155 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 158 kernel<192,__VA_ARGS__> \ 159 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 162 kernel<224,__VA_ARGS__> \ 163 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 166 kernel<256,__VA_ARGS__> \ 167 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 170 kernel<288,__VA_ARGS__> \ 171 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 174 kernel<320,__VA_ARGS__> \ 175 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 178 kernel<352,__VA_ARGS__> \ 179 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 182 kernel<384,__VA_ARGS__> \ 183 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 186 kernel<416,__VA_ARGS__> \ 187 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 190 kernel<448,__VA_ARGS__> \ 191 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 194 kernel<480,__VA_ARGS__> \ 195 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 198 kernel<512,__VA_ARGS__> \ 199 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg); \ 202 errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \