// Spinor-field load/store macros for the dslash CUDA kernels.
//
// Conventions visible in this file:
//  * READ_* macros declare local registers I0..I11 (full spinor) or I0..I5 /
//    I6..I11 (upper/lower half) or accum0..accum11, loaded from a strided
//    structure-of-arrays layout: component c of site sp_idx lives at
//    spinor[sp_idx + c*stride].
//  * Half precision (short4) carries a separate float norm array per site.
//    The trailing-underscore variants (READ_SPINOR_HALF_ etc.) exist so that
//    `spinor_ ## Norm` token-pastes against the caller's literal argument
//    name; the non-underscore wrapper forces one extra macro expansion of
//    that argument first.
//  * _TEX variants fetch through textures (fetch_double2 / TEX1DFETCH).
//    When USE_TEXTURE_OBJECTS is defined, the ghost-zone variants index a
//    per-direction texture object with [dir]; otherwise a single bound
//    texture is used and `dir` is ignored.
//  * WRITE_* macros assume the output registers o00_re..o32_im (or
//    o1_*/o2_* in the two-flavor case), `sid`, and `param` (with .out,
//    .outNorm, .sp_stride, .fl_stride) are in scope at the expansion site.

// ---------------------------------------------------------------------------
// Direct (pointer) spinor loads
// ---------------------------------------------------------------------------

#define READ_SPINOR_DOUBLE(spinor_, stride, sp_idx, norm_idx) \
  double2 *spinor = (double2*)spinor_; \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)]; \
  double2 I6 = spinor[sp_idx + 6*(stride)]; \
  double2 I7 = spinor[sp_idx + 7*(stride)]; \
  double2 I8 = spinor[sp_idx + 8*(stride)]; \
  double2 I9 = spinor[sp_idx + 9*(stride)]; \
  double2 I10 = spinor[sp_idx + 10*(stride)]; \
  double2 I11 = spinor[sp_idx + 11*(stride)];

#define READ_SPINOR_GHOST_DOUBLE(spinor_, stride, sp_idx, norm_idx, dir) \
  double2 *spinor = (double2*)spinor_[dir]; \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)];

#define READ_SPINOR_DOUBLE_UP(spinor_, stride, sp_idx, norm_idx) \
  double2 *spinor = (double2*)spinor_; \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)];

#define READ_SPINOR_DOUBLE_DOWN(spinor_, stride, sp_idx, norm_idx) \
  double2 *spinor = (double2*)spinor_; \
  double2 I6 = spinor[sp_idx + 6*(stride)]; \
  double2 I7 = spinor[sp_idx + 7*(stride)]; \
  double2 I8 = spinor[sp_idx + 8*(stride)]; \
  double2 I9 = spinor[sp_idx + 9*(stride)]; \
  double2 I10 = spinor[sp_idx + 10*(stride)]; \
  double2 I11 = spinor[sp_idx + 11*(stride)];

#define READ_SPINOR_SINGLE(spinor_, stride, sp_idx, norm_idx) \
  float4 *spinor = (float4*)spinor_; \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)]; \
  float4 I3 = spinor[sp_idx + 3*(stride)]; \
  float4 I4 = spinor[sp_idx + 4*(stride)]; \
  float4 I5 = spinor[sp_idx + 5*(stride)];

#define READ_SPINOR_GHOST_SINGLE(spinor_, stride, sp_idx, norm_idx, dir) \
  float4 *spinor = (float4*)spinor_[dir]; \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)];

#define READ_SPINOR_SINGLE_UP(spinor_, stride, sp_idx, norm_idx) \
  float4 *spinor = (float4*)spinor_; \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)];

#define READ_SPINOR_SINGLE_DOWN(spinor_, stride, sp_idx, norm_idx) \
  float4 *spinor = (float4*)spinor_; \
  float4 I3 = spinor[sp_idx + 3*(stride)]; \
  float4 I4 = spinor[sp_idx + 4*(stride)]; \
  float4 I5 = spinor[sp_idx + 5*(stride)];

// Half precision: convert short4 -> float4 and rescale by the per-site norm.
#define READ_SPINOR_HALF_(spinor_, stride, sp_idx, norm_idx) \
  short4 *spinor = (short4*)spinor_; \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
  float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
  float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
  float C = (spinor_ ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_GHOST_HALF_(spinor_, stride, sp_idx, norm_idx, dir) \
  short4 *spinor = (short4*)spinor_[dir]; \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float C = (spinor_ ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_GHOST_HALF(spinor, stride, sp_idx, norm_idx, dir) \
  READ_SPINOR_GHOST_HALF_(spinor, stride, sp_idx, norm_idx, dir)

#define READ_SPINOR_HALF_UP_(spinor_, stride, sp_idx, norm_idx) \
  short4 *spinor = (short4*)spinor_; \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float C = (spinor_ ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_DOWN_(spinor_, stride, sp_idx, norm_idx) \
  short4 *spinor = (short4*)spinor_; \
  float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
  float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
  float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
  float C = (spinor_ ## Norm)[norm_idx]; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)

// Accumulator loads (the xpay/axpy input), indexed by the in-scope `sid`.
#define READ_ACCUM_DOUBLE(spinor_, stride) \
  double2 *spinor = (double2*)spinor_; \
  double2 accum0 = spinor[sid + 0*(stride)]; \
  double2 accum1 = spinor[sid + 1*(stride)]; \
  double2 accum2 = spinor[sid + 2*(stride)]; \
  double2 accum3 = spinor[sid + 3*(stride)]; \
  double2 accum4 = spinor[sid + 4*(stride)]; \
  double2 accum5 = spinor[sid + 5*(stride)]; \
  double2 accum6 = spinor[sid + 6*(stride)]; \
  double2 accum7 = spinor[sid + 7*(stride)]; \
  double2 accum8 = spinor[sid + 8*(stride)]; \
  double2 accum9 = spinor[sid + 9*(stride)]; \
  double2 accum10 = spinor[sid + 10*(stride)]; \
  double2 accum11 = spinor[sid + 11*(stride)];

#define READ_ACCUM_SINGLE(spinor_, stride) \
  float4 *spinor = (float4*)spinor_; \
  float4 accum0 = spinor[sid + 0*(stride)]; \
  float4 accum1 = spinor[sid + 1*(stride)]; \
  float4 accum2 = spinor[sid + 2*(stride)]; \
  float4 accum3 = spinor[sid + 3*(stride)]; \
  float4 accum4 = spinor[sid + 4*(stride)]; \
  float4 accum5 = spinor[sid + 5*(stride)];

#define READ_ACCUM_HALF_(spinor_, stride) \
  short4 *spinor = (short4*)spinor_; \
  float4 accum0 = short42float4(spinor[sid + 0*(stride)]); \
  float4 accum1 = short42float4(spinor[sid + 1*(stride)]); \
  float4 accum2 = short42float4(spinor[sid + 2*(stride)]); \
  float4 accum3 = short42float4(spinor[sid + 3*(stride)]); \
  float4 accum4 = short42float4(spinor[sid + 4*(stride)]); \
  float4 accum5 = short42float4(spinor[sid + 5*(stride)]); \
  float C = (spinor_ ## Norm)[sid]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

#define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)

// ---------------------------------------------------------------------------
// Texture-fetch spinor loads
// ---------------------------------------------------------------------------

#define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

#ifdef USE_TEXTURE_OBJECTS
#define READ_SPINOR_GHOST_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
  double2 I0 = fetch_double2((spinor)[dir], sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor)[dir], sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor)[dir], sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor)[dir], sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor)[dir], sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor)[dir], sp_idx + 5*(stride));
#else
#define READ_SPINOR_GHOST_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));
#endif

#define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));

#define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

#define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 accum11 = fetch_double2((spinor), sid + 11*(stride));

#define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));

#ifdef USE_TEXTURE_OBJECTS
#define READ_SPINOR_GHOST_SINGLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
  float4 I0 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 2*(stride));
#else
#define READ_SPINOR_GHOST_SINGLE_TEX(spinor, stride, sp_idx, norm_idx, dir) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride));
#endif

#define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride));

#define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));

#define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx)

#ifdef USE_TEXTURE_OBJECTS
#define READ_SPINOR_GHOST_HALF_TEX_(spinor, stride, sp_idx, norm_idx, dir) \
  float4 I0 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor)[dir], sp_idx + 2*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm)[dir], norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
#else
#define READ_SPINOR_GHOST_HALF_TEX_(spinor, stride, sp_idx, norm_idx, dir) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
#endif

#define READ_SPINOR_GHOST_HALF_TEX(spinor, stride, sp_idx, norm_idx, dir) \
  READ_SPINOR_GHOST_HALF_TEX_(spinor, stride, sp_idx, norm_idx, dir)

#define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx); \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_ACCUM_SINGLE_TEX(spinor, stride) \
  float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride));

#define READ_ACCUM_HALF_TEX_(spinor, stride) \
  float4 accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \
  float4 accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \
  float4 accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \
  float4 accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \
  float4 accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \
  float4 accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

#define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)

// ---------------------------------------------------------------------------
// Spinor stores
// ---------------------------------------------------------------------------

#define WRITE_SPINOR_DOUBLE2(stride) \
  double2 *out = (double2*)param.out; \
  out[0*(stride)+sid] = make_double2(o00_re, o00_im); \
  out[1*(stride)+sid] = make_double2(o01_re, o01_im); \
  out[2*(stride)+sid] = make_double2(o02_re, o02_im); \
  out[3*(stride)+sid] = make_double2(o10_re, o10_im); \
  out[4*(stride)+sid] = make_double2(o11_re, o11_im); \
  out[5*(stride)+sid] = make_double2(o12_re, o12_im); \
  out[6*(stride)+sid] = make_double2(o20_re, o20_im); \
  out[7*(stride)+sid] = make_double2(o21_re, o21_im); \
  out[8*(stride)+sid] = make_double2(o22_re, o22_im); \
  out[9*(stride)+sid] = make_double2(o30_re, o30_im); \
  out[10*(stride)+sid] = make_double2(o31_re, o31_im); \
  out[11*(stride)+sid] = make_double2(o32_re, o32_im);

#define WRITE_SPINOR_FLOAT4(stride) \
  float4 *out = (float4*)param.out; \
  out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
  out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
  out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
  out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
  out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
  out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im); \

// Half-precision store: reduce the max |component| over the whole spinor
// (tree reduction over c0..c11), store it as the site norm, rescale all
// components to short range, then pack.  The crossed pairing in c1/c2
// (o01_re with o02_im, o02_re with o01_im) is harmless: all 12 partial
// maxima feed the same global max.
#define WRITE_SPINOR_SHORT4(stride) \
  short4 *out = (short4*)param.out; \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  param.outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  out[sid+0*(stride)] = make_short4(f2i(o00_re), f2i(o00_im), f2i(o01_re), f2i(o01_im)); \
  out[sid+1*(stride)] = make_short4(f2i(o02_re), f2i(o02_im), f2i(o10_re), f2i(o10_im)); \
  out[sid+2*(stride)] = make_short4(f2i(o11_re), f2i(o11_im), f2i(o12_re), f2i(o12_im)); \
  out[sid+3*(stride)] = make_short4(f2i(o20_re), f2i(o20_im), f2i(o21_re), f2i(o21_im)); \
  out[sid+4*(stride)] = make_short4(f2i(o22_re), f2i(o22_im), f2i(o30_re), f2i(o30_im)); \
  out[sid+5*(stride)] = make_short4(f2i(o31_re), f2i(o31_im), f2i(o32_re), f2i(o32_im));

// Streaming (non-temporal) store variants.
#define WRITE_SPINOR_DOUBLE2_STR(stride) \
  double2 *out = (double2*)param.out; \
  store_streaming_double2(&out[0*(stride)+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*(stride)+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*(stride)+sid], o02_re, o02_im); \
  store_streaming_double2(&out[3*(stride)+sid], o10_re, o10_im); \
  store_streaming_double2(&out[4*(stride)+sid], o11_re, o11_im); \
  store_streaming_double2(&out[5*(stride)+sid], o12_re, o12_im); \
  store_streaming_double2(&out[6*(stride)+sid], o20_re, o20_im); \
  store_streaming_double2(&out[7*(stride)+sid], o21_re, o21_im); \
  store_streaming_double2(&out[8*(stride)+sid], o22_re, o22_im); \
  store_streaming_double2(&out[9*(stride)+sid], o30_re, o30_im); \
  store_streaming_double2(&out[10*(stride)+sid], o31_re, o31_im); \
  store_streaming_double2(&out[11*(stride)+sid], o32_re, o32_im);

#define WRITE_SPINOR_FLOAT4_STR(stride) \
  float4 *out = (float4*)param.out; \
  store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
  store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
  store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
  store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
  store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
  store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);

// Same norm/rescale scheme as WRITE_SPINOR_SHORT4, streaming stores.
#define WRITE_SPINOR_SHORT4_STR(stride) \
  short4 *out = (short4*)param.out; \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  param.outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  store_streaming_short4(&out[0*(stride)+sid], f2i(o00_re), f2i(o00_im), f2i(o01_re), f2i(o01_im)); \
  store_streaming_short4(&out[1*(stride)+sid], f2i(o02_re), f2i(o02_im), f2i(o10_re), f2i(o10_im)); \
  store_streaming_short4(&out[2*(stride)+sid], f2i(o11_re), f2i(o11_im), f2i(o12_re), f2i(o12_im)); \
  store_streaming_short4(&out[3*(stride)+sid], f2i(o20_re), f2i(o20_im), f2i(o21_re), f2i(o21_im)); \
  store_streaming_short4(&out[4*(stride)+sid], f2i(o22_re), f2i(o22_im), f2i(o30_re), f2i(o30_im)); \
  store_streaming_short4(&out[5*(stride)+sid], f2i(o31_re), f2i(o31_im), f2i(o32_re), f2i(o32_im));

// ---------------------------------------------------------------------------
// Half-spinor (6-component) I/O, used for the projected hopping terms.
// These assume `out` (and the a0..b2 registers) are already in scope.
// ---------------------------------------------------------------------------

#define READ_HALF_SPINOR READ_SPINOR_UP

#define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
  out[0*(stride)+sid] = make_double2(a0_re, a0_im); \
  out[1*(stride)+sid] = make_double2(a1_re, a1_im); \
  out[2*(stride)+sid] = make_double2(a2_re, a2_im); \
  out[3*(stride)+sid] = make_double2(b0_re, b0_im); \
  out[4*(stride)+sid] = make_double2(b1_re, b1_im); \
  out[5*(stride)+sid] = make_double2(b2_re, b2_im);

#define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
  out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im); \
  out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im); \
  out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);

// NOTE(review): unlike the full-spinor SHORT4 writers, no store of the
// computed norm c0 is visible here (one source line appears to have been
// lost between the reduction and the scale) -- confirm against the
// original file that the norm-array store (e.g. outNorm[sid] = c0) is
// present before relying on half-precision readback of this field.
#define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
  float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
  float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
  float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
  float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
  float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
  float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  float scale = __fdividef(MAX_SHORT, c0); \
  a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
  a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
  b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
  out[sid+0*(stride)] = make_short4(f2i(a0_re), f2i(a0_im), f2i(a1_re), f2i(a1_im)); \
  out[sid+1*(stride)] = make_short4(f2i(a2_re), f2i(a2_im), f2i(b0_re), f2i(b0_im)); \
  out[sid+2*(stride)] = make_short4(f2i(b1_re), f2i(b1_im), f2i(b2_re), f2i(b2_im));

// ---------------------------------------------------------------------------
// Two-flavor (twisted-mass doublet) stores: flavor 2 is offset by
// param.fl_stride from flavor 1 in both the field and the norm array.
// ---------------------------------------------------------------------------

#define WRITE_FLAVOR_SPINOR_DOUBLE2() \
  double2 *out = (double2*)param.out; \
  out[0*(param.sp_stride)+sid] = make_double2(o1_00_re, o1_00_im); \
  out[1*(param.sp_stride)+sid] = make_double2(o1_01_re, o1_01_im); \
  out[2*(param.sp_stride)+sid] = make_double2(o1_02_re, o1_02_im); \
  out[3*(param.sp_stride)+sid] = make_double2(o1_10_re, o1_10_im); \
  out[4*(param.sp_stride)+sid] = make_double2(o1_11_re, o1_11_im); \
  out[5*(param.sp_stride)+sid] = make_double2(o1_12_re, o1_12_im); \
  out[6*(param.sp_stride)+sid] = make_double2(o1_20_re, o1_20_im); \
  out[7*(param.sp_stride)+sid] = make_double2(o1_21_re, o1_21_im); \
  out[8*(param.sp_stride)+sid] = make_double2(o1_22_re, o1_22_im); \
  out[9*(param.sp_stride)+sid] = make_double2(o1_30_re, o1_30_im); \
  out[10*(param.sp_stride)+sid] = make_double2(o1_31_re, o1_31_im); \
  out[11*(param.sp_stride)+sid] = make_double2(o1_32_re, o1_32_im); \
  out[0*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_00_re, o2_00_im); \
  out[1*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_01_re, o2_01_im); \
  out[2*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_02_re, o2_02_im); \
  out[3*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_10_re, o2_10_im); \
  out[4*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_11_re, o2_11_im); \
  out[5*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_12_re, o2_12_im); \
  out[6*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_20_re, o2_20_im); \
  out[7*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_21_re, o2_21_im); \
  out[8*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_22_re, o2_22_im); \
  out[9*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_30_re, o2_30_im); \
  out[10*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_31_re, o2_31_im); \
  out[11*(param.sp_stride)+sid+param.fl_stride] = make_double2(o2_32_re, o2_32_im);

#define WRITE_FLAVOR_SPINOR_FLOAT4() \
  float4 *out = (float4*)param.out; \
  out[0*(param.sp_stride)+sid] = make_float4(o1_00_re, o1_00_im, o1_01_re, o1_01_im); \
  out[1*(param.sp_stride)+sid] = make_float4(o1_02_re, o1_02_im, o1_10_re, o1_10_im); \
  out[2*(param.sp_stride)+sid] = make_float4(o1_11_re, o1_11_im, o1_12_re, o1_12_im); \
  out[3*(param.sp_stride)+sid] = make_float4(o1_20_re, o1_20_im, o1_21_re, o1_21_im); \
  out[4*(param.sp_stride)+sid] = make_float4(o1_22_re, o1_22_im, o1_30_re, o1_30_im); \
  out[5*(param.sp_stride)+sid] = make_float4(o1_31_re, o1_31_im, o1_32_re, o1_32_im); \
  out[0*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_00_re, o2_00_im, o2_01_re, o2_01_im); \
  out[1*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_02_re, o2_02_im, o2_10_re, o2_10_im); \
  out[2*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_11_re, o2_11_im, o2_12_re, o2_12_im); \
  out[3*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_20_re, o2_20_im, o2_21_re, o2_21_im); \
  out[4*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_22_re, o2_22_im, o2_30_re, o2_30_im); \
  out[5*(param.sp_stride)+sid+param.fl_stride] = make_float4(o2_31_re, o2_31_im, o2_32_re, o2_32_im);

// Each flavor gets its own independent norm and rescale (same reduction
// scheme as WRITE_SPINOR_SHORT4, applied twice).
#define WRITE_FLAVOR_SPINOR_SHORT4() \
  short4 *out = (short4*)param.out; \
  float c0 = fmaxf(fabsf(o1_00_re), fabsf(o1_00_im)); \
  float c1 = fmaxf(fabsf(o1_01_re), fabsf(o1_02_im)); \
  float c2 = fmaxf(fabsf(o1_02_re), fabsf(o1_01_im)); \
  float c3 = fmaxf(fabsf(o1_10_re), fabsf(o1_10_im)); \
  float c4 = fmaxf(fabsf(o1_11_re), fabsf(o1_11_im)); \
  float c5 = fmaxf(fabsf(o1_12_re), fabsf(o1_12_im)); \
  float c6 = fmaxf(fabsf(o1_20_re), fabsf(o1_20_im)); \
  float c7 = fmaxf(fabsf(o1_21_re), fabsf(o1_21_im)); \
  float c8 = fmaxf(fabsf(o1_22_re), fabsf(o1_22_im)); \
  float c9 = fmaxf(fabsf(o1_30_re), fabsf(o1_30_im)); \
  float c10 = fmaxf(fabsf(o1_31_re), fabsf(o1_31_im)); \
  float c11 = fmaxf(fabsf(o1_32_re), fabsf(o1_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  param.outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o1_00_re *= scale; o1_00_im *= scale; o1_01_re *= scale; o1_01_im *= scale; \
  o1_02_re *= scale; o1_02_im *= scale; o1_10_re *= scale; o1_10_im *= scale; \
  o1_11_re *= scale; o1_11_im *= scale; o1_12_re *= scale; o1_12_im *= scale; \
  o1_20_re *= scale; o1_20_im *= scale; o1_21_re *= scale; o1_21_im *= scale; \
  o1_22_re *= scale; o1_22_im *= scale; o1_30_re *= scale; o1_30_im *= scale; \
  o1_31_re *= scale; o1_31_im *= scale; o1_32_re *= scale; o1_32_im *= scale; \
  out[sid+0*(param.sp_stride)] = make_short4(f2i(o1_00_re), f2i(o1_00_im), f2i(o1_01_re), f2i(o1_01_im)); \
  out[sid+1*(param.sp_stride)] = make_short4(f2i(o1_02_re), f2i(o1_02_im), f2i(o1_10_re), f2i(o1_10_im)); \
  out[sid+2*(param.sp_stride)] = make_short4(f2i(o1_11_re), f2i(o1_11_im), f2i(o1_12_re), f2i(o1_12_im)); \
  out[sid+3*(param.sp_stride)] = make_short4(f2i(o1_20_re), f2i(o1_20_im), f2i(o1_21_re), f2i(o1_21_im)); \
  out[sid+4*(param.sp_stride)] = make_short4(f2i(o1_22_re), f2i(o1_22_im), f2i(o1_30_re), f2i(o1_30_im)); \
  out[sid+5*(param.sp_stride)] = make_short4(f2i(o1_31_re), f2i(o1_31_im), f2i(o1_32_re), f2i(o1_32_im)); \
  c0 = fmaxf(fabsf(o2_00_re), fabsf(o2_00_im)); \
  c1 = fmaxf(fabsf(o2_01_re), fabsf(o2_02_im)); \
  c2 = fmaxf(fabsf(o2_02_re), fabsf(o2_01_im)); \
  c3 = fmaxf(fabsf(o2_10_re), fabsf(o2_10_im)); \
  c4 = fmaxf(fabsf(o2_11_re), fabsf(o2_11_im)); \
  c5 = fmaxf(fabsf(o2_12_re), fabsf(o2_12_im)); \
  c6 = fmaxf(fabsf(o2_20_re), fabsf(o2_20_im)); \
  c7 = fmaxf(fabsf(o2_21_re), fabsf(o2_21_im)); \
  c8 = fmaxf(fabsf(o2_22_re), fabsf(o2_22_im)); \
  c9 = fmaxf(fabsf(o2_30_re), fabsf(o2_30_im)); \
  c10 = fmaxf(fabsf(o2_31_re), fabsf(o2_31_im)); \
  c11 = fmaxf(fabsf(o2_32_re), fabsf(o2_32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  param.outNorm[sid+param.fl_stride] = c0; \
  scale = __fdividef(MAX_SHORT, c0); \
  o2_00_re *= scale; o2_00_im *= scale; o2_01_re *= scale; o2_01_im *= scale; \
  o2_02_re *= scale; o2_02_im *= scale; o2_10_re *= scale; o2_10_im *= scale; \
  o2_11_re *= scale; o2_11_im *= scale; o2_12_re *= scale; o2_12_im *= scale; \
  o2_20_re *= scale; o2_20_im *= scale; o2_21_re *= scale; o2_21_im *= scale; \
  o2_22_re *= scale; o2_22_im *= scale; o2_30_re *= scale; o2_30_im *= scale; \
  o2_31_re *= scale; o2_31_im *= scale; o2_32_re *= scale; o2_32_im *= scale; \
  out[sid+param.fl_stride+0*(param.sp_stride)] = make_short4(f2i(o2_00_re), f2i(o2_00_im), f2i(o2_01_re), f2i(o2_01_im)); \
  out[sid+param.fl_stride+1*(param.sp_stride)] = make_short4(f2i(o2_02_re), f2i(o2_02_im), f2i(o2_10_re), f2i(o2_10_im)); \
  out[sid+param.fl_stride+2*(param.sp_stride)] = make_short4(f2i(o2_11_re), f2i(o2_11_im), f2i(o2_12_re), f2i(o2_12_im)); \
  out[sid+param.fl_stride+3*(param.sp_stride)] = make_short4(f2i(o2_20_re), f2i(o2_20_im), f2i(o2_21_re), f2i(o2_21_im)); \
  out[sid+param.fl_stride+4*(param.sp_stride)] = make_short4(f2i(o2_22_re), f2i(o2_22_im), f2i(o2_30_re), f2i(o2_30_im)); \
  out[sid+param.fl_stride+5*(param.sp_stride)] = make_short4(f2i(o2_31_re), f2i(o2_31_im), f2i(o2_32_re), f2i(o2_32_im));

// ---------------------------------------------------------------------------
// Staggered (3-component) neighbour loads.  The KS_NBR variants take the
// destination register prefix T and declare nothing themselves.
// ---------------------------------------------------------------------------

#define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
  double2 I0 = fetch_double2((spinor), idx + 0*(mystride)); \
  double2 I1 = fetch_double2((spinor), idx + 1*(mystride)); \
  double2 I2 = fetch_double2((spinor), idx + 2*(mystride));

#ifdef USE_TEXTURE_OBJECTS
#define READ_1ST_NBR_SPINOR_GHOST_DOUBLE_TEX(spinor, idx, mystride, dir) \
  double2 I0 = fetch_double2((spinor)[dir], idx + 0*(mystride)); \
  double2 I1 = fetch_double2((spinor)[dir], idx + 1*(mystride)); \
  double2 I2 = fetch_double2((spinor)[dir], idx + 2*(mystride));
#else
#define READ_1ST_NBR_SPINOR_GHOST_DOUBLE_TEX(spinor, idx, mystride, dir) \
  double2 I0 = fetch_double2((spinor), idx + 0*(mystride)); \
  double2 I1 = fetch_double2((spinor), idx + 1*(mystride)); \
  double2 I2 = fetch_double2((spinor), idx + 2*(mystride));
#endif

#define READ_KS_NBR_SPINOR_DOUBLE_TEX(T, spinor, idx, mystride) \
  T##0 = fetch_double2((spinor), idx + 0*(mystride)); \
  T##1 = fetch_double2((spinor), idx + 1*(mystride)); \
  T##2 = fetch_double2((spinor), idx + 2*(mystride));

#ifdef USE_TEXTURE_OBJECTS
#define READ_KS_NBR_SPINOR_GHOST_DOUBLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = fetch_double2((spinor)[dir], idx + 0*(mystride)); \
  T##1 = fetch_double2((spinor)[dir], idx + 1*(mystride)); \
  T##2 = fetch_double2((spinor)[dir], idx + 2*(mystride));
#else
#define READ_KS_NBR_SPINOR_GHOST_DOUBLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = fetch_double2((spinor), idx + 0*(mystride)); \
  T##1 = fetch_double2((spinor), idx + 1*(mystride)); \
  T##2 = fetch_double2((spinor), idx + 2*(mystride));
#endif

#define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*(mystride)); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*(mystride)); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*(mystride));

#ifdef USE_TEXTURE_OBJECTS
#define READ_1ST_NBR_SPINOR_GHOST_SINGLE_TEX(spinor, idx, mystride, dir) \
  float2 I0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*(mystride)); \
  float2 I1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*(mystride)); \
  float2 I2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*(mystride));
#else
#define READ_1ST_NBR_SPINOR_GHOST_SINGLE_TEX(spinor, idx, mystride, dir) \
  float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*(mystride)); \
  float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*(mystride)); \
  float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*(mystride));
#endif

#define READ_KS_NBR_SPINOR_SINGLE_TEX(T, spinor, idx, mystride) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*(mystride)); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*(mystride)); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*(mystride));

#ifdef USE_TEXTURE_OBJECTS
#define READ_KS_NBR_SPINOR_GHOST_SINGLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*(mystride)); \
  T##1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*(mystride)); \
  T##2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*(mystride));
#else
#define READ_KS_NBR_SPINOR_GHOST_SINGLE_TEX(T, spinor, idx, mystride, dir) \
  T##0 = TEX1DFETCH(float2, (spinor), idx + 0*(mystride)); \
  T##1 = TEX1DFETCH(float2, (spinor), idx + 1*(mystride)); \
  T##2 = TEX1DFETCH(float2, (spinor), idx + 2*(mystride));
#endif
READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) \ 714 float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \ 715 float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \ 716 float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \ 718 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \ 719 I0.x *= C; I0.y *= C; \ 720 I1.x *= C; I1.y *= C; \ 721 I2.x *= C; I2.y *= C;} 723 #ifdef USE_TEXTURE_OBJECTS 724 #define READ_1ST_NBR_SPINOR_GHOST_HALF_TEX_(spinor, idx, mystride, dir) \ 725 float2 I0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*mystride); \ 726 float2 I1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*mystride); \ 727 float2 I2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*mystride); \ 729 float C = TEX1DFETCH(float, (spinor ## Norm)[dir], norm_idx1); \ 730 I0.x *= C; I0.y *= C; \ 731 I1.x *= C; I1.y *= C; \ 732 I2.x *= C; I2.y *= C;} 734 #define READ_1ST_NBR_SPINOR_GHOST_HALF_TEX_(spinor, idx, mystride, dir) \ 735 float2 I0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \ 736 float2 I1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \ 737 float2 I2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \ 739 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx1); \ 740 I0.x *= C; I0.y *= C; \ 741 I1.x *= C; I1.y *= C; \ 742 I2.x *= C; I2.y *= C;} 745 #define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \ 746 READ_1ST_NBR_SPINOR_HALF_TEX_(spinor, idx, mystride) 748 #define READ_1ST_NBR_SPINOR_GHOST_HALF_TEX(spinor, idx, mystride, dir) \ 749 READ_1ST_NBR_SPINOR_GHOST_HALF_TEX_(spinor, idx, mystride, dir) 751 #define READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride) \ 752 T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \ 753 T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \ 754 T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \ 756 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \ 757 (T##0).x *= C; (T##0).y *= C; \ 758 (T##1).x *= C; (T##1).y *= C; \ 759 (T##2).x *= C; (T##2).y *= C;} 761 #ifdef 
USE_TEXTURE_OBJECTS 762 #define READ_KS_NBR_SPINOR_GHOST_HALF_TEX_(T, spinor, idx, mystride, dir) \ 763 T##0 = TEX1DFETCH(float2, (spinor)[dir], idx + 0*mystride); \ 764 T##1 = TEX1DFETCH(float2, (spinor)[dir], idx + 1*mystride); \ 765 T##2 = TEX1DFETCH(float2, (spinor)[dir], idx + 2*mystride); \ 767 float C = TEX1DFETCH(float, (spinor ## Norm)[dir], norm_idx3); \ 768 (T##0).x *= C; (T##0).y *= C; \ 769 (T##1).x *= C; (T##1).y *= C; \ 770 (T##2).x *= C; (T##2).y *= C;} 772 #define READ_KS_NBR_SPINOR_GHOST_HALF_TEX_(T, spinor, idx, mystride, dir) \ 773 T##0 = TEX1DFETCH(float2, (spinor), idx + 0*mystride); \ 774 T##1 = TEX1DFETCH(float2, (spinor), idx + 1*mystride); \ 775 T##2 = TEX1DFETCH(float2, (spinor), idx + 2*mystride); \ 777 float C = TEX1DFETCH(float, (spinor ## Norm), norm_idx3); \ 778 (T##0).x *= C; (T##0).y *= C; \ 779 (T##1).x *= C; (T##1).y *= C; \ 780 (T##2).x *= C; (T##2).y *= C;} 783 #define READ_KS_NBR_SPINOR_HALF_TEX(T, spinor, idx, mystride) \ 784 READ_KS_NBR_SPINOR_HALF_TEX_(T, spinor, idx, mystride) 786 #define READ_KS_NBR_SPINOR_GHOST_HALF_TEX(T, spinor, idx, mystride, dir) \ 787 READ_KS_NBR_SPINOR_GHOST_HALF_TEX_(T, spinor, idx, mystride, dir) 789 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \ 790 double2 I0 = spinor[idx + 0*mystride]; \ 791 double2 I1 = spinor[idx + 1*mystride]; \ 792 double2 I2 = spinor[idx + 2*mystride]; 794 #define READ_1ST_NBR_SPINOR_GHOST_DOUBLE(spinor, idx, mystride, dir) \ 795 double2 I0 = spinor[dir][idx + 0*mystride]; \ 796 double2 I1 = spinor[dir][idx + 1*mystride]; \ 797 double2 I2 = spinor[dir][idx + 2*mystride]; 799 #define READ_KS_NBR_SPINOR_DOUBLE(T, spinor, idx, mystride) \ 800 T##0 = spinor[idx + 0*mystride]; \ 801 T##1 = spinor[idx + 1*mystride]; \ 802 T##2 = spinor[idx + 2*mystride]; 804 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \ 805 float2 I0 = spinor[idx + 0*mystride]; \ 806 float2 I1 = spinor[idx + 1*mystride]; \ 807 float2 I2 = spinor[idx + 2*mystride]; 809 #define 
READ_1ST_NBR_SPINOR_GHOST_SINGLE(spinor, idx, mystride, dir) \ 810 float2 I0 = spinor[dir][idx + 0*mystride]; \ 811 float2 I1 = spinor[dir][idx + 1*mystride]; \ 812 float2 I2 = spinor[dir][idx + 2*mystride]; 814 #define READ_KS_NBR_SPINOR_SINGLE(T, spinor, idx, mystride) \ 815 T##0 = spinor[idx + 0*mystride]; \ 816 T##1 = spinor[idx + 1*mystride]; \ 817 T##2 = spinor[idx + 2*mystride]; 819 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \ 822 short2 S0 = in[idx + 0*mystride]; \ 823 short2 S1 = in[idx + 1*mystride]; \ 824 short2 S2 = in[idx + 2*mystride]; \ 825 float C = inNorm[idx]; \ 826 I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \ 827 I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \ 828 I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \ 831 #define READ_KS_NBR_SPINOR_HALF(T, spinor, idx, mystride) \ 833 short2 S0 = in[idx + 0*mystride]; \ 834 short2 S1 = in[idx + 1*mystride]; \ 835 short2 S2 = in[idx + 2*mystride]; \ 836 float C = inNorm[idx]; \ 837 (T##0).x =C*short2float(S0.x); (T##0).y =C*short2float(S0.y); \ 838 (T##1).x =C*short2float(S1.x); (T##1).y =C*short2float(S1.y); \ 839 (T##2).x =C*short2float(S2.x); (T##2).y =C*short2float(S2.y); \ 843 #define WRITE_ST_SPINOR_DOUBLE2(out, sid, mystride) \ 844 ((double2*)out)[0*mystride+sid] = make_double2(o00_re, o00_im); \ 845 ((double2*)out)[1*mystride+sid] = make_double2(o01_re, o01_im); \ 846 ((double2*)out)[2*mystride+sid] = make_double2(o02_re, o02_im); 848 #define WRITE_ST_SPINOR_FLOAT2(out, sid, mystride) \ 849 ((float2*)out)[0*mystride+sid] = make_float2(o00_re, o00_im); \ 850 ((float2*)out)[1*mystride+sid] = make_float2(o01_re, o01_im); \ 851 ((float2*)out)[2*mystride+sid] = make_float2(o02_re, o02_im); 853 #define WRITE_ST_SPINOR_SHORT2(out, sid, mystride) \ 854 float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \ 855 float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \ 856 float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \ 857 c0 = fmaxf(c0, c1); \ 858 c0 = fmaxf(c0, c2); \ 859 
out ## Norm[sid] = c0; \ 860 float scale = __fdividef(MAX_SHORT, c0); \ 861 o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \ 862 o02_re *= scale; o02_im *= scale; \ 863 ((short2*)out)[sid+0*mystride] = make_short2(f2i(o00_re), f2i(o00_im)); \ 864 ((short2*)out)[sid+1*mystride] = make_short2(f2i(o01_re), f2i(o01_im)); \ 865 ((short2*)out)[sid+2*mystride] = make_short2(f2i(o02_re), f2i(o02_im)); 868 #define WRITE_ST_SPINOR_DOUBLE2_STR(out, sid, mystride) \ 869 store_streaming_double2(&out[0*mystride+sid], o00_re, o00_im); \ 870 store_streaming_double2(&out[1*mystride+sid], o01_re, o01_im); \ 871 store_streaming_double2(&out[2*mystride+sid], o02_re, o02_im); 873 #define WRITE_ST_SPINOR_FLOAT2_STR(out, sid, mystride) \ 874 store_streaming_float2(&out[0*mystride+sid], o00_re, o00_im); \ 875 store_streaming_float2(&out[1*mystride+sid], o01_re, o01_im); \ 876 store_streaming_float2(&out[2*mystride+sid], o02_re, o02_im); 878 #define WRITE_ST_SPINOR_SHORT2_STR(out, sid, mystride) \ 879 float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \ 880 float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \ 881 float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \ 882 c0 = fmaxf(c0, c1); \ 883 c0 = fmaxf(c0, c2); \ 884 out ## Norm[sid] = c0; \ 885 float scale = __fdividef(MAX_SHORT, c0); \ 886 o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \ 887 o02_re *= scale; o02_im *= scale; \ 888 store_streaming_short2(&g_out[0*mystride+sid], f2i(o00_re), f2i(o00_im)); \ 889 store_streaming_short2(&g_out[1*mystride+sid], f2i(o01_re), f2i(o01_im)); \ 890 store_streaming_short2(&g_out[2*mystride+sid], f2i(o02_re), f2i(o02_im)); 892 #define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor,sid) { \ 893 double2 tmp0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \ 894 double2 tmp1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \ 895 double2 tmp2 = fetch_double2((spinor), sid + 2*(param.sp_stride)); \ 896 o00_re += tmp0.x; o00_im += tmp0.y; \ 897 o01_re += tmp1.x; 
o01_im += tmp1.y; \ 898 o02_re += tmp2.x; o02_im += tmp2.y; } 900 #define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor,sid) { \ 901 float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*(param.sp_stride)); \ 902 float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*(param.sp_stride)); \ 903 float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*(param.sp_stride)); \ 904 o00_re += tmp0.x; o00_im += tmp0.y; \ 905 o01_re += tmp1.x; o01_im += tmp1.y; \ 906 o02_re += tmp2.x; o02_im += tmp2.y; } 908 #define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid) { \ 909 float2 tmp0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \ 910 float2 tmp1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \ 911 float2 tmp2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \ 912 float C = TEX1DFETCH(float, (spinor##Norm), sid); \ 913 o00_re += C*tmp0.x; o00_im += C*tmp0.y; \ 914 o01_re += C*tmp1.x; o01_im += C*tmp1.y; \ 915 o02_re += C*tmp2.x; o02_im += C*tmp2.y; } 917 #define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor,sid) \ 918 READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor,sid) 920 #define READ_AND_SUM_ST_SPINOR(spinor,sid) \ 921 o00_re += spinor[0*param.sp_stride+sid].x; o00_im += spinor[0*param.sp_stride+sid].y; \ 922 o01_re += spinor[1*param.sp_stride+sid].x; o01_im += spinor[1*param.sp_stride+sid].y; \ 923 o02_re += spinor[2*param.sp_stride+sid].x; o02_im += spinor[2*param.sp_stride+sid].y; \ 925 #define READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid) \ 926 float C = spinor ## Norm[sid]; \ 927 o00_re += C*short2float(spinor[0*param.sp_stride + sid].x); \ 928 o00_im += C*short2float(spinor[0*param.sp_stride + sid].y); \ 929 o01_re += C*short2float(spinor[1*param.sp_stride + sid].x); \ 930 o01_im += C*short2float(spinor[1*param.sp_stride + sid].y); \ 931 o02_re += C*short2float(spinor[2*param.sp_stride + sid].x); \ 932 o02_im += C*short2float(spinor[2*param.sp_stride + sid].y); 934 #define READ_AND_SUM_ST_SPINOR_HALF(spinor,sid) \ 935 READ_AND_SUM_ST_SPINOR_HALF_(spinor,sid) 937 #define 
READ_ST_ACCUM_DOUBLE_TEX(spinor,sid) \ 938 double2 accum0 = fetch_double2((spinor), sid + 0*(param.sp_stride)); \ 939 double2 accum1 = fetch_double2((spinor), sid + 1*(param.sp_stride)); \ 940 double2 accum2 = fetch_double2((spinor), sid + 2*(param.sp_stride)); 942 #define READ_ST_ACCUM_SINGLE_TEX(spinor,sid) \ 943 float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \ 944 float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \ 945 float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); 947 #define READ_ST_ACCUM_HALF_TEX_(spinor,sid) \ 948 float2 accum0 = TEX1DFETCH(float2, (spinor), sid + 0*param.sp_stride); \ 949 float2 accum1 = TEX1DFETCH(float2, (spinor), sid + 1*param.sp_stride); \ 950 float2 accum2 = TEX1DFETCH(float2, (spinor), sid + 2*param.sp_stride); \ 951 float C = TEX1DFETCH(float, (spinor ## Norm), sid); \ 952 accum0.x *= C; accum0.y *= C; \ 953 accum1.x *= C; accum1.y *= C; \ 954 accum2.x *= C; accum2.y *= C; 956 #define READ_ST_ACCUM_HALF_TEX(spinor,sid) READ_ST_ACCUM_HALF_TEX_(spinor,sid) 958 #define READ_ST_ACCUM_DOUBLE(spinor,sid) \ 959 double2 accum0 = spinor[sid + 0*(param.sp_stride)]; \ 960 double2 accum1 = spinor[sid + 1*(param.sp_stride)]; \ 961 double2 accum2 = spinor[sid + 2*(param.sp_stride)]; 963 #define READ_ST_ACCUM_SINGLE(spinor,sid) \ 964 float2 accum0 = spinor[sid + 0*(param.sp_stride)]; \ 965 float2 accum1 = spinor[sid + 1*(param.sp_stride)]; \ 966 float2 accum2 = spinor[sid + 2*(param.sp_stride)]; 968 #define READ_ST_ACCUM_HALF(spinor,sid) \ 969 float2 accum0, accum1, accum2; \ 971 short2 S0 = x[sid + 0*param.sp_stride]; \ 972 short2 S1 = x[sid + 1*param.sp_stride]; \ 973 short2 S2 = x[sid + 2*param.sp_stride]; \ 974 float C = spinor##Norm[sid]; \ 975 accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \ 976 accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \ 977 accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \ 980 #define 
WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \ 981 extern __shared__ char s_data[]; \ 982 spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \ 983 ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \ 984 sh[0*SHARED_STRIDE] = reg##00_re; \ 985 sh[1*SHARED_STRIDE] = reg##00_im; \ 986 sh[2*SHARED_STRIDE] = reg##01_re; \ 987 sh[3*SHARED_STRIDE] = reg##01_im; \ 988 sh[4*SHARED_STRIDE] = reg##02_re; \ 989 sh[5*SHARED_STRIDE] = reg##02_im; \ 990 sh[6*SHARED_STRIDE] = reg##10_re; \ 991 sh[7*SHARED_STRIDE] = reg##10_im; \ 992 sh[8*SHARED_STRIDE] = reg##11_re; \ 993 sh[9*SHARED_STRIDE] = reg##11_im; \ 994 sh[10*SHARED_STRIDE] = reg##12_re; \ 995 sh[11*SHARED_STRIDE] = reg##12_im; \ 996 sh[12*SHARED_STRIDE] = reg##20_re; \ 997 sh[13*SHARED_STRIDE] = reg##20_im; \ 998 sh[14*SHARED_STRIDE] = reg##21_re; \ 999 sh[15*SHARED_STRIDE] = reg##21_im; \ 1000 sh[16*SHARED_STRIDE] = reg##22_re; \ 1001 sh[17*SHARED_STRIDE] = reg##22_im; \ 1002 sh[18*SHARED_STRIDE] = reg##30_re; \ 1003 sh[19*SHARED_STRIDE] = reg##30_im; \ 1004 sh[20*SHARED_STRIDE] = reg##31_re; \ 1005 sh[21*SHARED_STRIDE] = reg##31_im; \ 1006 sh[22*SHARED_STRIDE] = reg##32_re; \ 1007 sh[23*SHARED_STRIDE] = reg##32_im; 1009 #define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL 1011 #define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \ 1012 extern __shared__ char s_data[]; \ 1013 double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \ 1014 ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \ 1015 double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \ 1016 double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \ 1017 double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \ 1018 double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \ 1019 double2 I4 = make_double2(sh[8*SHARED_STRIDE], 
sh[9*SHARED_STRIDE]); \ 1020 double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \ 1021 double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \ 1022 double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \ 1023 double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \ 1024 double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \ 1025 double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \ 1026 double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]); 1028 #ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access 1030 #define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL 1032 #define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \ 1033 extern __shared__ char s_data[]; \ 1034 float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \ 1035 ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \ 1036 float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \ 1037 float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \ 1038 float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \ 1039 float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \ 1040 float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \ 1041 float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]); 1043 #else // 8-byte shared memory words 1045 #define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \ 1046 extern __shared__ char s_data[]; \ 1047 float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \ 1048 ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + 
((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \ 1049 sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \ 1050 sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \ 1051 sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \ 1052 sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \ 1053 sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \ 1054 sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \ 1055 sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \ 1056 sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \ 1057 sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \ 1058 sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \ 1059 sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \ 1060 sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im); 1062 #define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \ 1063 extern __shared__ char s_data[]; \ 1064 float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \ 1065 ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \ 1066 float2 tmp1, tmp2; \ 1067 tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \ 1068 tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \ 1069 tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \ 1070 tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \ 1071 tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \ 1072 tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); 1078 #define READ_ACCUM_FLAVOR_DOUBLE(spinor, stride, fl_stride) \ 1079 double2 flv1_accum0 = spinor[sid + 
0*stride]; \ 1080 double2 flv1_accum1 = spinor[sid + 1*stride]; \ 1081 double2 flv1_accum2 = spinor[sid + 2*stride]; \ 1082 double2 flv1_accum3 = spinor[sid + 3*stride]; \ 1083 double2 flv1_accum4 = spinor[sid + 4*stride]; \ 1084 double2 flv1_accum5 = spinor[sid + 5*stride]; \ 1085 double2 flv1_accum6 = spinor[sid + 6*stride]; \ 1086 double2 flv1_accum7 = spinor[sid + 7*stride]; \ 1087 double2 flv1_accum8 = spinor[sid + 8*stride]; \ 1088 double2 flv1_accum9 = spinor[sid + 9*stride]; \ 1089 double2 flv1_accum10 = spinor[sid + 10*stride]; \ 1090 double2 flv1_accum11 = spinor[sid + 11*stride]; \ 1091 double2 flv2_accum0 = spinor[sid + fl_stride + 0*stride]; \ 1092 double2 flv2_accum1 = spinor[sid + fl_stride + 1*stride]; \ 1093 double2 flv2_accum2 = spinor[sid + fl_stride + 2*stride]; \ 1094 double2 flv2_accum3 = spinor[sid + fl_stride + 3*stride]; \ 1095 double2 flv2_accum4 = spinor[sid + fl_stride + 4*stride]; \ 1096 double2 flv2_accum5 = spinor[sid + fl_stride + 5*stride]; \ 1097 double2 flv2_accum6 = spinor[sid + fl_stride + 6*stride]; \ 1098 double2 flv2_accum7 = spinor[sid + fl_stride + 7*stride]; \ 1099 double2 flv2_accum8 = spinor[sid + fl_stride + 8*stride]; \ 1100 double2 flv2_accum9 = spinor[sid + fl_stride + 9*stride]; \ 1101 double2 flv2_accum10 = spinor[sid + fl_stride + 10*stride]; \ 1102 double2 flv2_accum11 = spinor[sid + fl_stride + 11*stride]; 1105 #define READ_ACCUM_FLAVOR_SINGLE(spinor, stride, flv_stride) \ 1106 float4 flv1_accum0 = spinor[sid + 0*(stride)]; \ 1107 float4 flv1_accum1 = spinor[sid + 1*(stride)]; \ 1108 float4 flv1_accum2 = spinor[sid + 2*(stride)]; \ 1109 float4 flv1_accum3 = spinor[sid + 3*(stride)]; \ 1110 float4 flv1_accum4 = spinor[sid + 4*(stride)]; \ 1111 float4 flv1_accum5 = spinor[sid + 5*(stride)]; \ 1112 float4 flv2_accum0 = spinor[sid + flv_stride + 0*(stride)]; \ 1113 float4 flv2_accum1 = spinor[sid + flv_stride + 1*(stride)]; \ 1114 float4 flv2_accum2 = spinor[sid + flv_stride + 2*(stride)]; \ 1115 float4 flv2_accum3 
= spinor[sid + flv_stride + 3*(stride)]; \ 1116 float4 flv2_accum4 = spinor[sid + flv_stride + 4*(stride)]; \ 1117 float4 flv2_accum5 = spinor[sid + flv_stride + 5*(stride)]; 1120 #define READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride) \ 1121 float4 flv1_accum0 = short42float4(spinor[sid + 0*stride]); \ 1122 float4 flv1_accum1 = short42float4(spinor[sid + 1*stride]); \ 1123 float4 flv1_accum2 = short42float4(spinor[sid + 2*stride]); \ 1124 float4 flv1_accum3 = short42float4(spinor[sid + 3*stride]); \ 1125 float4 flv1_accum4 = short42float4(spinor[sid + 4*stride]); \ 1126 float4 flv1_accum5 = short42float4(spinor[sid + 5*stride]); \ 1127 float C = (spinor ## Norm)[sid]; \ 1128 flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \ 1129 flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \ 1130 flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \ 1131 flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \ 1132 flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \ 1133 flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \ 1134 float4 flv2_accum0 = short42float4(spinor[sid + flv_stride + 0*stride]); \ 1135 float4 flv2_accum1 = short42float4(spinor[sid + flv_stride + 1*stride]); \ 1136 float4 flv2_accum2 = short42float4(spinor[sid + flv_stride + 2*stride]); \ 1137 float4 flv2_accum3 = short42float4(spinor[sid + flv_stride + 3*stride]); \ 1138 float4 flv2_accum4 = short42float4(spinor[sid + flv_stride + 4*stride]); \ 1139 float4 flv2_accum5 = short42float4(spinor[sid + flv_stride + 5*stride]); \ 1140 C = (spinor ## Norm)[sid + fl_stride]; \ 1141 flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \ 1142 flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \ 1143 flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \ 1144 
flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \ 1145 flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \ 1146 flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C; 1148 #define READ_ACCUM_FLAVOR_HALF(spinor, stride, flv_stride) READ_ACCUM_FLAVOR_HALF_(spinor, stride, flv_stride) 1151 #define READ_ACCUM_FLAVOR_DOUBLE_TEX(spinor, stride, flv_stride) \ 1152 double2 flv1_accum0 = fetch_double2((spinor), sid + 0*(stride)); \ 1153 double2 flv1_accum1 = fetch_double2((spinor), sid + 1*(stride)); \ 1154 double2 flv1_accum2 = fetch_double2((spinor), sid + 2*(stride)); \ 1155 double2 flv1_accum3 = fetch_double2((spinor), sid + 3*(stride)); \ 1156 double2 flv1_accum4 = fetch_double2((spinor), sid + 4*(stride)); \ 1157 double2 flv1_accum5 = fetch_double2((spinor), sid + 5*(stride)); \ 1158 double2 flv1_accum6 = fetch_double2((spinor), sid + 6*(stride)); \ 1159 double2 flv1_accum7 = fetch_double2((spinor), sid + 7*(stride)); \ 1160 double2 flv1_accum8 = fetch_double2((spinor), sid + 8*(stride)); \ 1161 double2 flv1_accum9 = fetch_double2((spinor), sid + 9*(stride)); \ 1162 double2 flv1_accum10 = fetch_double2((spinor), sid + 10*(stride)); \ 1163 double2 flv1_accum11 = fetch_double2((spinor), sid + 11*(stride)); \ 1164 double2 flv2_accum0 = fetch_double2((spinor), sid + flv_stride + 0*(stride)); \ 1165 double2 flv2_accum1 = fetch_double2((spinor), sid + flv_stride + 1*(stride)); \ 1166 double2 flv2_accum2 = fetch_double2((spinor), sid + flv_stride + 2*(stride)); \ 1167 double2 flv2_accum3 = fetch_double2((spinor), sid + flv_stride + 3*(stride)); \ 1168 double2 flv2_accum4 = fetch_double2((spinor), sid + flv_stride + 4*(stride)); \ 1169 double2 flv2_accum5 = fetch_double2((spinor), sid + flv_stride + 5*(stride)); \ 1170 double2 flv2_accum6 = fetch_double2((spinor), sid + flv_stride + 6*(stride)); \ 1171 double2 flv2_accum7 = fetch_double2((spinor), sid + flv_stride + 7*(stride)); \ 1172 
double2 flv2_accum8 = fetch_double2((spinor), sid + flv_stride + 8*(stride)); \ 1173 double2 flv2_accum9 = fetch_double2((spinor), sid + flv_stride + 9*(stride)); \ 1174 double2 flv2_accum10 = fetch_double2((spinor), sid + flv_stride + 10*(stride)); \ 1175 double2 flv2_accum11 = fetch_double2((spinor), sid + flv_stride + 11*(stride)); 1178 #define READ_ACCUM_FLAVOR_SINGLE_TEX(spinor, stride, flv_stride) \ 1179 float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \ 1180 float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \ 1181 float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \ 1182 float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \ 1183 float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \ 1184 float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \ 1185 float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 0*(stride)); \ 1186 float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 1*(stride)); \ 1187 float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 2*(stride)); \ 1188 float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 3*(stride)); \ 1189 float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 4*(stride)); \ 1190 float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 5*(stride)); 1192 #define READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride) \ 1193 float4 flv1_accum0 = TEX1DFETCH(float4, (spinor), sid + 0*(stride)); \ 1194 float4 flv1_accum1 = TEX1DFETCH(float4, (spinor), sid + 1*(stride)); \ 1195 float4 flv1_accum2 = TEX1DFETCH(float4, (spinor), sid + 2*(stride)); \ 1196 float4 flv1_accum3 = TEX1DFETCH(float4, (spinor), sid + 3*(stride)); \ 1197 float4 flv1_accum4 = TEX1DFETCH(float4, (spinor), sid + 4*(stride)); \ 1198 float4 flv1_accum5 = TEX1DFETCH(float4, (spinor), sid + 5*(stride)); \ 1199 float C = TEX1DFETCH(float, (spinor ## Norm), sid); \ 1200 
flv1_accum0.x *= C; flv1_accum0.y *= C; flv1_accum0.z *= C; flv1_accum0.w *= C; \ 1201 flv1_accum1.x *= C; flv1_accum1.y *= C; flv1_accum1.z *= C; flv1_accum1.w *= C; \ 1202 flv1_accum2.x *= C; flv1_accum2.y *= C; flv1_accum2.z *= C; flv1_accum2.w *= C; \ 1203 flv1_accum3.x *= C; flv1_accum3.y *= C; flv1_accum3.z *= C; flv1_accum3.w *= C; \ 1204 flv1_accum4.x *= C; flv1_accum4.y *= C; flv1_accum4.z *= C; flv1_accum4.w *= C; \ 1205 flv1_accum5.x *= C; flv1_accum5.y *= C; flv1_accum5.z *= C; flv1_accum5.w *= C; \ 1206 float4 flv2_accum0 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 0*(stride)); \ 1207 float4 flv2_accum1 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 1*(stride)); \ 1208 float4 flv2_accum2 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 2*(stride)); \ 1209 float4 flv2_accum3 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 3*(stride)); \ 1210 float4 flv2_accum4 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 4*(stride)); \ 1211 float4 flv2_accum5 = TEX1DFETCH(float4, (spinor), sid + flv_stride + 5*(stride)); \ 1212 C = TEX1DFETCH(float, (spinor ## Norm), sid + flv_stride); \ 1213 flv2_accum0.x *= C; flv2_accum0.y *= C; flv2_accum0.z *= C; flv2_accum0.w *= C; \ 1214 flv2_accum1.x *= C; flv2_accum1.y *= C; flv2_accum1.z *= C; flv2_accum1.w *= C; \ 1215 flv2_accum2.x *= C; flv2_accum2.y *= C; flv2_accum2.z *= C; flv2_accum2.w *= C; \ 1216 flv2_accum3.x *= C; flv2_accum3.y *= C; flv2_accum3.z *= C; flv2_accum3.w *= C; \ 1217 flv2_accum4.x *= C; flv2_accum4.y *= C; flv2_accum4.z *= C; flv2_accum4.w *= C; \ 1218 flv2_accum5.x *= C; flv2_accum5.y *= C; flv2_accum5.z *= C; flv2_accum5.w *= C; 1221 #define READ_ACCUM_FLAVOR_HALF_TEX(spinor, stride, flv_stride) READ_ACCUM_HALF_FLAVOR_TEX_(spinor, stride, flv_stride) 1225 #define ASSN_ACCUM_DOUBLE(spinor, stride, fl_stride) \ 1226 accum0 = spinor[sid + fl_stride + 0*stride]; \ 1227 accum1 = spinor[sid + fl_stride + 1*stride]; \ 1228 accum2 = spinor[sid + fl_stride + 2*stride]; \ 1229 accum3 = 
spinor[sid + fl_stride + 3*stride]; \
  accum4 = spinor[sid + fl_stride + 4*stride]; \
  accum5 = spinor[sid + fl_stride + 5*stride]; \
  accum6 = spinor[sid + fl_stride + 6*stride]; \
  accum7 = spinor[sid + fl_stride + 7*stride]; \
  accum8 = spinor[sid + fl_stride + 8*stride]; \
  accum9 = spinor[sid + fl_stride + 9*stride]; \
  accum10 = spinor[sid + fl_stride + 10*stride]; \
  accum11 = spinor[sid + fl_stride + 11*stride];

/*
 * Load the 6 float4 components (24 reals = 12 complex numbers) of the
 * single-precision accumulator spinor at lattice site sid into
 * accum0..accum5.  sid and the accumN registers are expected to be in
 * scope at the point of expansion.  fl_stride is an extra offset into the
 * field (presumably selecting the second flavor of a twisted-mass
 * flavor doublet -- TODO confirm against the callers).
 */
#define ASSN_ACCUM_SINGLE(spinor, stride, fl_stride) \
  accum0 = spinor[sid + fl_stride + 0*(stride)]; \
  accum1 = spinor[sid + fl_stride + 1*(stride)]; \
  accum2 = spinor[sid + fl_stride + 2*(stride)]; \
  accum3 = spinor[sid + fl_stride + 3*(stride)]; \
  accum4 = spinor[sid + fl_stride + 4*(stride)]; \
  accum5 = spinor[sid + fl_stride + 5*(stride)];

/*
 * Half-precision accumulator load: convert each short4 to float4 via
 * short42float4 and rescale by the per-site norm stored in the companion
 * <spinor>Norm array.  Declares a local float C at the expansion site.
 * The stride argument is parenthesized, consistent with ASSN_ACCUM_SINGLE,
 * so a compound stride expression cannot mis-bind against the N* factor.
 */
#define ASSN_ACCUM_HALF_(spinor, stride, fl_stride) \
  accum0 = short42float4(spinor[sid + fl_stride + 0*(stride)]); \
  accum1 = short42float4(spinor[sid + fl_stride + 1*(stride)]); \
  accum2 = short42float4(spinor[sid + fl_stride + 2*(stride)]); \
  accum3 = short42float4(spinor[sid + fl_stride + 3*(stride)]); \
  accum4 = short42float4(spinor[sid + fl_stride + 4*(stride)]); \
  accum5 = short42float4(spinor[sid + fl_stride + 5*(stride)]); \
  float C = (spinor ## Norm)[sid + fl_stride]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

/* Argument-expanding wrapper: forces the spinor argument to be macro-expanded
   before the ## paste with Norm inside ASSN_ACCUM_HALF_ (standard two-level
   macro idiom, mirroring READ_SPINOR_HALF/READ_SPINOR_HALF_ above). */
#define ASSN_ACCUM_HALF(spinor, stride, fl_stride) ASSN_ACCUM_HALF_(spinor, stride, fl_stride)

/* Texture-fetch variant of the double-precision accumulator load
   (12 double2 = 24 reals), reading through fetch_double2. */
#define ASSN_ACCUM_DOUBLE_TEX(spinor, stride, fl_stride) \
  accum0 = fetch_double2((spinor), sid + fl_stride + 0*(stride)); \
  accum1 = fetch_double2((spinor), sid + fl_stride + 1*(stride)); \
  accum2 = fetch_double2((spinor), sid + fl_stride + 2*(stride)); \
  accum3 = fetch_double2((spinor), sid + fl_stride + 3*(stride)); \
  accum4 = fetch_double2((spinor), sid + fl_stride + 4*(stride)); \
  accum5 = fetch_double2((spinor), sid + fl_stride + 5*(stride)); \
  accum6 = fetch_double2((spinor), sid + fl_stride + 6*(stride)); \
  accum7 = fetch_double2((spinor), sid + fl_stride + 7*(stride)); \
  accum8 = fetch_double2((spinor), sid + fl_stride + 8*(stride)); \
  accum9 = fetch_double2((spinor), sid + fl_stride + 9*(stride)); \
  accum10 = fetch_double2((spinor), sid + fl_stride + 10*(stride)); \
  accum11 = fetch_double2((spinor), sid + fl_stride + 11*(stride));

/* Texture-fetch variant of the single-precision accumulator load. */
#define ASSN_ACCUM_SINGLE_TEX(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 5*(stride));

/* Texture-fetch variant of the half-precision accumulator load.  No explicit
   short42float4 here: presumably the texture is bound so that the float4
   fetch already returns converted floats -- confirm against the texture
   binding code.  The per-site norm is fetched from the <spinor>Norm texture
   and applied exactly as in ASSN_ACCUM_HALF_. */
#define ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride) \
  accum0 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 0*(stride)); \
  accum1 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 1*(stride)); \
  accum2 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 2*(stride)); \
  accum3 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 3*(stride)); \
  accum4 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 4*(stride)); \
  accum5 = TEX1DFETCH(float4, (spinor), sid + fl_stride + 5*(stride)); \
  float C = TEX1DFETCH(float, (spinor ## Norm), sid + fl_stride); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

/* Argument-expanding wrapper for ASSN_ACCUM_HALF_TEX_ (see ASSN_ACCUM_HALF). */
#define ASSN_ACCUM_HALF_TEX(spinor, stride, fl_stride) ASSN_ACCUM_HALF_TEX_(spinor, stride, fl_stride)

/*
 * APPLY_TWIST_INV(a, b, reg): in-place rotation of the 12 complex spinor
 * components reg##SC_re/_im (spin S = 0..3, color C = 0..2):
 *
 *   upper' = b * (upper + i*a*lower)      (spin pairs 0<->2 and 1<->3)
 *   lower' = b * (lower + i*a*upper)
 *
 * i.e. reg <- b * (1 + i*a*gamma5) * reg in a basis where gamma5 is
 * off-diagonal, swapping the upper (spin 0,1) and lower (spin 2,3)
 * components -- confirm the gamma basis against the rest of the dslash core.
 * With b = 1/(1 + a*a) this is the inverse twist rotation applied by
 * APPLY_TWIST (up to the sign convention of a) -- confirm with callers.
 *
 * NOTE(review): the rendered listing dropped the lower-component rescalings
 * (reg##2C *= b and reg##3C *= b); they are restored here, since a uniform
 * factor b is required for the operation above to be a proper inverse, and
 * the dropped-line pattern matches exactly two statements per spin/color
 * pair.  Verify against the upstream source.
 *
 * NOTE(review): tmp_re/tmp_im (and the ordering "update lower from the OLD
 * upper, then overwrite the upper") are declared at the expansion site, so
 * at most one APPLY_TWIST* expansion may appear per scope unless the caller
 * adds braces.
 */
#define APPLY_TWIST_INV(a, b, reg)\
  spinorFloat tmp_re, tmp_im;\
\
  tmp_re = reg##00_re - a * reg##20_im;\
  tmp_im = reg##00_im + a * reg##20_re;\
  reg##20_re -= a * reg##00_im;\
  reg##20_im += a * reg##00_re;\
  reg##00_re = b * tmp_re;\
  reg##00_im = b * tmp_im;\
  reg##20_re *= b;\
  reg##20_im *= b;\
\
  tmp_re = reg##10_re - a * reg##30_im;\
  tmp_im = reg##10_im + a * reg##30_re;\
  reg##30_re -= a * reg##10_im;\
  reg##30_im += a * reg##10_re;\
  reg##10_re = b * tmp_re;\
  reg##10_im = b * tmp_im;\
  reg##30_re *= b;\
  reg##30_im *= b;\
\
  tmp_re = reg##01_re - a * reg##21_im;\
  tmp_im = reg##01_im + a * reg##21_re;\
  reg##21_re -= a * reg##01_im;\
  reg##21_im += a * reg##01_re;\
  reg##01_re = b * tmp_re;\
  reg##01_im = b * tmp_im;\
  reg##21_re *= b;\
  reg##21_im *= b;\
\
  tmp_re = reg##11_re - a * reg##31_im;\
  tmp_im = reg##11_im + a * reg##31_re;\
  reg##31_re -= a * reg##11_im;\
  reg##31_im += a * reg##11_re;\
  reg##11_re = b * tmp_re;\
  reg##11_im = b * tmp_im;\
  reg##31_re *= b;\
  reg##31_im *= b;\
\
  tmp_re = reg##02_re - a * reg##22_im;\
  tmp_im = reg##02_im + a * reg##22_re;\
  reg##22_re -= a * reg##02_im;\
  reg##22_im += a * reg##02_re;\
  reg##02_re = b * tmp_re;\
  reg##02_im = b * tmp_im;\
  reg##22_re *= b;\
  reg##22_im *= b;\
\
  tmp_re = reg##12_re - a * reg##32_im;\
  tmp_im = reg##12_im + a * reg##32_re;\
  reg##32_re -= a * reg##12_im;\
  reg##32_im += a * reg##12_re;\
  reg##12_re = b * tmp_re;\
  reg##12_im = b * tmp_im;\
  reg##32_re *= b;\
  reg##32_im *= b;

/*
 * APPLY_TWIST(a, reg): in-place twist rotation
 *   reg <- (1 + i*a*gamma5) * reg
 * Same component convention and tmp_re/tmp_im scoping caveat as
 * APPLY_TWIST_INV, without the overall rescaling by b.
 */
#define APPLY_TWIST(a, reg)\
  spinorFloat tmp_re, tmp_im;\
\
  tmp_re = reg##00_re - a * reg##20_im;\
  tmp_im = reg##00_im + a * reg##20_re;\
  reg##20_re -= a * reg##00_im;\
  reg##20_im += a * reg##00_re;\
  reg##00_re = tmp_re;\
  reg##00_im = tmp_im;\
\
  tmp_re = reg##10_re - a * reg##30_im;\
  tmp_im = reg##10_im + a * reg##30_re;\
  reg##30_re -= a * reg##10_im;\
  reg##30_im += a * reg##10_re;\
  reg##10_re = tmp_re;\
  reg##10_im = tmp_im;\
\
  tmp_re = reg##01_re - a * reg##21_im;\
  tmp_im = reg##01_im + a * reg##21_re;\
  reg##21_re -= a * reg##01_im;\
  reg##21_im += a * reg##01_re;\
  reg##01_re = tmp_re;\
  reg##01_im = tmp_im;\
\
  tmp_re = reg##11_re - a * reg##31_im;\
  tmp_im = reg##11_im + a * reg##31_re;\
  reg##31_re -= a * reg##11_im;\
  reg##31_im += a * reg##11_re;\
  reg##11_re = tmp_re;\
  reg##11_im = tmp_im;\
\
  tmp_re = reg##02_re - a * reg##22_im;\
  tmp_im = reg##02_im + a * reg##22_re;\
  reg##22_re -= a * reg##02_im;\
  reg##22_im += a * reg##02_re;\
  reg##02_re = tmp_re;\
  reg##02_im = tmp_im;\
\
  tmp_re = reg##12_re - a * reg##32_im;\
  tmp_im = reg##12_im + a * reg##32_re;\
  reg##32_re -= a * reg##12_im;\
  reg##32_im += a * reg##12_re;\
  reg##12_re = tmp_re;\
  reg##12_im = tmp_im;