/*
 * Clover-term load macros.
 *
 * Every macro in this file expands to a sequence of statements in the caller's
 * scope; none of them is an expression.  They all read one "block" of clover
 * data, selected by `chi`, into variables named C0, C1, ...:
 *
 *   - double precision : 18 double2 values, C0..C17
 *   - single precision :  9 float4  values, C0..C8
 *   - half precision   :  9 short4  values converted with short42float4() (or
 *                         fetched as float4 through TEX1DFETCH) and multiplied
 *                         by one float norm K, C0..C8
 *
 * Element k of block `chi` is read at
 *     site + (N*chi + k) * param.cl_stride        (N = 18 or 9)
 * and the half-precision norm at
 *     site + chi * param.cl_stride
 *
 * Macro families:
 *   READ_*  declare C0.. (and K for half precision) and use the site index
 *           `sid`.  The non-texture READ_* variants also declare a local
 *           pointer named `clover`.
 *   ASSN_*  assign to C0.. (and K) that the caller has already declared; use
 *           `sid`.
 *   PACK_*  declare C0.. (and K) and use the site index `idx`.
 *   *_STR   go through load_streaming_double2(); *_TEX go through
 *           fetch_double2() / TEX1DFETCH().
 *   READ_CLOVER2_* read fewer elements than their READ_CLOVER_* counterparts,
 *           derive the remaining ones from the loaded values, and finally add
 *           param.rho to the first six scalar components
 *           (double: C0.x C0.y C1.x C1.y C2.x C2.y;
 *            float4: C0.x C0.y C0.z C0.w C1.x C1.y).
 *           NOTE(review): this looks like a compressed storage layout for the
 *           clover matrix -- confirm against the code that writes the field.
 *
 * Names that must be visible at the point of expansion (none is defined here):
 *   sid or idx, param.cl_stride, param.rho (CLOVER2 variants),
 *   CLOVERTEXNORM / TMCLOVERTEXNORM / cloverNorm (half precision),
 *   short42float4, load_streaming_double2, fetch_double2, TEX1DFETCH,
 *   make_double2, make_float4.
 *
 * All macro arguments are parenthesized where they are used, so `chi` may be
 * an arbitrary integer expression.
 */

/* ------------------------------------------------------------------------ */
/* Internal helpers shared by the macros below                              */
/* ------------------------------------------------------------------------ */

/* Offset of the k-th double2 of block `chi` at site `site` (18 per block). */
#define CLOVER_DEF_IDX18(site, chi, k) ((site) + (18*(chi)+(k))*param.cl_stride)

/* Offset of the k-th float4/short4 of block `chi` at site `site` (9 per block). */
#define CLOVER_DEF_IDX9(site, chi, k) ((site) + (9*(chi)+(k))*param.cl_stride)

/* Offset of the half-precision norm of block `chi` at site `site`. */
#define CLOVER_DEF_NORM_IDX(site, chi) ((site) + (chi)*param.cl_stride)

/* Multiply all four components of the float4 `c` by `k`. */
#define CLOVER_DEF_SCALE4(c, k) \
  (c).x *= (k); (c).y *= (k); (c).z *= (k); (c).w *= (k);

/* Multiply C0..C7 by `k`. */
#define CLOVER_DEF_SCALE_C0_C7(k) \
  CLOVER_DEF_SCALE4(C0, k) \
  CLOVER_DEF_SCALE4(C1, k) \
  CLOVER_DEF_SCALE4(C2, k) \
  CLOVER_DEF_SCALE4(C3, k) \
  CLOVER_DEF_SCALE4(C4, k) \
  CLOVER_DEF_SCALE4(C5, k) \
  CLOVER_DEF_SCALE4(C6, k) \
  CLOVER_DEF_SCALE4(C7, k)

/* Multiply C0..C8 by `k`. */
#define CLOVER_DEF_SCALE_C0_C8(k) \
  CLOVER_DEF_SCALE_C0_C7(k) \
  CLOVER_DEF_SCALE4(C8, k)

/* Add param.rho to the first six scalars of the double2 layout. */
#define CLOVER_DEF_ADD_RHO_D2 \
  C0.x += param.rho; C0.y += param.rho; C1.x += param.rho; \
  C1.y += param.rho; C2.x += param.rho; C2.y += param.rho;

/* Add param.rho to the first six scalars of the float4 layout. */
#define CLOVER_DEF_ADD_RHO_F4 \
  C0.x += param.rho; C0.y += param.rho; C0.z += param.rho; \
  C0.w += param.rho; C1.x += param.rho; C1.y += param.rho;

/* ------------------------------------------------------------------------ */
/* READ_*: declare C0.. and load them, indexed by `sid`                     */
/* ------------------------------------------------------------------------ */

/* Declare C0..C17 and load them directly from `clover_` viewed as double2. */
#define READ_CLOVER_DOUBLE(clover_, chi) \
  double2 *clover = (double2*)(clover_); \
  double2 C0 = clover[CLOVER_DEF_IDX18(sid, chi, 0)]; \
  double2 C1 = clover[CLOVER_DEF_IDX18(sid, chi, 1)]; \
  double2 C2 = clover[CLOVER_DEF_IDX18(sid, chi, 2)]; \
  double2 C3 = clover[CLOVER_DEF_IDX18(sid, chi, 3)]; \
  double2 C4 = clover[CLOVER_DEF_IDX18(sid, chi, 4)]; \
  double2 C5 = clover[CLOVER_DEF_IDX18(sid, chi, 5)]; \
  double2 C6 = clover[CLOVER_DEF_IDX18(sid, chi, 6)]; \
  double2 C7 = clover[CLOVER_DEF_IDX18(sid, chi, 7)]; \
  double2 C8 = clover[CLOVER_DEF_IDX18(sid, chi, 8)]; \
  double2 C9 = clover[CLOVER_DEF_IDX18(sid, chi, 9)]; \
  double2 C10 = clover[CLOVER_DEF_IDX18(sid, chi, 10)]; \
  double2 C11 = clover[CLOVER_DEF_IDX18(sid, chi, 11)]; \
  double2 C12 = clover[CLOVER_DEF_IDX18(sid, chi, 12)]; \
  double2 C13 = clover[CLOVER_DEF_IDX18(sid, chi, 13)]; \
  double2 C14 = clover[CLOVER_DEF_IDX18(sid, chi, 14)]; \
  double2 C15 = clover[CLOVER_DEF_IDX18(sid, chi, 15)]; \
  double2 C16 = clover[CLOVER_DEF_IDX18(sid, chi, 16)]; \
  double2 C17 = clover[CLOVER_DEF_IDX18(sid, chi, 17)];

/* Same as READ_CLOVER_DOUBLE, but each element goes through load_streaming_double2(). */
#define READ_CLOVER_DOUBLE_STR(clover_, chi) \
  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
  double2 *clover = (double2*)(clover_); \
  load_streaming_double2(C0, &clover[CLOVER_DEF_IDX18(sid, chi, 0)]); \
  load_streaming_double2(C1, &clover[CLOVER_DEF_IDX18(sid, chi, 1)]); \
  load_streaming_double2(C2, &clover[CLOVER_DEF_IDX18(sid, chi, 2)]); \
  load_streaming_double2(C3, &clover[CLOVER_DEF_IDX18(sid, chi, 3)]); \
  load_streaming_double2(C4, &clover[CLOVER_DEF_IDX18(sid, chi, 4)]); \
  load_streaming_double2(C5, &clover[CLOVER_DEF_IDX18(sid, chi, 5)]); \
  load_streaming_double2(C6, &clover[CLOVER_DEF_IDX18(sid, chi, 6)]); \
  load_streaming_double2(C7, &clover[CLOVER_DEF_IDX18(sid, chi, 7)]); \
  load_streaming_double2(C8, &clover[CLOVER_DEF_IDX18(sid, chi, 8)]); \
  load_streaming_double2(C9, &clover[CLOVER_DEF_IDX18(sid, chi, 9)]); \
  load_streaming_double2(C10, &clover[CLOVER_DEF_IDX18(sid, chi, 10)]); \
  load_streaming_double2(C11, &clover[CLOVER_DEF_IDX18(sid, chi, 11)]); \
  load_streaming_double2(C12, &clover[CLOVER_DEF_IDX18(sid, chi, 12)]); \
  load_streaming_double2(C13, &clover[CLOVER_DEF_IDX18(sid, chi, 13)]); \
  load_streaming_double2(C14, &clover[CLOVER_DEF_IDX18(sid, chi, 14)]); \
  load_streaming_double2(C15, &clover[CLOVER_DEF_IDX18(sid, chi, 15)]); \
  load_streaming_double2(C16, &clover[CLOVER_DEF_IDX18(sid, chi, 16)]); \
  load_streaming_double2(C17, &clover[CLOVER_DEF_IDX18(sid, chi, 17)]);

/*
 * Streaming load of elements 0, 1 and 3..14 only.  C2 is computed from C0 and
 * C1 (also declares the temporaries `diag` and `diag_inv`), C15/C16/C17 are
 * the negations of C3/C4/C8, and param.rho is added last.  Element 2 and
 * elements 15..17 are never read from memory.
 */
#define READ_CLOVER2_DOUBLE_STR(clover_, chi) \
  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9; \
  double2 C10, C11, C12, C13, C14, C15, C16, C17; \
  double2 *clover = (double2*)(clover_); \
  load_streaming_double2(C0, &clover[CLOVER_DEF_IDX18(sid, chi, 0)]); \
  load_streaming_double2(C1, &clover[CLOVER_DEF_IDX18(sid, chi, 1)]); \
  double diag = 0.5*(C0.x + C1.y); \
  double diag_inv = 1.0/diag; \
  C2 = make_double2(diag*(2-C0.y*diag_inv), diag*(2-C1.x*diag_inv)); \
  load_streaming_double2(C3, &clover[CLOVER_DEF_IDX18(sid, chi, 3)]); \
  load_streaming_double2(C4, &clover[CLOVER_DEF_IDX18(sid, chi, 4)]); \
  load_streaming_double2(C5, &clover[CLOVER_DEF_IDX18(sid, chi, 5)]); \
  load_streaming_double2(C6, &clover[CLOVER_DEF_IDX18(sid, chi, 6)]); \
  load_streaming_double2(C7, &clover[CLOVER_DEF_IDX18(sid, chi, 7)]); \
  load_streaming_double2(C8, &clover[CLOVER_DEF_IDX18(sid, chi, 8)]); \
  load_streaming_double2(C9, &clover[CLOVER_DEF_IDX18(sid, chi, 9)]); \
  load_streaming_double2(C10, &clover[CLOVER_DEF_IDX18(sid, chi, 10)]); \
  load_streaming_double2(C11, &clover[CLOVER_DEF_IDX18(sid, chi, 11)]); \
  load_streaming_double2(C12, &clover[CLOVER_DEF_IDX18(sid, chi, 12)]); \
  load_streaming_double2(C13, &clover[CLOVER_DEF_IDX18(sid, chi, 13)]); \
  load_streaming_double2(C14, &clover[CLOVER_DEF_IDX18(sid, chi, 14)]); \
  C15 = make_double2(-C3.x,-C3.y); \
  C16 = make_double2(-C4.x,-C4.y); \
  C17 = make_double2(-C8.x,-C8.y); \
  CLOVER_DEF_ADD_RHO_D2

/* Declare C0..C8 and load them directly from `clover_` viewed as float4. */
#define READ_CLOVER_SINGLE(clover_, chi) \
  float4 *clover = (float4*)(clover_); \
  float4 C0 = clover[CLOVER_DEF_IDX9(sid, chi, 0)]; \
  float4 C1 = clover[CLOVER_DEF_IDX9(sid, chi, 1)]; \
  float4 C2 = clover[CLOVER_DEF_IDX9(sid, chi, 2)]; \
  float4 C3 = clover[CLOVER_DEF_IDX9(sid, chi, 3)]; \
  float4 C4 = clover[CLOVER_DEF_IDX9(sid, chi, 4)]; \
  float4 C5 = clover[CLOVER_DEF_IDX9(sid, chi, 5)]; \
  float4 C6 = clover[CLOVER_DEF_IDX9(sid, chi, 6)]; \
  float4 C7 = clover[CLOVER_DEF_IDX9(sid, chi, 7)]; \
  float4 C8 = clover[CLOVER_DEF_IDX9(sid, chi, 8)];

/*
 * Load C0..C7 only; C8 is built as (-C2.x, -C2.y, -C4.x, -C4.y) and param.rho
 * is added last.  Element 8 is never read from memory.
 */
#define READ_CLOVER2_SINGLE(clover_, chi) \
  float4 *clover = (float4*)(clover_); \
  float4 C0 = clover[CLOVER_DEF_IDX9(sid, chi, 0)]; \
  float4 C1 = clover[CLOVER_DEF_IDX9(sid, chi, 1)]; \
  float4 C2 = clover[CLOVER_DEF_IDX9(sid, chi, 2)]; \
  float4 C3 = clover[CLOVER_DEF_IDX9(sid, chi, 3)]; \
  float4 C4 = clover[CLOVER_DEF_IDX9(sid, chi, 4)]; \
  float4 C5 = clover[CLOVER_DEF_IDX9(sid, chi, 5)]; \
  float4 C6 = clover[CLOVER_DEF_IDX9(sid, chi, 6)]; \
  float4 C7 = clover[CLOVER_DEF_IDX9(sid, chi, 7)]; \
  float4 C8 = make_float4(-C2.x,-C2.y,-C4.x,-C4.y); \
  CLOVER_DEF_ADD_RHO_F4

/*
 * Declare C0..C8, load them from `clover_` viewed as short4 through
 * short42float4(), then multiply every component by K = CLOVERTEXNORM[...].
 */
#define READ_CLOVER_HALF(clover_, chi) \
  short4 *clover = (short4*)(clover_); \
  float4 C0 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 0)]); \
  float4 C1 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 1)]); \
  float4 C2 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 2)]); \
  float4 C3 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 3)]); \
  float4 C4 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 4)]); \
  float4 C5 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 5)]); \
  float4 C6 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 6)]); \
  float4 C7 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 7)]); \
  float4 C8 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 8)]); \
  float K = CLOVERTEXNORM[CLOVER_DEF_NORM_IDX(sid, chi)]; \
  CLOVER_DEF_SCALE_C0_C8(K)

/*
 * Half-precision counterpart of READ_CLOVER2_SINGLE: load and scale C0..C7,
 * build C8 from the already-scaled C2 and C4, then add param.rho.
 *
 * C8 must not be multiplied by K here: it does not exist until the
 * make_float4() line (scaling it earlier does not compile), and its inputs
 * already carry the factor K.  This matches READ_CLOVER2_HALF_TEX.
 */
#define READ_CLOVER2_HALF(clover_, chi) \
  short4 *clover = (short4*)(clover_); \
  float4 C0 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 0)]); \
  float4 C1 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 1)]); \
  float4 C2 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 2)]); \
  float4 C3 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 3)]); \
  float4 C4 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 4)]); \
  float4 C5 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 5)]); \
  float4 C6 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 6)]); \
  float4 C7 = short42float4(clover[CLOVER_DEF_IDX9(sid, chi, 7)]); \
  float K = CLOVERTEXNORM[CLOVER_DEF_NORM_IDX(sid, chi)]; \
  CLOVER_DEF_SCALE_C0_C7(K) \
  float4 C8 = make_float4(-C2.x, -C2.y, -C4.x, -C4.y); \
  CLOVER_DEF_ADD_RHO_F4

/* Declare C0..C17 and fetch them with fetch_double2(). */
#define READ_CLOVER_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 0)); \
  double2 C1 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 1)); \
  double2 C2 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 2)); \
  double2 C3 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 3)); \
  double2 C4 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 4)); \
  double2 C5 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 5)); \
  double2 C6 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 6)); \
  double2 C7 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 7)); \
  double2 C8 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 8)); \
  double2 C9 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 9)); \
  double2 C10 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 10)); \
  double2 C11 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 11)); \
  double2 C12 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 12)); \
  double2 C13 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 13)); \
  double2 C14 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 14)); \
  double2 C15 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 15)); \
  double2 C16 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 16)); \
  double2 C17 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 17));

/*
 * fetch_double2() counterpart of READ_CLOVER2_DOUBLE_STR: elements 0, 1 and
 * 3..14 are fetched, C2 and C15..C17 are derived, param.rho is added last.
 */
#define READ_CLOVER2_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 0)); \
  double2 C1 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 1)); \
  double diag = 0.5*(C0.x + C1.y); \
  double diag_inv = 1.0/diag; \
  double2 C2 = make_double2(diag*(2-C0.y*diag_inv), diag*(2-C1.x*diag_inv)); \
  double2 C3 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 3)); \
  double2 C4 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 4)); \
  double2 C5 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 5)); \
  double2 C6 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 6)); \
  double2 C7 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 7)); \
  double2 C8 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 8)); \
  double2 C9 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 9)); \
  double2 C10 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 10)); \
  double2 C11 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 11)); \
  double2 C12 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 12)); \
  double2 C13 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 13)); \
  double2 C14 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 14)); \
  double2 C15 = make_double2(-C3.x,-C3.y); \
  double2 C16 = make_double2(-C4.x,-C4.y); \
  double2 C17 = make_double2(-C8.x,-C8.y); \
  CLOVER_DEF_ADD_RHO_D2

/* Declare C0..C8 and fetch them as float4 with TEX1DFETCH(). */
#define READ_CLOVER_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 0)); \
  float4 C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 1)); \
  float4 C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 2)); \
  float4 C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 3)); \
  float4 C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 4)); \
  float4 C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 5)); \
  float4 C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 6)); \
  float4 C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 7)); \
  float4 C8 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 8));

/* TEX1DFETCH() counterpart of READ_CLOVER2_SINGLE. */
#define READ_CLOVER2_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 0)); \
  float4 C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 1)); \
  float4 C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 2)); \
  float4 C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 3)); \
  float4 C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 4)); \
  float4 C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 5)); \
  float4 C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 6)); \
  float4 C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 7)); \
  float4 C8 = make_float4(-C2.x,-C2.y,-C4.x,-C4.y); \
  CLOVER_DEF_ADD_RHO_F4

/*
 * Declare C0..C8, fetch them as float4 with TEX1DFETCH(), then multiply every
 * component by K fetched from CLOVERTEXNORM.
 */
#define READ_CLOVER_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 0)); \
  float4 C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 1)); \
  float4 C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 2)); \
  float4 C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 3)); \
  float4 C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 4)); \
  float4 C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 5)); \
  float4 C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 6)); \
  float4 C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 7)); \
  float4 C8 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 8)); \
  float K = TEX1DFETCH(float, (CLOVERTEXNORM), CLOVER_DEF_NORM_IDX(sid, chi)); \
  CLOVER_DEF_SCALE_C0_C8(K)

/* TEX1DFETCH() counterpart of READ_CLOVER2_HALF. */
#define READ_CLOVER2_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 0)); \
  float4 C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 1)); \
  float4 C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 2)); \
  float4 C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 3)); \
  float4 C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 4)); \
  float4 C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 5)); \
  float4 C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 6)); \
  float4 C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 7)); \
  float K = TEX1DFETCH(float, (CLOVERTEXNORM), CLOVER_DEF_NORM_IDX(sid, chi)); \
  CLOVER_DEF_SCALE_C0_C7(K) \
  float4 C8 = make_float4(-C2.x, -C2.y, -C4.x, -C4.y); \
  CLOVER_DEF_ADD_RHO_F4

/* ------------------------------------------------------------------------ */
/* ASSN_*: assign to caller-declared C0.. (and K), indexed by `sid`         */
/* ------------------------------------------------------------------------ */

/* Assign C0..C17 from the double2 array `clover`. */
#define ASSN_CLOVER_DOUBLE(clover, chi) \
  C0 = (clover)[CLOVER_DEF_IDX18(sid, chi, 0)]; \
  C1 = (clover)[CLOVER_DEF_IDX18(sid, chi, 1)]; \
  C2 = (clover)[CLOVER_DEF_IDX18(sid, chi, 2)]; \
  C3 = (clover)[CLOVER_DEF_IDX18(sid, chi, 3)]; \
  C4 = (clover)[CLOVER_DEF_IDX18(sid, chi, 4)]; \
  C5 = (clover)[CLOVER_DEF_IDX18(sid, chi, 5)]; \
  C6 = (clover)[CLOVER_DEF_IDX18(sid, chi, 6)]; \
  C7 = (clover)[CLOVER_DEF_IDX18(sid, chi, 7)]; \
  C8 = (clover)[CLOVER_DEF_IDX18(sid, chi, 8)]; \
  C9 = (clover)[CLOVER_DEF_IDX18(sid, chi, 9)]; \
  C10 = (clover)[CLOVER_DEF_IDX18(sid, chi, 10)]; \
  C11 = (clover)[CLOVER_DEF_IDX18(sid, chi, 11)]; \
  C12 = (clover)[CLOVER_DEF_IDX18(sid, chi, 12)]; \
  C13 = (clover)[CLOVER_DEF_IDX18(sid, chi, 13)]; \
  C14 = (clover)[CLOVER_DEF_IDX18(sid, chi, 14)]; \
  C15 = (clover)[CLOVER_DEF_IDX18(sid, chi, 15)]; \
  C16 = (clover)[CLOVER_DEF_IDX18(sid, chi, 16)]; \
  C17 = (clover)[CLOVER_DEF_IDX18(sid, chi, 17)];

/* Assign C0..C17 from `clover` through load_streaming_double2(). */
#define ASSN_CLOVER_DOUBLE_STR(clover, chi) \
  load_streaming_double2(C0, &(clover)[CLOVER_DEF_IDX18(sid, chi, 0)]); \
  load_streaming_double2(C1, &(clover)[CLOVER_DEF_IDX18(sid, chi, 1)]); \
  load_streaming_double2(C2, &(clover)[CLOVER_DEF_IDX18(sid, chi, 2)]); \
  load_streaming_double2(C3, &(clover)[CLOVER_DEF_IDX18(sid, chi, 3)]); \
  load_streaming_double2(C4, &(clover)[CLOVER_DEF_IDX18(sid, chi, 4)]); \
  load_streaming_double2(C5, &(clover)[CLOVER_DEF_IDX18(sid, chi, 5)]); \
  load_streaming_double2(C6, &(clover)[CLOVER_DEF_IDX18(sid, chi, 6)]); \
  load_streaming_double2(C7, &(clover)[CLOVER_DEF_IDX18(sid, chi, 7)]); \
  load_streaming_double2(C8, &(clover)[CLOVER_DEF_IDX18(sid, chi, 8)]); \
  load_streaming_double2(C9, &(clover)[CLOVER_DEF_IDX18(sid, chi, 9)]); \
  load_streaming_double2(C10, &(clover)[CLOVER_DEF_IDX18(sid, chi, 10)]); \
  load_streaming_double2(C11, &(clover)[CLOVER_DEF_IDX18(sid, chi, 11)]); \
  load_streaming_double2(C12, &(clover)[CLOVER_DEF_IDX18(sid, chi, 12)]); \
  load_streaming_double2(C13, &(clover)[CLOVER_DEF_IDX18(sid, chi, 13)]); \
  load_streaming_double2(C14, &(clover)[CLOVER_DEF_IDX18(sid, chi, 14)]); \
  load_streaming_double2(C15, &(clover)[CLOVER_DEF_IDX18(sid, chi, 15)]); \
  load_streaming_double2(C16, &(clover)[CLOVER_DEF_IDX18(sid, chi, 16)]); \
  load_streaming_double2(C17, &(clover)[CLOVER_DEF_IDX18(sid, chi, 17)]);

/* Assign C0..C8 from the float4 array `clover`. */
#define ASSN_CLOVER_SINGLE(clover, chi) \
  C0 = (clover)[CLOVER_DEF_IDX9(sid, chi, 0)]; \
  C1 = (clover)[CLOVER_DEF_IDX9(sid, chi, 1)]; \
  C2 = (clover)[CLOVER_DEF_IDX9(sid, chi, 2)]; \
  C3 = (clover)[CLOVER_DEF_IDX9(sid, chi, 3)]; \
  C4 = (clover)[CLOVER_DEF_IDX9(sid, chi, 4)]; \
  C5 = (clover)[CLOVER_DEF_IDX9(sid, chi, 5)]; \
  C6 = (clover)[CLOVER_DEF_IDX9(sid, chi, 6)]; \
  C7 = (clover)[CLOVER_DEF_IDX9(sid, chi, 7)]; \
  C8 = (clover)[CLOVER_DEF_IDX9(sid, chi, 8)];

/*
 * Assign C0..C8 from the short4 array `clover` through short42float4(), then
 * scale by K, which is assigned from TMCLOVERTEXNORM[...].
 */
#define ASSN_CLOVER_HALF(clover, chi) \
  C0 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 0)]); \
  C1 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 1)]); \
  C2 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 2)]); \
  C3 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 3)]); \
  C4 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 4)]); \
  C5 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 5)]); \
  C6 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 6)]); \
  C7 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 7)]); \
  C8 = short42float4((clover)[CLOVER_DEF_IDX9(sid, chi, 8)]); \
  K = TMCLOVERTEXNORM[CLOVER_DEF_NORM_IDX(sid, chi)]; \
  CLOVER_DEF_SCALE_C0_C8(K)

/* Assign C0..C17 with fetch_double2(). */
#define ASSN_CLOVER_DOUBLE_TEX(clover, chi) \
  C0 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 0)); \
  C1 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 1)); \
  C2 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 2)); \
  C3 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 3)); \
  C4 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 4)); \
  C5 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 5)); \
  C6 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 6)); \
  C7 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 7)); \
  C8 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 8)); \
  C9 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 9)); \
  C10 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 10)); \
  C11 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 11)); \
  C12 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 12)); \
  C13 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 13)); \
  C14 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 14)); \
  C15 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 15)); \
  C16 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 16)); \
  C17 = fetch_double2((clover), CLOVER_DEF_IDX18(sid, chi, 17));

/* Assign C0..C8 as float4 with TEX1DFETCH(). */
#define ASSN_CLOVER_SINGLE_TEX(clover, chi) \
  C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 0)); \
  C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 1)); \
  C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 2)); \
  C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 3)); \
  C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 4)); \
  C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 5)); \
  C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 6)); \
  C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 7)); \
  C8 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 8));

/*
 * Assign C0..C8 as float4 with TEX1DFETCH(), then scale by K, which is
 * assigned from a TEX1DFETCH() on TMCLOVERTEXNORM.
 */
#define ASSN_CLOVER_HALF_TEX(clover, chi) \
  C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 0)); \
  C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 1)); \
  C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 2)); \
  C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 3)); \
  C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 4)); \
  C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 5)); \
  C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 6)); \
  C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 7)); \
  C8 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(sid, chi, 8)); \
  K = TEX1DFETCH(float, (TMCLOVERTEXNORM), CLOVER_DEF_NORM_IDX(sid, chi)); \
  CLOVER_DEF_SCALE_C0_C8(K)

/* ------------------------------------------------------------------------ */
/* PACK_*: declare C0.. and load them, indexed by `idx`                     */
/* ------------------------------------------------------------------------ */

/* Declare C0..C17 and load them from the double2 array `clover`. */
#define PACK_CLOVER_DOUBLE(clover, chi) \
  double2 C0 = (clover)[CLOVER_DEF_IDX18(idx, chi, 0)]; \
  double2 C1 = (clover)[CLOVER_DEF_IDX18(idx, chi, 1)]; \
  double2 C2 = (clover)[CLOVER_DEF_IDX18(idx, chi, 2)]; \
  double2 C3 = (clover)[CLOVER_DEF_IDX18(idx, chi, 3)]; \
  double2 C4 = (clover)[CLOVER_DEF_IDX18(idx, chi, 4)]; \
  double2 C5 = (clover)[CLOVER_DEF_IDX18(idx, chi, 5)]; \
  double2 C6 = (clover)[CLOVER_DEF_IDX18(idx, chi, 6)]; \
  double2 C7 = (clover)[CLOVER_DEF_IDX18(idx, chi, 7)]; \
  double2 C8 = (clover)[CLOVER_DEF_IDX18(idx, chi, 8)]; \
  double2 C9 = (clover)[CLOVER_DEF_IDX18(idx, chi, 9)]; \
  double2 C10 = (clover)[CLOVER_DEF_IDX18(idx, chi, 10)]; \
  double2 C11 = (clover)[CLOVER_DEF_IDX18(idx, chi, 11)]; \
  double2 C12 = (clover)[CLOVER_DEF_IDX18(idx, chi, 12)]; \
  double2 C13 = (clover)[CLOVER_DEF_IDX18(idx, chi, 13)]; \
  double2 C14 = (clover)[CLOVER_DEF_IDX18(idx, chi, 14)]; \
  double2 C15 = (clover)[CLOVER_DEF_IDX18(idx, chi, 15)]; \
  double2 C16 = (clover)[CLOVER_DEF_IDX18(idx, chi, 16)]; \
  double2 C17 = (clover)[CLOVER_DEF_IDX18(idx, chi, 17)];

/* Declare C0..C8 and load them from the float4 array `clover`. */
#define PACK_CLOVER_SINGLE(clover, chi) \
  float4 C0 = (clover)[CLOVER_DEF_IDX9(idx, chi, 0)]; \
  float4 C1 = (clover)[CLOVER_DEF_IDX9(idx, chi, 1)]; \
  float4 C2 = (clover)[CLOVER_DEF_IDX9(idx, chi, 2)]; \
  float4 C3 = (clover)[CLOVER_DEF_IDX9(idx, chi, 3)]; \
  float4 C4 = (clover)[CLOVER_DEF_IDX9(idx, chi, 4)]; \
  float4 C5 = (clover)[CLOVER_DEF_IDX9(idx, chi, 5)]; \
  float4 C6 = (clover)[CLOVER_DEF_IDX9(idx, chi, 6)]; \
  float4 C7 = (clover)[CLOVER_DEF_IDX9(idx, chi, 7)]; \
  float4 C8 = (clover)[CLOVER_DEF_IDX9(idx, chi, 8)];

/*
 * Declare C0..C8, load them from the short4 array `clover` through
 * short42float4(), then scale by K = cloverNorm[...].
 */
#define PACK_CLOVER_HALF(clover, chi) \
  float4 C0 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 0)]); \
  float4 C1 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 1)]); \
  float4 C2 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 2)]); \
  float4 C3 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 3)]); \
  float4 C4 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 4)]); \
  float4 C5 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 5)]); \
  float4 C6 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 6)]); \
  float4 C7 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 7)]); \
  float4 C8 = short42float4((clover)[CLOVER_DEF_IDX9(idx, chi, 8)]); \
  float K = cloverNorm[CLOVER_DEF_NORM_IDX(idx, chi)]; \
  CLOVER_DEF_SCALE_C0_C8(K)

/* Declare C0..C17 and fetch them with fetch_double2(). */
#define PACK_CLOVER_DOUBLE_TEX(clover, chi) \
  double2 C0 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 0)); \
  double2 C1 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 1)); \
  double2 C2 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 2)); \
  double2 C3 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 3)); \
  double2 C4 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 4)); \
  double2 C5 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 5)); \
  double2 C6 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 6)); \
  double2 C7 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 7)); \
  double2 C8 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 8)); \
  double2 C9 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 9)); \
  double2 C10 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 10)); \
  double2 C11 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 11)); \
  double2 C12 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 12)); \
  double2 C13 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 13)); \
  double2 C14 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 14)); \
  double2 C15 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 15)); \
  double2 C16 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 16)); \
  double2 C17 = fetch_double2((clover), CLOVER_DEF_IDX18(idx, chi, 17));

/* Declare C0..C8 and fetch them as float4 with TEX1DFETCH(). */
#define PACK_CLOVER_SINGLE_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 0)); \
  float4 C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 1)); \
  float4 C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 2)); \
  float4 C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 3)); \
  float4 C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 4)); \
  float4 C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 5)); \
  float4 C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 6)); \
  float4 C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 7)); \
  float4 C8 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 8));

/*
 * Declare C0..C8, fetch them as float4 with TEX1DFETCH(), then scale by K
 * fetched from TMCLOVERTEXNORM.
 */
#define PACK_CLOVER_HALF_TEX(clover, chi) \
  float4 C0 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 0)); \
  float4 C1 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 1)); \
  float4 C2 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 2)); \
  float4 C3 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 3)); \
  float4 C4 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 4)); \
  float4 C5 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 5)); \
  float4 C6 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 6)); \
  float4 C7 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 7)); \
  float4 C8 = TEX1DFETCH(float4, (clover), CLOVER_DEF_IDX9(idx, chi, 8)); \
  float K = TEX1DFETCH(float, (TMCLOVERTEXNORM), CLOVER_DEF_NORM_IDX(idx, chi)); \
  CLOVER_DEF_SCALE_C0_C8(K)