QUDA v0.4.0
A library for QCD on GPUs
#define READ_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)]; \
  double2 I6 = spinor[sp_idx + 6*(stride)]; \
  double2 I7 = spinor[sp_idx + 7*(stride)]; \
  double2 I8 = spinor[sp_idx + 8*(stride)]; \
  double2 I9 = spinor[sp_idx + 9*(stride)]; \
  double2 I10 = spinor[sp_idx + 10*(stride)]; \
  double2 I11 = spinor[sp_idx + 11*(stride)];

#define READ_SPINOR_DOUBLE_UP(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = spinor[sp_idx + 0*(stride)]; \
  double2 I1 = spinor[sp_idx + 1*(stride)]; \
  double2 I2 = spinor[sp_idx + 2*(stride)]; \
  double2 I3 = spinor[sp_idx + 3*(stride)]; \
  double2 I4 = spinor[sp_idx + 4*(stride)]; \
  double2 I5 = spinor[sp_idx + 5*(stride)];

#define READ_SPINOR_DOUBLE_DOWN(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = spinor[sp_idx + 6*(stride)]; \
  double2 I7 = spinor[sp_idx + 7*(stride)]; \
  double2 I8 = spinor[sp_idx + 8*(stride)]; \
  double2 I9 = spinor[sp_idx + 9*(stride)]; \
  double2 I10 = spinor[sp_idx + 10*(stride)]; \
  double2 I11 = spinor[sp_idx + 11*(stride)];

#define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)]; \
  float4 I3 = spinor[sp_idx + 3*(stride)]; \
  float4 I4 = spinor[sp_idx + 4*(stride)]; \
  float4 I5 = spinor[sp_idx + 5*(stride)];

#define READ_SPINOR_SINGLE_UP(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = spinor[sp_idx + 0*(stride)]; \
  float4 I1 = spinor[sp_idx + 1*(stride)]; \
  float4 I2 = spinor[sp_idx + 2*(stride)];

#define READ_SPINOR_SINGLE_DOWN(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = spinor[sp_idx + 3*(stride)]; \
  float4 I4 = spinor[sp_idx + 4*(stride)]; \
  float4 I5 = spinor[sp_idx + 5*(stride)];

#define READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]); \
  float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]); \
  float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]); \
  float C = (spinor ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]); \
  float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]); \
  float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]); \
  float C = (spinor ## Norm)[norm_idx]; \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = short42float4(spinor[sp_idx + 3*stride]); \
  float4 I4 = short42float4(spinor[sp_idx + 4*stride]); \
  float4 I5 = short42float4(spinor[sp_idx + 5*stride]); \
  float C = (spinor ## Norm)[norm_idx]; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)

#define READ_ACCUM_DOUBLE(spinor, stride) \
  double2 accum0 = spinor[sid + 0*stride]; \
  double2 accum1 = spinor[sid + 1*stride]; \
  double2 accum2 = spinor[sid + 2*stride]; \
  double2 accum3 = spinor[sid + 3*stride]; \
  double2 accum4 = spinor[sid + 4*stride]; \
  double2 accum5 = spinor[sid + 5*stride]; \
  double2 accum6 = spinor[sid + 6*stride]; \
  double2 accum7 = spinor[sid + 7*stride]; \
  double2 accum8 = spinor[sid + 8*stride]; \
  double2 accum9 = spinor[sid + 9*stride]; \
  double2 accum10 = spinor[sid + 10*stride]; \
  double2 accum11 = spinor[sid + 11*stride];

#define READ_ACCUM_SINGLE(spinor, stride) \
  float4 accum0 = spinor[sid + 0*(stride)]; \
  float4 accum1 = spinor[sid + 1*(stride)]; \
  float4 accum2 = spinor[sid + 2*(stride)]; \
  float4 accum3 = spinor[sid + 3*(stride)]; \
  float4 accum4 = spinor[sid + 4*(stride)]; \
  float4 accum5 = spinor[sid + 5*(stride)];

#define READ_ACCUM_HALF_(spinor, stride) \
  float4 accum0 = short42float4(spinor[sid + 0*stride]); \
  float4 accum1 = short42float4(spinor[sid + 1*stride]); \
  float4 accum2 = short42float4(spinor[sid + 2*stride]); \
  float4 accum3 = short42float4(spinor[sid + 3*stride]); \
  float4 accum4 = short42float4(spinor[sid + 4*stride]); \
  float4 accum5 = short42float4(spinor[sid + 5*stride]); \
  float C = (spinor ## Norm)[sid]; \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

#define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
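
/* Illustrative note (reader's gloss, hedged): the HALF read macros above
   reconstruct a half-precision field from short4 data plus a per-site float
   norm.  short42float4() and short2float() are defined elsewhere in QUDA and
   presumably map a short s to roughly s/MAX_SHORT, so each reconstructed
   component is approximately norm[norm_idx] * s / MAX_SHORT.  The two-level
   READ_SPINOR_HALF_ / READ_SPINOR_HALF definitions are the usual
   double-expansion idiom: the outer wrapper forces the `spinor` argument to
   expand before `## Norm` is pasted onto it, so passing e.g. `in` produces
   the matching norm-array name `inNorm`. */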

#define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

#define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
  double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));

#define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
  double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));

#define READ_ACCUM_DOUBLE_TEX(spinor, stride) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(stride)); \
  double2 accum3 = fetch_double2((spinor), sid + 3*(stride)); \
  double2 accum4 = fetch_double2((spinor), sid + 4*(stride)); \
  double2 accum5 = fetch_double2((spinor), sid + 5*(stride)); \
  double2 accum6 = fetch_double2((spinor), sid + 6*(stride)); \
  double2 accum7 = fetch_double2((spinor), sid + 7*(stride)); \
  double2 accum8 = fetch_double2((spinor), sid + 8*(stride)); \
  double2 accum9 = fetch_double2((spinor), sid + 9*(stride)); \
  double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
  double2 accum11 = fetch_double2((spinor), sid + 11*(stride));

#define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride)); \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride)); \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride)); \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride)); \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride)); \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride));

#define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride)); \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride)); \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride));

#define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride)); \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride)); \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride));

#define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride)); \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride)); \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride)); \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride)); \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride)); \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride)); \
  float C = tex1Dfetch((spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride)); \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride)); \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride)); \
  float C = tex1Dfetch((spinor ## Norm), norm_idx); \
  I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
  I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
  I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;

#define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx) \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride)); \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride)); \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride)); \
  float C = tex1Dfetch((spinor ## Norm), norm_idx); \
  I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
  I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;

#define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx) \
  READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)

#define READ_ACCUM_SINGLE_TEX(spinor, stride) \
  float4 accum0 = tex1Dfetch((spinor), sid + 0*(stride)); \
  float4 accum1 = tex1Dfetch((spinor), sid + 1*(stride)); \
  float4 accum2 = tex1Dfetch((spinor), sid + 2*(stride)); \
  float4 accum3 = tex1Dfetch((spinor), sid + 3*(stride)); \
  float4 accum4 = tex1Dfetch((spinor), sid + 4*(stride)); \
  float4 accum5 = tex1Dfetch((spinor), sid + 5*(stride));

#define READ_ACCUM_HALF_TEX_(spinor, stride) \
  float4 accum0 = tex1Dfetch((spinor), sid + 0*(stride)); \
  float4 accum1 = tex1Dfetch((spinor), sid + 1*(stride)); \
  float4 accum2 = tex1Dfetch((spinor), sid + 2*(stride)); \
  float4 accum3 = tex1Dfetch((spinor), sid + 3*(stride)); \
  float4 accum4 = tex1Dfetch((spinor), sid + 4*(stride)); \
  float4 accum5 = tex1Dfetch((spinor), sid + 5*(stride)); \
  float C = tex1Dfetch((spinor ## Norm), sid); \
  accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
  accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
  accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
  accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
  accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;

#define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
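
/* Illustrative note (hedged): the *_TEX variants above are drop-in
   replacements for the direct-load macros that read through the texture
   path instead, using tex1Dfetch for float4/float data and the fetch_double2
   helper for double2 data (presumably because doubles cannot be fetched from
   textures directly).  In every variant a Wilson spinor site holds
   4 spins x 3 colors = 12 complex numbers, stored as 12 double2 or
   6 float4/short4 words separated by `stride` and addressed by the site
   index `sp_idx` (or `sid` for the accumulate reads). */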

#define WRITE_SPINOR_DOUBLE2(stride) \
  out[0*(stride)+sid] = make_double2(o00_re, o00_im); \
  out[1*(stride)+sid] = make_double2(o01_re, o01_im); \
  out[2*(stride)+sid] = make_double2(o02_re, o02_im); \
  out[3*(stride)+sid] = make_double2(o10_re, o10_im); \
  out[4*(stride)+sid] = make_double2(o11_re, o11_im); \
  out[5*(stride)+sid] = make_double2(o12_re, o12_im); \
  out[6*(stride)+sid] = make_double2(o20_re, o20_im); \
  out[7*(stride)+sid] = make_double2(o21_re, o21_im); \
  out[8*(stride)+sid] = make_double2(o22_re, o22_im); \
  out[9*(stride)+sid] = make_double2(o30_re, o30_im); \
  out[10*(stride)+sid] = make_double2(o31_re, o31_im); \
  out[11*(stride)+sid] = make_double2(o32_re, o32_im);

#define WRITE_SPINOR_FLOAT4(stride) \
  out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
  out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
  out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
  out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
  out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
  out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);

#define WRITE_SPINOR_SHORT4(stride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  out[sid+0*(stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
  out[sid+1*(stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
  out[sid+2*(stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
  out[sid+3*(stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
  out[sid+4*(stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
  out[sid+5*(stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);

#if (__COMPUTE_CAPABILITY__ >= 200)

#define WRITE_SPINOR_DOUBLE2_STR(stride) \
  store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im); \
  store_streaming_double2(&out[3*sp_stride+sid], o10_re, o10_im); \
  store_streaming_double2(&out[4*sp_stride+sid], o11_re, o11_im); \
  store_streaming_double2(&out[5*sp_stride+sid], o12_re, o12_im); \
  store_streaming_double2(&out[6*sp_stride+sid], o20_re, o20_im); \
  store_streaming_double2(&out[7*sp_stride+sid], o21_re, o21_im); \
  store_streaming_double2(&out[8*sp_stride+sid], o22_re, o22_im); \
  store_streaming_double2(&out[9*sp_stride+sid], o30_re, o30_im); \
  store_streaming_double2(&out[10*sp_stride+sid], o31_re, o31_im); \
  store_streaming_double2(&out[11*sp_stride+sid], o32_re, o32_im);

#define WRITE_SPINOR_FLOAT4_STR(stride) \
  store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
  store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
  store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
  store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
  store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
  store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);

#define WRITE_SPINOR_SHORT4_STR(stride) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \
  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \
  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \
  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \
  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \
  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \
  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \
  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \
  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \
  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c3 = fmaxf(c6, c7); \
  c4 = fmaxf(c8, c9); \
  c5 = fmaxf(c10, c11); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \
  o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \
  o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
  o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
  o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
  store_streaming_short4(&out[0*(stride)+sid], (short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
  store_streaming_short4(&out[1*(stride)+sid], (short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
  store_streaming_short4(&out[2*(stride)+sid], (short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
  store_streaming_short4(&out[3*(stride)+sid], (short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
  store_streaming_short4(&out[4*(stride)+sid], (short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
  store_streaming_short4(&out[5*(stride)+sid], (short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);

#else
#define WRITE_SPINOR_DOUBLE2_STR(stride) WRITE_SPINOR_DOUBLE2(stride)
#define WRITE_SPINOR_FLOAT4_STR(stride) WRITE_SPINOR_FLOAT4(stride)
#define WRITE_SPINOR_SHORT4_STR(stride) WRITE_SPINOR_SHORT4(stride)
#endif
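
/* Illustrative note (hedged): WRITE_SPINOR_SHORT4 above reduces the 24
   output components to a single per-site norm c0 = max_i |o_i| with the
   fmaxf tree, stores it in outNorm[sid], rescales every component by
   scale = MAX_SHORT / c0 (fast division via __fdividef), and truncates to
   short4.  For example, the component equal to the maximum maps to
   +/-MAX_SHORT and is reconstructed on read as roughly norm * 1.0.  The
   *_STR variants use the store_streaming_* helpers, available on sm_20 and
   newer, to write with a streaming hint and so reduce cache pollution, as
   the comment on the staggered versions below also notes. */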

// macros used for exterior Wilson Dslash kernels and face packing

#define READ_HALF_SPINOR READ_SPINOR_UP

#define WRITE_HALF_SPINOR_DOUBLE2(stride, sid) \
  out[0*(stride)+sid] = make_double2(a0_re, a0_im); \
  out[1*(stride)+sid] = make_double2(a1_re, a1_im); \
  out[2*(stride)+sid] = make_double2(a2_re, a2_im); \
  out[3*(stride)+sid] = make_double2(b0_re, b0_im); \
  out[4*(stride)+sid] = make_double2(b1_re, b1_im); \
  out[5*(stride)+sid] = make_double2(b2_re, b2_im);

#define WRITE_HALF_SPINOR_FLOAT4(stride, sid) \
  out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im); \
  out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im); \
  out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);

#define WRITE_HALF_SPINOR_SHORT4(stride, sid) \
  float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im)); \
  float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im)); \
  float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im)); \
  float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im)); \
  float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im)); \
  float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im)); \
  c0 = fmaxf(c0, c1); \
  c1 = fmaxf(c2, c3); \
  c2 = fmaxf(c4, c5); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  outNorm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale; \
  a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale; \
  b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale; \
  out[sid+0*(stride)] = make_short4((short)a0_re, (short)a0_im, (short)a1_re, (short)a1_im); \
  out[sid+1*(stride)] = make_short4((short)a2_re, (short)a2_im, (short)b0_re, (short)b0_im); \
  out[sid+2*(stride)] = make_short4((short)b1_re, (short)b1_im, (short)b2_re, (short)b2_im);

/************* the following is used by staggered *****************/

#define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
  double2 I0 = fetch_double2((spinor), idx + 0*mystride); \
  double2 I1 = fetch_double2((spinor), idx + 1*mystride); \
  double2 I2 = fetch_double2((spinor), idx + 2*mystride);

#define READ_3RD_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride) \
  double2 T0 = fetch_double2((spinor), idx + 0*mystride); \
  double2 T1 = fetch_double2((spinor), idx + 1*mystride); \
  double2 T2 = fetch_double2((spinor), idx + 2*mystride);

#define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
  float2 I0 = tex1Dfetch((spinor), idx + 0*mystride); \
  float2 I1 = tex1Dfetch((spinor), idx + 1*mystride); \
  float2 I2 = tex1Dfetch((spinor), idx + 2*mystride);

#define READ_3RD_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride) \
  float2 T0 = tex1Dfetch((spinor), idx + 0*mystride); \
  float2 T1 = tex1Dfetch((spinor), idx + 1*mystride); \
  float2 T2 = tex1Dfetch((spinor), idx + 2*mystride);

#define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
  float2 I0 = tex1Dfetch((spinor), idx + 0*mystride); \
  float2 I1 = tex1Dfetch((spinor), idx + 1*mystride); \
  float2 I2 = tex1Dfetch((spinor), idx + 2*mystride); \
  { \
    float C = tex1Dfetch((spinorTexHalfNorm), norm_idx1); \
    I0.x *= C; I0.y *= C; \
    I1.x *= C; I1.y *= C; \
    I2.x *= C; I2.y *= C; }

#define READ_3RD_NBR_SPINOR_HALF_TEX(spinor, idx, mystride) \
  float2 T0 = tex1Dfetch((spinor), idx + 0*mystride); \
  float2 T1 = tex1Dfetch((spinor), idx + 1*mystride); \
  float2 T2 = tex1Dfetch((spinor), idx + 2*mystride); \
  { \
    float C = tex1Dfetch((spinorTexHalfNorm), norm_idx3); \
    T0.x *= C; T0.y *= C; \
    T1.x *= C; T1.y *= C; \
    T2.x *= C; T2.y *= C; }

#define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
  double2 I0 = spinor[idx + 0*mystride]; \
  double2 I1 = spinor[idx + 1*mystride]; \
  double2 I2 = spinor[idx + 2*mystride];

#define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \
  double2 T0 = spinor[idx + 0*mystride]; \
  double2 T1 = spinor[idx + 1*mystride]; \
  double2 T2 = spinor[idx + 2*mystride];

#define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
  float2 I0 = spinor[idx + 0*mystride]; \
  float2 I1 = spinor[idx + 1*mystride]; \
  float2 I2 = spinor[idx + 2*mystride];

#define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride) \
  float2 T0 = spinor[idx + 0*mystride]; \
  float2 T1 = spinor[idx + 1*mystride]; \
  float2 T2 = spinor[idx + 2*mystride];

#define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \
  float2 I0, I1, I2; \
  { \
    short2 S0 = in[idx + 0*mystride]; \
    short2 S1 = in[idx + 1*mystride]; \
    short2 S2 = in[idx + 2*mystride]; \
    float C = inNorm[idx]; \
    I0.x = C*short2float(S0.x); I0.y = C*short2float(S0.y); \
    I1.x = C*short2float(S1.x); I1.y = C*short2float(S1.y); \
    I2.x = C*short2float(S2.x); I2.y = C*short2float(S2.y); \
  }

#define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride) \
  float2 T0, T1, T2; \
  { \
    short2 S0 = in[idx + 0*mystride]; \
    short2 S1 = in[idx + 1*mystride]; \
    short2 S2 = in[idx + 2*mystride]; \
    float C = inNorm[idx]; \
    T0.x = C*short2float(S0.x); T0.y = C*short2float(S0.y); \
    T1.x = C*short2float(S1.x); T1.y = C*short2float(S1.y); \
    T2.x = C*short2float(S2.x); T2.y = C*short2float(S2.y); \
  }

#define WRITE_ST_SPINOR_DOUBLE2(out) \
  out[0*sp_stride+sid] = make_double2(o00_re, o00_im); \
  out[1*sp_stride+sid] = make_double2(o01_re, o01_im); \
  out[2*sp_stride+sid] = make_double2(o02_re, o02_im);

#define WRITE_ST_SPINOR_FLOAT2(out) \
  out[0*sp_stride+sid] = make_float2(o00_re, o00_im); \
  out[1*sp_stride+sid] = make_float2(o01_re, o01_im); \
  out[2*sp_stride+sid] = make_float2(o02_re, o02_im);

#define WRITE_ST_SPINOR_SHORT2(out) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  out[sid+0*sp_stride] = make_short2((short)o00_re, (short)o00_im); \
  out[sid+1*sp_stride] = make_short2((short)o01_re, (short)o01_im); \
  out[sid+2*sp_stride] = make_short2((short)o02_re, (short)o02_im);
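
/* Illustrative note (hedged): a staggered (ST) field carries only 3 color
   components and no spin index, i.e. 3 complex numbers per site, so each of
   the reads and writes above touches 3 double2/float2/short2 words.  I0-I2
   hold the first-neighbor spinor and T0-T2 the third-neighbor spinor,
   presumably for the long-link term of the improved staggered operator; in
   half precision the per-site norms come from spinorTexHalfNorm / inNorm,
   indexed by norm_idx1 and norm_idx3 for the two neighbor distances. */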

// Non-cache writes to minimize cache pollution
#if (__COMPUTE_CAPABILITY__ >= 200)

#define WRITE_ST_SPINOR_DOUBLE2_STR(out) \
  store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im); \
  store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im); \
  store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im);

#define WRITE_ST_SPINOR_FLOAT2_STR(out) \
  store_streaming_float2(&out[0*sp_stride+sid], o00_re, o00_im); \
  store_streaming_float2(&out[1*sp_stride+sid], o01_re, o01_im); \
  store_streaming_float2(&out[2*sp_stride+sid], o02_re, o02_im);

#define WRITE_ST_SPINOR_SHORT2_STR(out) \
  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \
  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \
  c0 = fmaxf(c0, c1); \
  c0 = fmaxf(c0, c2); \
  out ## Norm[sid] = c0; \
  float scale = __fdividef(MAX_SHORT, c0); \
  o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \
  o02_re *= scale; o02_im *= scale; \
  store_streaming_short2(&g_out[0*sp_stride+sid], (short)o00_re, (short)o00_im); \
  store_streaming_short2(&g_out[1*sp_stride+sid], (short)o01_re, (short)o01_im); \
  store_streaming_short2(&g_out[2*sp_stride+sid], (short)o02_re, (short)o02_im);

#else

#define WRITE_ST_SPINOR_DOUBLE2_STR(out) WRITE_ST_SPINOR_DOUBLE2(out)
#define WRITE_ST_SPINOR_FLOAT2_STR(out) WRITE_ST_SPINOR_FLOAT2(out)
#define WRITE_ST_SPINOR_SHORT2_STR(out) WRITE_ST_SPINOR_SHORT2(out)

#endif

#define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor) { \
  double2 tmp0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
  double2 tmp1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
  double2 tmp2 = fetch_double2((spinor), sid + 2*(sp_stride)); \
  o00_re += tmp0.x; o00_im += tmp0.y; \
  o01_re += tmp1.x; o01_im += tmp1.y; \
  o02_re += tmp2.x; o02_im += tmp2.y; }

#define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor) { \
  float2 tmp0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
  float2 tmp1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
  float2 tmp2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
  o00_re += tmp0.x; o00_im += tmp0.y; \
  o01_re += tmp1.x; o01_im += tmp1.y; \
  o02_re += tmp2.x; o02_im += tmp2.y; }

#define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor) { \
  float2 tmp0 = tex1Dfetch((spinor), sid + 0*sp_stride); \
  float2 tmp1 = tex1Dfetch((spinor), sid + 1*sp_stride); \
  float2 tmp2 = tex1Dfetch((spinor), sid + 2*sp_stride); \
  float C = tex1Dfetch((spinor##Norm), sid); \
  o00_re += C*tmp0.x; o00_im += C*tmp0.y; \
  o01_re += C*tmp1.x; o01_im += C*tmp1.y; \
  o02_re += C*tmp2.x; o02_im += C*tmp2.y; }

#define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor) \
  READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor)

#define READ_AND_SUM_ST_SPINOR(spinor) \
  o00_re += spinor[0*sp_stride+sid].x; o00_im += spinor[0*sp_stride+sid].y; \
  o01_re += spinor[1*sp_stride+sid].x; o01_im += spinor[1*sp_stride+sid].y; \
  o02_re += spinor[2*sp_stride+sid].x; o02_im += spinor[2*sp_stride+sid].y;

#define READ_AND_SUM_ST_SPINOR_HALF_(spinor) \
  float C = spinor ## Norm[sid]; \
  o00_re += C*short2float(spinor[0*sp_stride + sid].x); \
  o00_im += C*short2float(spinor[0*sp_stride + sid].y); \
  o01_re += C*short2float(spinor[1*sp_stride + sid].x); \
  o01_im += C*short2float(spinor[1*sp_stride + sid].y); \
  o02_re += C*short2float(spinor[2*sp_stride + sid].x); \
  o02_im += C*short2float(spinor[2*sp_stride + sid].y);

#define READ_AND_SUM_ST_SPINOR_HALF(spinor) \
  READ_AND_SUM_ST_SPINOR_HALF_(spinor)

#define READ_ST_ACCUM_DOUBLE_TEX(spinor) \
  double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
  double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
  double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));

#define READ_ST_ACCUM_SINGLE_TEX(spinor) \
  float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride); \
  float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride); \
  float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride);

#define READ_ST_ACCUM_HALF_TEX(spinor) \
  float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride); \
  float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride); \
  float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride); \
  float C = tex1Dfetch((accumTexHalfNorm), sid); \
  accum0.x *= C; accum0.y *= C; \
  accum1.x *= C; accum1.y *= C; \
  accum2.x *= C; accum2.y *= C;

#define READ_ST_ACCUM_DOUBLE(spinor) \
  double2 accum0 = spinor[sid + 0*(sp_stride)]; \
  double2 accum1 = spinor[sid + 1*(sp_stride)]; \
  double2 accum2 = spinor[sid + 2*(sp_stride)];

#define READ_ST_ACCUM_SINGLE(spinor) \
  float2 accum0 = spinor[sid + 0*(sp_stride)]; \
  float2 accum1 = spinor[sid + 1*(sp_stride)]; \
  float2 accum2 = spinor[sid + 2*(sp_stride)];

#define READ_ST_ACCUM_HALF(spinor) \
  float2 accum0, accum1, accum2; \
  { \
    short2 S0 = x[sid + 0*sp_stride]; \
    short2 S1 = x[sid + 1*sp_stride]; \
    short2 S2 = x[sid + 2*sp_stride]; \
    float C = spinor##Norm[sid]; \
    accum0.x = C*short2float(S0.x); accum0.y = C*short2float(S0.y); \
    accum1.x = C*short2float(S1.x); accum1.y = C*short2float(S1.y); \
    accum2.x = C*short2float(S2.x); accum2.y = C*short2float(S2.y); \
  }
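
/* Illustrative note on the shared-memory macros below: each thread stages
   its DSLASH_SHARED_FLOATS_PER_THREAD spinor components in shared memory.
   The base-pointer arithmetic groups threads in bundles of SHARED_STRIDE, so
   a given component of consecutive threads occupies consecutive shared
   words (helping to avoid bank conflicts), while one thread's own components
   are SHARED_STRIDE words apart.  The SHARED_8_BYTE_WORD_SIZE branch selects
   between 4-byte (float) and 8-byte (float2) shared-memory accesses for the
   single-precision case. */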

#define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = reg##00_re; \
  sh[1*SHARED_STRIDE] = reg##00_im; \
  sh[2*SHARED_STRIDE] = reg##01_re; \
  sh[3*SHARED_STRIDE] = reg##01_im; \
  sh[4*SHARED_STRIDE] = reg##02_re; \
  sh[5*SHARED_STRIDE] = reg##02_im; \
  sh[6*SHARED_STRIDE] = reg##10_re; \
  sh[7*SHARED_STRIDE] = reg##10_im; \
  sh[8*SHARED_STRIDE] = reg##11_re; \
  sh[9*SHARED_STRIDE] = reg##11_im; \
  sh[10*SHARED_STRIDE] = reg##12_re; \
  sh[11*SHARED_STRIDE] = reg##12_im; \
  sh[12*SHARED_STRIDE] = reg##20_re; \
  sh[13*SHARED_STRIDE] = reg##20_im; \
  sh[14*SHARED_STRIDE] = reg##21_re; \
  sh[15*SHARED_STRIDE] = reg##21_im; \
  sh[16*SHARED_STRIDE] = reg##22_re; \
  sh[17*SHARED_STRIDE] = reg##22_im; \
  sh[18*SHARED_STRIDE] = reg##30_re; \
  sh[19*SHARED_STRIDE] = reg##30_im; \
  sh[20*SHARED_STRIDE] = reg##31_re; \
  sh[21*SHARED_STRIDE] = reg##31_im; \
  sh[22*SHARED_STRIDE] = reg##32_re; \
  sh[23*SHARED_STRIDE] = reg##32_im;

#define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL

#define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]); \
  double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]); \
  double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]); \
  double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
  double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
  double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
  double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);

#ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access

#define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL

#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
  float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
  float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
  float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
  float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
  float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);

#else // 8-byte shared memory words

#define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im); \
  sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im); \
  sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im); \
  sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im); \
  sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im); \
  sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im); \
  sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im); \
  sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im); \
  sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im); \
  sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im); \
  sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im); \
  sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);

#define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz) \
  extern __shared__ char s_data[]; \
  float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
    ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
  float2 tmp1, tmp2; \
  tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
  tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y);

#endif
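
/* Usage sketch (hypothetical, for illustration only; the identifiers
   spinorTexDouble, sp_idx, sp_stride, out, outNorm and the READ_SPINOR /
   WRITE_SPINOR selection stand in for whatever the including dslash kernel
   actually defines).  A kernel typically binds one read and one write macro
   per precision and then uses the fixed register names these macros assume
   to be in scope:

     // #define READ_SPINOR  READ_SPINOR_DOUBLE_TEX
     // #define WRITE_SPINOR WRITE_SPINOR_DOUBLE2
     // ...
     // READ_SPINOR(spinorTexDouble, sp_stride, sp_idx, sp_idx); // fills I0..I11
     //   ... apply gauge links and spin projectors to I0..I11,
     //       accumulating into o00_re ... o32_im ...
     // WRITE_SPINOR(sp_stride); // stores o00..o32 to out (and outNorm in half precision)
*/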