QUDA: quda/lib/io_spinor.h Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #define READ_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx)    \
00002   double2 I0 = spinor[sp_idx + 0*(stride)];   \
00003   double2 I1 = spinor[sp_idx + 1*(stride)];   \
00004   double2 I2 = spinor[sp_idx + 2*(stride)];   \
00005   double2 I3 = spinor[sp_idx + 3*(stride)];   \
00006   double2 I4 = spinor[sp_idx + 4*(stride)];   \
00007   double2 I5 = spinor[sp_idx + 5*(stride)];   \
00008   double2 I6 = spinor[sp_idx + 6*(stride)];   \
00009   double2 I7 = spinor[sp_idx + 7*(stride)];   \
00010   double2 I8 = spinor[sp_idx + 8*(stride)];   \
00011   double2 I9 = spinor[sp_idx + 9*(stride)];   \
00012   double2 I10 = spinor[sp_idx + 10*(stride)]; \
00013   double2 I11 = spinor[sp_idx + 11*(stride)];
00014 
00015 #define READ_SPINOR_DOUBLE_UP(spinor, stride, sp_idx, norm_idx) \
00016   double2 I0 = spinor[sp_idx + 0*(stride)];   \
00017   double2 I1 = spinor[sp_idx + 1*(stride)];   \
00018   double2 I2 = spinor[sp_idx + 2*(stride)];   \
00019   double2 I3 = spinor[sp_idx + 3*(stride)];   \
00020   double2 I4 = spinor[sp_idx + 4*(stride)];   \
00021   double2 I5 = spinor[sp_idx + 5*(stride)];
00022 
00023 #define READ_SPINOR_DOUBLE_DOWN(spinor, stride, sp_idx, norm_idx)      \
00024   double2 I6 = spinor[sp_idx + 6*(stride)];   \
00025   double2 I7 = spinor[sp_idx + 7*(stride)];   \
00026   double2 I8 = spinor[sp_idx + 8*(stride)];   \
00027   double2 I9 = spinor[sp_idx + 9*(stride)];   \
00028   double2 I10 = spinor[sp_idx + 10*(stride)]; \
00029   double2 I11 = spinor[sp_idx + 11*(stride)];
00030 
00031 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx)       \
00032   float4 I0 = spinor[sp_idx + 0*(stride)];   \
00033   float4 I1 = spinor[sp_idx + 1*(stride)];   \
00034   float4 I2 = spinor[sp_idx + 2*(stride)];   \
00035   float4 I3 = spinor[sp_idx + 3*(stride)];   \
00036   float4 I4 = spinor[sp_idx + 4*(stride)];   \
00037   float4 I5 = spinor[sp_idx + 5*(stride)];
00038 
00039 #define READ_SPINOR_SINGLE_UP(spinor, stride, sp_idx, norm_idx)    \
00040   float4 I0 = spinor[sp_idx + 0*(stride)];   \
00041   float4 I1 = spinor[sp_idx + 1*(stride)];   \
00042   float4 I2 = spinor[sp_idx + 2*(stride)];   \
00043 
00044 #define READ_SPINOR_SINGLE_DOWN(spinor, stride, sp_idx, norm_idx)  \
00045   float4 I3 = spinor[sp_idx + 3*(stride)];   \
00046   float4 I4 = spinor[sp_idx + 4*(stride)];   \
00047   float4 I5 = spinor[sp_idx + 5*(stride)];
00048 
00049 #define READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)        \
00050   float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]);          \
00051   float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]);          \
00052   float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]);          \
00053   float4 I3 = short42float4(spinor[sp_idx + 3*(stride)]);          \
00054   float4 I4 = short42float4(spinor[sp_idx + 4*(stride)]);          \
00055   float4 I5 = short42float4(spinor[sp_idx + 5*(stride)]);          \
00056   float C = (spinor ## Norm)[norm_idx];                    \
00057   I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;                      \
00058   I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;                      \
00059   I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;                      \
00060   I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;                      \
00061   I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;                      \
00062   I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;                                        
00063 
00064 #define READ_SPINOR_HALF(spinor, stride, sp_idx, norm_idx)         \
00065   READ_SPINOR_HALF_(spinor, stride, sp_idx, norm_idx)              
00066 
00067 #define READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)        \
00068   float4 I0 = short42float4(spinor[sp_idx + 0*(stride)]);             \
00069   float4 I1 = short42float4(spinor[sp_idx + 1*(stride)]);             \
00070   float4 I2 = short42float4(spinor[sp_idx + 2*(stride)]);             \
00071   float C = (spinor ## Norm)[norm_idx];                               \
00072   I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;                         \
00073   I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;                         \
00074   I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;                         \
00075 
00076 #define READ_SPINOR_HALF_UP(spinor, stride, sp_idx, norm_idx)   \
00077   READ_SPINOR_HALF_UP_(spinor, stride, sp_idx, norm_idx)                
00078 
00079 #define READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)        \
00080   float4 I3 = short42float4(spinor[sp_idx + 3*stride]);                 \
00081   float4 I4 = short42float4(spinor[sp_idx + 4*stride]);                 \
00082   float4 I5 = short42float4(spinor[sp_idx + 5*stride]);                 \
00083   float C = (spinor ## Norm)[norm_idx];                                 \
00084   I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;                           \
00085   I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;                           \
00086   I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;                                        
00087 
00088 #define READ_SPINOR_HALF_DOWN(spinor, stride, sp_idx, norm_idx) \
00089   READ_SPINOR_HALF_DOWN_(spinor, stride, sp_idx, norm_idx)              
00090 
00091 #define READ_ACCUM_DOUBLE(spinor, stride)     \
00092   double2 accum0 = spinor[sid + 0*stride];   \
00093   double2 accum1 = spinor[sid + 1*stride];   \
00094   double2 accum2 = spinor[sid + 2*stride];   \
00095   double2 accum3 = spinor[sid + 3*stride];   \
00096   double2 accum4 = spinor[sid + 4*stride];   \
00097   double2 accum5 = spinor[sid + 5*stride];   \
00098   double2 accum6 = spinor[sid + 6*stride];   \
00099   double2 accum7 = spinor[sid + 7*stride];   \
00100   double2 accum8 = spinor[sid + 8*stride];   \
00101   double2 accum9 = spinor[sid + 9*stride];  \
00102   double2 accum10 = spinor[sid + 10*stride]; \
00103   double2 accum11 = spinor[sid + 11*stride];    
00104 
00105 #define READ_ACCUM_SINGLE(spinor, stride)                       \
00106   float4 accum0 = spinor[sid + 0*(stride)];       \
00107   float4 accum1 = spinor[sid + 1*(stride)];       \
00108   float4 accum2 = spinor[sid + 2*(stride)];       \
00109   float4 accum3 = spinor[sid + 3*(stride)];       \
00110   float4 accum4 = spinor[sid + 4*(stride)];       \
00111   float4 accum5 = spinor[sid + 5*(stride)]; 
00112 
00113 #define READ_ACCUM_HALF_(spinor, stride)                           \
00114   float4 accum0 = short42float4(spinor[sid + 0*stride]);           \
00115   float4 accum1 = short42float4(spinor[sid + 1*stride]);           \
00116   float4 accum2 = short42float4(spinor[sid + 2*stride]);           \
00117   float4 accum3 = short42float4(spinor[sid + 3*stride]);           \
00118   float4 accum4 = short42float4(spinor[sid + 4*stride]);           \
00119   float4 accum5 = short42float4(spinor[sid + 5*stride]);           \
00120   float C = (spinor ## Norm)[sid];                                 \
00121   accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C;      \
00122   accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C;      \
00123   accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C;      \
00124   accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C;      \
00125   accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C;      \
00126   accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;                                        
00127 
00128 #define READ_ACCUM_HALF(spinor, stride) READ_ACCUM_HALF_(spinor, stride)
00129 
00130 #define READ_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx)        \
00131   double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride));   \
00132   double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride));   \
00133   double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride));   \
00134   double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride));   \
00135   double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride));   \
00136   double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));   \
00137   double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride));   \
00138   double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride));   \
00139   double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride));   \
00140   double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride));   \
00141   double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
00142   double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));
00143 
00144 #define READ_SPINOR_DOUBLE_UP_TEX(spinor, stride, sp_idx, norm_idx) \
00145   double2 I0 = fetch_double2((spinor), sp_idx + 0*(stride));   \
00146   double2 I1 = fetch_double2((spinor), sp_idx + 1*(stride));   \
00147   double2 I2 = fetch_double2((spinor), sp_idx + 2*(stride));   \
00148   double2 I3 = fetch_double2((spinor), sp_idx + 3*(stride));   \
00149   double2 I4 = fetch_double2((spinor), sp_idx + 4*(stride));   \
00150   double2 I5 = fetch_double2((spinor), sp_idx + 5*(stride));
00151 
00152 #define READ_SPINOR_DOUBLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx)      \
00153   double2 I6 = fetch_double2((spinor), sp_idx + 6*(stride));   \
00154   double2 I7 = fetch_double2((spinor), sp_idx + 7*(stride));   \
00155   double2 I8 = fetch_double2((spinor), sp_idx + 8*(stride));   \
00156   double2 I9 = fetch_double2((spinor), sp_idx + 9*(stride));   \
00157   double2 I10 = fetch_double2((spinor), sp_idx + 10*(stride)); \
00158   double2 I11 = fetch_double2((spinor), sp_idx + 11*(stride));
00159 
00160 #define READ_ACCUM_DOUBLE_TEX(spinor, stride)                   \
00161   double2 accum0 = fetch_double2((spinor), sid + 0*(stride));   \
00162   double2 accum1 = fetch_double2((spinor), sid + 1*(stride));   \
00163   double2 accum2 = fetch_double2((spinor), sid + 2*(stride));   \
00164   double2 accum3 = fetch_double2((spinor), sid + 3*(stride));   \
00165   double2 accum4 = fetch_double2((spinor), sid + 4*(stride));   \
00166   double2 accum5 = fetch_double2((spinor), sid + 5*(stride));   \
00167   double2 accum6 = fetch_double2((spinor), sid + 6*(stride));   \
00168   double2 accum7 = fetch_double2((spinor), sid + 7*(stride));   \
00169   double2 accum8 = fetch_double2((spinor), sid + 8*(stride));   \
00170   double2 accum9 = fetch_double2((spinor), sid + 9*(stride));   \
00171   double2 accum10 = fetch_double2((spinor), sid + 10*(stride)); \
00172   double2 accum11 = fetch_double2((spinor), sid + 11*(stride)); 
00173 
00174 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx)           \
00175   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride));   \
00176   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride));   \
00177   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride));   \
00178   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride));   \
00179   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride));   \
00180   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride));
00181 
00182 #define READ_SPINOR_SINGLE_UP_TEX(spinor, stride, sp_idx, norm_idx)        \
00183   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride));   \
00184   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride));   \
00185   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride));   \
00186 
00187 #define READ_SPINOR_SINGLE_DOWN_TEX(spinor, stride, sp_idx, norm_idx)  \
00188   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride));   \
00189   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride));   \
00190   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride));
00191 
00192 #define READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx) \
00193   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride));   \
00194   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride));   \
00195   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride));   \
00196   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride));   \
00197   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride));   \
00198   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride));   \
00199   float C = tex1Dfetch((spinor ## Norm), norm_idx);           \
00200   I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;                 \
00201   I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;                 \
00202   I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;                 \
00203   I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;                 \
00204   I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;                 \
00205   I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;                                        
00206 
00207 #define READ_SPINOR_HALF_TEX(spinor, stride, sp_idx, norm_idx)     \
00208   READ_SPINOR_HALF_TEX_(spinor, stride, sp_idx, norm_idx)          \
00209 
00210 #define READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)      \
00211   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(stride));      \
00212   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(stride));      \
00213   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(stride));      \
00214   float C = tex1Dfetch((spinor ## Norm), norm_idx);           \
00215   I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;                 \
00216   I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;                 \
00217   I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;                 \
00218 
00219 #define READ_SPINOR_HALF_UP_TEX(spinor, stride, sp_idx, norm_idx) \
00220   READ_SPINOR_HALF_UP_TEX_(spinor, stride, sp_idx, norm_idx)      \
00221 
00222 #define READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)    \
00223   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(stride));      \
00224   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(stride));      \
00225   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(stride));      \
00226   float C = tex1Dfetch((spinor ## Norm), norm_idx);           \
00227   I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;                 \
00228   I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;                 \
00229   I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;                                        
00230 
00231 #define READ_SPINOR_HALF_DOWN_TEX(spinor, stride, sp_idx, norm_idx)     \
00232   READ_SPINOR_HALF_DOWN_TEX_(spinor, stride, sp_idx, norm_idx)  \
00233 
00234 #define READ_ACCUM_SINGLE_TEX(spinor, stride)                   \
00235   float4 accum0 = tex1Dfetch((spinor), sid + 0*(stride));       \
00236   float4 accum1 = tex1Dfetch((spinor), sid + 1*(stride));       \
00237   float4 accum2 = tex1Dfetch((spinor), sid + 2*(stride));       \
00238   float4 accum3 = tex1Dfetch((spinor), sid + 3*(stride));       \
00239   float4 accum4 = tex1Dfetch((spinor), sid + 4*(stride));       \
00240   float4 accum5 = tex1Dfetch((spinor), sid + 5*(stride)); 
00241 
00242 #define READ_ACCUM_HALF_TEX_(spinor, stride)                       \
00243   float4 accum0 = tex1Dfetch((spinor), sid + 0*(stride));          \
00244   float4 accum1 = tex1Dfetch((spinor), sid + 1*(stride));          \
00245   float4 accum2 = tex1Dfetch((spinor), sid + 2*(stride));          \
00246   float4 accum3 = tex1Dfetch((spinor), sid + 3*(stride));          \
00247   float4 accum4 = tex1Dfetch((spinor), sid + 4*(stride));          \
00248   float4 accum5 = tex1Dfetch((spinor), sid + 5*(stride));          \
00249   float C = tex1Dfetch((spinor ## Norm), sid);                     \
00250   accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C;      \
00251   accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C;      \
00252   accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C;      \
00253   accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C;      \
00254   accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C;      \
00255   accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;                                        
00256 
00257 #define READ_ACCUM_HALF_TEX(spinor, stride) READ_ACCUM_HALF_TEX_(spinor, stride)
00258 
00259 
00260 #define WRITE_SPINOR_DOUBLE2(stride)                       \
00261   out[0*(stride)+sid] = make_double2(o00_re, o00_im);      \
00262   out[1*(stride)+sid] = make_double2(o01_re, o01_im);      \
00263   out[2*(stride)+sid] = make_double2(o02_re, o02_im);      \
00264   out[3*(stride)+sid] = make_double2(o10_re, o10_im);      \
00265   out[4*(stride)+sid] = make_double2(o11_re, o11_im);      \
00266   out[5*(stride)+sid] = make_double2(o12_re, o12_im);      \
00267   out[6*(stride)+sid] = make_double2(o20_re, o20_im);      \
00268   out[7*(stride)+sid] = make_double2(o21_re, o21_im);      \
00269   out[8*(stride)+sid] = make_double2(o22_re, o22_im);      \
00270   out[9*(stride)+sid] = make_double2(o30_re, o30_im);      \
00271   out[10*(stride)+sid] = make_double2(o31_re, o31_im);     \
00272   out[11*(stride)+sid] = make_double2(o32_re, o32_im);           
00273 
00274 #define WRITE_SPINOR_FLOAT4(stride)                                  \
00275   out[0*(stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
00276   out[1*(stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
00277   out[2*(stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
00278   out[3*(stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
00279   out[4*(stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
00280   out[5*(stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
00281 
00282 #define WRITE_SPINOR_SHORT4(stride)                                     \
00283   float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));                       \
00284   float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im));                       \
00285   float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im));                       \
00286   float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im));                       \
00287   float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im));                       \
00288   float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im));                       \
00289   float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im));                       \
00290   float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im));                       \
00291   float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im));                       \
00292   float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im));                       \
00293   float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im));                      \
00294   float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im));                      \
00295   c0 = fmaxf(c0, c1);                                                   \
00296   c1 = fmaxf(c2, c3);                                                   \
00297   c2 = fmaxf(c4, c5);                                                   \
00298   c3 = fmaxf(c6, c7);                                                   \
00299   c4 = fmaxf(c8, c9);                                                   \
00300   c5 = fmaxf(c10, c11);                                                 \
00301   c0 = fmaxf(c0, c1);                                                   \
00302   c1 = fmaxf(c2, c3);                                                   \
00303   c2 = fmaxf(c4, c5);                                                   \
00304   c0 = fmaxf(c0, c1);                                                   \
00305   c0 = fmaxf(c0, c2);                                                   \
00306   outNorm[sid] = c0;                                                    \
00307   float scale = __fdividef(MAX_SHORT, c0);                              \
00308   o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale;   \
00309   o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale;   \
00310   o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale;   \
00311   o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale;   \
00312   o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale;   \
00313   o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale;   \
00314   out[sid+0*(stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
00315   out[sid+1*(stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
00316   out[sid+2*(stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
00317   out[sid+3*(stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
00318   out[sid+4*(stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
00319   out[sid+5*(stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
00320 
00321 #if (__COMPUTE_CAPABILITY__ >= 200)
00322 #define WRITE_SPINOR_DOUBLE2_STR(stride)                                \
00323   store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im);       \
00324   store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im);       \
00325   store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im);       \
00326   store_streaming_double2(&out[3*sp_stride+sid], o10_re, o10_im);       \
00327   store_streaming_double2(&out[4*sp_stride+sid], o11_re, o11_im);       \
00328   store_streaming_double2(&out[5*sp_stride+sid], o12_re, o12_im);       \
00329   store_streaming_double2(&out[6*sp_stride+sid], o20_re, o20_im);       \
00330   store_streaming_double2(&out[7*sp_stride+sid], o21_re, o21_im);       \
00331   store_streaming_double2(&out[8*sp_stride+sid], o22_re, o22_im);       \
00332   store_streaming_double2(&out[9*sp_stride+sid], o30_re, o30_im);       \
00333   store_streaming_double2(&out[10*sp_stride+sid], o31_re, o31_im);      \
00334   store_streaming_double2(&out[11*sp_stride+sid], o32_re, o32_im);
00335 
00336 #define WRITE_SPINOR_FLOAT4_STR(stride)                                 \
00337   store_streaming_float4(&out[0*(stride)+sid], o00_re, o00_im, o01_re, o01_im); \
00338   store_streaming_float4(&out[1*(stride)+sid], o02_re, o02_im, o10_re, o10_im); \
00339   store_streaming_float4(&out[2*(stride)+sid], o11_re, o11_im, o12_re, o12_im); \
00340   store_streaming_float4(&out[3*(stride)+sid], o20_re, o20_im, o21_re, o21_im); \
00341   store_streaming_float4(&out[4*(stride)+sid], o22_re, o22_im, o30_re, o30_im); \
00342   store_streaming_float4(&out[5*(stride)+sid], o31_re, o31_im, o32_re, o32_im);
00343 
00344 #define WRITE_SPINOR_SHORT4_STR(stride)         \
00345   float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));       \
00346   float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im));       \
00347   float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im));       \
00348   float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im));       \
00349   float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im));       \
00350   float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im));       \
00351   float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im));       \
00352   float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im));       \
00353   float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im));       \
00354   float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im));       \
00355   float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im));      \
00356   float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im));      \
00357   c0 = fmaxf(c0, c1);                                   \
00358   c1 = fmaxf(c2, c3);                                   \
00359   c2 = fmaxf(c4, c5);                                   \
00360   c3 = fmaxf(c6, c7);                                   \
00361   c4 = fmaxf(c8, c9);                                   \
00362   c5 = fmaxf(c10, c11);                                 \
00363   c0 = fmaxf(c0, c1);                                   \
00364   c1 = fmaxf(c2, c3);                                   \
00365   c2 = fmaxf(c4, c5);                                   \
00366   c0 = fmaxf(c0, c1);                                   \
00367   c0 = fmaxf(c0, c2);                                   \
00368   outNorm[sid] = c0;                                    \
00369   float scale = __fdividef(MAX_SHORT, c0);                          \
00370   o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale;   \
00371   o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale;   \
00372   o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale;   \
00373   o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale;   \
00374   o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale;   \
00375   o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale;   \
00376   store_streaming_short4(&out[0*(stride)+sid], (short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
00377   store_streaming_short4(&out[1*(stride)+sid], (short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
00378   store_streaming_short4(&out[2*(stride)+sid], (short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
00379   store_streaming_short4(&out[3*(stride)+sid], (short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
00380   store_streaming_short4(&out[4*(stride)+sid], (short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
00381   store_streaming_short4(&out[5*(stride)+sid], (short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
00382 #else
00383 #define WRITE_SPINOR_DOUBLE2_STR(stride) WRITE_SPINOR_DOUBLE2(stride)
00384 #define WRITE_SPINOR_FLOAT4_STR(stride) WRITE_SPINOR_FLOAT4(stride)
00385 #define WRITE_SPINOR_SHORT4_STR(stride) WRITE_SPINOR_SHORT4(stride)
00386 #endif
00387 
00388 // macros used for exterior Wilson Dslash kernels and face packing
00389 
00390 #define READ_HALF_SPINOR READ_SPINOR_UP
00391 
00392 #define WRITE_HALF_SPINOR_DOUBLE2(stride, sid)       \
00393   out[0*(stride)+sid] = make_double2(a0_re, a0_im);  \
00394   out[1*(stride)+sid] = make_double2(a1_re, a1_im);  \
00395   out[2*(stride)+sid] = make_double2(a2_re, a2_im);  \
00396   out[3*(stride)+sid] = make_double2(b0_re, b0_im);  \
00397   out[4*(stride)+sid] = make_double2(b1_re, b1_im);  \
00398   out[5*(stride)+sid] = make_double2(b2_re, b2_im);
00399 
00400 #define WRITE_HALF_SPINOR_FLOAT4(stride, sid)                     \
00401   out[0*(stride)+sid] = make_float4(a0_re, a0_im, a1_re, a1_im);  \
00402   out[1*(stride)+sid] = make_float4(a2_re, a2_im, b0_re, b0_im);  \
00403   out[2*(stride)+sid] = make_float4(b1_re, b1_im, b2_re, b2_im);
00404 
00405 #define WRITE_HALF_SPINOR_SHORT4(stride, sid)                           \
00406   float c0 = fmaxf(fabsf(a0_re), fabsf(a0_im));                         \
00407   float c1 = fmaxf(fabsf(a1_re), fabsf(a1_im));                         \
00408   float c2 = fmaxf(fabsf(a2_re), fabsf(a2_im));                         \
00409   float c3 = fmaxf(fabsf(b0_re), fabsf(b0_im));                         \
00410   float c4 = fmaxf(fabsf(b1_re), fabsf(b1_im));                         \
00411   float c5 = fmaxf(fabsf(b2_re), fabsf(b2_im));                         \
00412   c0 = fmaxf(c0, c1);                                                   \
00413   c1 = fmaxf(c2, c3);                                                   \
00414   c2 = fmaxf(c4, c5);                                                   \
00415   c0 = fmaxf(c0, c1);                                                   \
00416   c0 = fmaxf(c0, c2);                                                   \
00417   outNorm[sid] = c0;                                                    \
00418   float scale = __fdividef(MAX_SHORT, c0);                              \
00419   a0_re *= scale; a0_im *= scale; a1_re *= scale; a1_im *= scale;       \
00420   a2_re *= scale; a2_im *= scale; b0_re *= scale; b0_im *= scale;       \
00421   b1_re *= scale; b1_im *= scale; b2_re *= scale; b2_im *= scale;       \
00422   out[sid+0*(stride)] = make_short4((short)a0_re, (short)a0_im, (short)a1_re, (short)a1_im); \
00423   out[sid+1*(stride)] = make_short4((short)a2_re, (short)a2_im, (short)b0_re, (short)b0_im); \
00424   out[sid+2*(stride)] = make_short4((short)b1_re, (short)b1_im, (short)b2_re, (short)b2_im);
00425 
00426 
00427 /************* the following is used by staggered *****************/
00428 
00429 #define READ_1ST_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride)   \
00430   double2 I0 = fetch_double2((spinor), idx + 0*mystride);       \
00431   double2 I1 = fetch_double2((spinor), idx + 1*mystride);       \
00432   double2 I2 = fetch_double2((spinor), idx + 2*mystride);
00433 
00434 #define READ_3RD_NBR_SPINOR_DOUBLE_TEX(spinor, idx, mystride)   \
00435   double2 T0 = fetch_double2((spinor), idx + 0*mystride);       \
00436   double2 T1 = fetch_double2((spinor), idx + 1*mystride);       \
00437   double2 T2 = fetch_double2((spinor), idx + 2*mystride);
00438 
00439 #define READ_1ST_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride)   \
00440   float2 I0 = tex1Dfetch((spinor), idx + 0*mystride);           \
00441   float2 I1 = tex1Dfetch((spinor), idx + 1*mystride);           \
00442   float2 I2 = tex1Dfetch((spinor), idx + 2*mystride);
00443 
00444 #define READ_3RD_NBR_SPINOR_SINGLE_TEX(spinor, idx, mystride)   \
00445   float2 T0 = tex1Dfetch((spinor), idx + 0*mystride);           \
00446   float2 T1 = tex1Dfetch((spinor), idx + 1*mystride);           \
00447   float2 T2 = tex1Dfetch((spinor), idx + 2*mystride);
00448 
00449 #define READ_1ST_NBR_SPINOR_HALF_TEX(spinor, idx, mystride)             \
00450   float2 I0 = tex1Dfetch((spinor), idx + 0*mystride);                   \
00451   float2 I1 = tex1Dfetch((spinor), idx + 1*mystride);                   \
00452   float2 I2 = tex1Dfetch((spinor), idx + 2*mystride);                   \
00453   {                                                                     \
00454     float C = tex1Dfetch((spinorTexHalfNorm), norm_idx1);               \
00455     I0.x *= C; I0.y *= C;                                               \
00456     I1.x *= C; I1.y *= C;                                               \
00457     I2.x *= C; I2.y *= C;}
00458 
00459 #define READ_3RD_NBR_SPINOR_HALF_TEX(spinor, idx, mystride)             \
00460   float2 T0 = tex1Dfetch((spinor), idx + 0*mystride);                   \
00461   float2 T1 = tex1Dfetch((spinor), idx + 1*mystride);                   \
00462   float2 T2 = tex1Dfetch((spinor), idx + 2*mystride);                   \
00463   {                                                                     \
00464     float C = tex1Dfetch((spinorTexHalfNorm), norm_idx3);               \
00465     T0.x *= C; T0.y *= C;                                               \
00466     T1.x *= C; T1.y *= C;                                               \
00467     T2.x *= C; T2.y *= C;}
00468 
00469 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride)       \
00470   double2 I0 = spinor[idx + 0*mystride];                        \
00471   double2 I1 = spinor[idx + 1*mystride];                        \
00472   double2 I2 = spinor[idx + 2*mystride];
00473 
00474 #define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride)       \
00475   double2 T0 = spinor[idx + 0*mystride];                        \
00476   double2 T1 = spinor[idx + 1*mystride];                        \
00477   double2 T2 = spinor[idx + 2*mystride];
00478 
00479 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride)       \
00480   float2 I0 = spinor[idx + 0*mystride];                         \
00481   float2 I1 = spinor[idx + 1*mystride];                         \
00482   float2 I2 = spinor[idx + 2*mystride];
00483 
00484 #define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride)       \
00485   float2 T0 = spinor[idx + 0*mystride];                         \
00486   float2 T1 = spinor[idx + 1*mystride];                         \
00487   float2 T2 = spinor[idx + 2*mystride];
00488 
00489 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride)                 \
00490   float2 I0, I1, I2;                                                    \
00491   {                                                                     \
00492     short2 S0 = in[idx + 0*mystride];                                   \
00493     short2 S1 = in[idx + 1*mystride];                                   \
00494     short2 S2 = in[idx + 2*mystride];                                   \
00495     float C = inNorm[idx];                                              \
00496     I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y);               \
00497     I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y);               \
00498     I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y);               \
00499   }
00500 
00501 #define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride)                 \
00502   float2 T0, T1, T2;                                                    \
00503   {                                                                     \
00504     short2 S0 = in[idx + 0*mystride];                                   \
00505     short2 S1 = in[idx + 1*mystride];                                   \
00506     short2 S2 = in[idx + 2*mystride];                                   \
00507     float C = inNorm[idx];                                              \
00508     T0.x =C*short2float(S0.x); T0.y =C*short2float(S0.y);               \
00509     T1.x =C*short2float(S1.x); T1.y =C*short2float(S1.y);               \
00510     T2.x =C*short2float(S2.x); T2.y =C*short2float(S2.y);               \
00511   }
00512 
00513 
00514 #define WRITE_ST_SPINOR_DOUBLE2(out)                            \
00515   out[0*sp_stride+sid] = make_double2(o00_re, o00_im);  \
00516   out[1*sp_stride+sid] = make_double2(o01_re, o01_im);  \
00517   out[2*sp_stride+sid] = make_double2(o02_re, o02_im);
00518 
00519 #define WRITE_ST_SPINOR_FLOAT2(out)                     \
00520   out[0*sp_stride+sid] = make_float2(o00_re, o00_im);   \
00521   out[1*sp_stride+sid] = make_float2(o01_re, o01_im);   \
00522   out[2*sp_stride+sid] = make_float2(o02_re, o02_im);
00523 
00524 #define WRITE_ST_SPINOR_SHORT2(out)                                     \
00525   float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));                       \
00526   float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im));                       \
00527   float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im));                       \
00528   c0 = fmaxf(c0, c1);                                                   \
00529   c0 = fmaxf(c0, c2);                                                   \
00530   out ## Norm[sid] = c0;                                                        \
00531   float scale = __fdividef(MAX_SHORT, c0);                              \
00532   o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale;   \
00533   o02_re *= scale; o02_im *= scale;                                     \
00534   out[sid+0*sp_stride] = make_short2((short)o00_re, (short)o00_im);     \
00535   out[sid+1*sp_stride] = make_short2((short)o01_re, (short)o01_im);     \
00536   out[sid+2*sp_stride] = make_short2((short)o02_re, (short)o02_im);
00537 
00538 // Non-cache writes to minimize cache polution
00539 #if (__COMPUTE_CAPABILITY__ >= 200)
00540 
00541 #define WRITE_ST_SPINOR_DOUBLE2_STR(out) \
00542   store_streaming_double2(&out[0*sp_stride+sid], o00_re, o00_im);       \
00543   store_streaming_double2(&out[1*sp_stride+sid], o01_re, o01_im);       \
00544   store_streaming_double2(&out[2*sp_stride+sid], o02_re, o02_im);
00545 
00546 #define WRITE_ST_SPINOR_FLOAT2_STR(out)                        \
00547   store_streaming_float2(&out[0*sp_stride+sid], o00_re, o00_im);        \
00548   store_streaming_float2(&out[1*sp_stride+sid], o01_re, o01_im);        \
00549   store_streaming_float2(&out[2*sp_stride+sid], o02_re, o02_im);
00550 
00551 #define WRITE_ST_SPINOR_SHORT2_STR(out) \
00552   float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));       \
00553   float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im));       \
00554   float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im));       \
00555   c0 = fmaxf(c0, c1);                                   \
00556   c0 = fmaxf(c0, c2);                                   \
00557   out ## Norm[sid] = c0;                                \
00558   float scale = __fdividef(MAX_SHORT, c0);                          \
00559   o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale;   \
00560   o02_re *= scale; o02_im *= scale;                                     \
00561   store_streaming_short2(&g_out[0*sp_stride+sid], (short)o00_re, (short)o00_im); \
00562   store_streaming_short2(&g_out[1*sp_stride+sid], (short)o01_re, (short)o01_im); \
00563   store_streaming_short2(&g_out[2*sp_stride+sid], (short)o02_re, (short)o02_im);
00564 #else
00565 
00566 #define WRITE_ST_SPINOR_DOUBLE2_STR() WRITE_ST_SPINOR_DOUBLE2()
00567 #define WRITE_ST_SPINOR_FLOAT4_STR() WRITE_ST_SPINOR_FLOAT4()
00568 #define WRITE_ST_SPINOR_SHORT4_STR() WRITE_ST_SPINOR_SHORT4()
00569 
00570 #endif
00571 
00572 #define READ_AND_SUM_ST_SPINOR_DOUBLE_TEX(spinor) {                     \
00573   double2 tmp0 = fetch_double2((spinor), sid + 0*(sp_stride));          \
00574   double2 tmp1 = fetch_double2((spinor), sid + 1*(sp_stride));          \
00575   double2 tmp2 = fetch_double2((spinor), sid + 2*(sp_stride));          \
00576   o00_re += tmp0.x; o00_im += tmp0.y;                                   \
00577   o01_re += tmp1.x; o01_im += tmp1.y;                                   \
00578   o02_re += tmp2.x; o02_im += tmp2.y; }
00579   
00580 #define READ_AND_SUM_ST_SPINOR_SINGLE_TEX(spinor) {                     \
00581   float2 tmp0 = tex1Dfetch((spinor), sid + 0*(sp_stride));              \
00582   float2 tmp1 = tex1Dfetch((spinor), sid + 1*(sp_stride));              \
00583   float2 tmp2 = tex1Dfetch((spinor), sid + 2*(sp_stride));              \
00584   o00_re += tmp0.x; o00_im += tmp0.y;                                   \
00585   o01_re += tmp1.x; o01_im += tmp1.y;                                   \
00586   o02_re += tmp2.x; o02_im += tmp2.y; }
00587 
00588 #define READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor) {                      \
00589   float2 tmp0 = tex1Dfetch((spinor), sid + 0*sp_stride);                \
00590   float2 tmp1 = tex1Dfetch((spinor), sid + 1*sp_stride);                \
00591   float2 tmp2 = tex1Dfetch((spinor), sid + 2*sp_stride);                \
00592   float C = tex1Dfetch((spinor##Norm), sid);                            \
00593   o00_re += C*tmp0.x; o00_im += C*tmp0.y;                               \
00594   o01_re += C*tmp1.x; o01_im += C*tmp1.y;                               \
00595   o02_re += C*tmp2.x; o02_im += C*tmp2.y; }
00596 
00597 #define READ_AND_SUM_ST_SPINOR_HALF_TEX(spinor) \
00598   READ_AND_SUM_ST_SPINOR_HALF_TEX_(spinor)
00599 
00600 #define READ_AND_SUM_ST_SPINOR(spinor)                                  \
00601   o00_re += spinor[0*sp_stride+sid].x; o00_im += spinor[0*sp_stride+sid].y; \
00602   o01_re += spinor[1*sp_stride+sid].x; o01_im += spinor[1*sp_stride+sid].y; \
00603   o02_re += spinor[2*sp_stride+sid].x; o02_im += spinor[2*sp_stride+sid].y; \
00604   
00605 #define READ_AND_SUM_ST_SPINOR_HALF_(spinor)                    \
00606   float C = spinor ## Norm[sid];                                \
00607   o00_re += C*short2float(spinor[0*sp_stride + sid].x);         \
00608   o00_im += C*short2float(spinor[0*sp_stride + sid].y);         \
00609   o01_re += C*short2float(spinor[1*sp_stride + sid].x);         \
00610   o01_im += C*short2float(spinor[1*sp_stride + sid].y);         \
00611   o02_re += C*short2float(spinor[2*sp_stride + sid].x);         \
00612   o02_im += C*short2float(spinor[2*sp_stride + sid].y); 
00613 
00614 #define READ_AND_SUM_ST_SPINOR_HALF(spinor)                     \
00615   READ_AND_SUM_ST_SPINOR_HALF_(spinor)
00616 
00617 #define READ_ST_ACCUM_DOUBLE_TEX(spinor)                           \
00618   double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride));   \
00619   double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride));   \
00620   double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));   
00621 
00622 #define READ_ST_ACCUM_SINGLE_TEX(spinor)                        \
00623   float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride);      \
00624   float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride);      \
00625   float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride);     
00626 
00627 #define READ_ST_ACCUM_HALF_TEX(spinor)                          \
00628   float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride);      \
00629   float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride);      \
00630   float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride);      \
00631   float C = tex1Dfetch((accumTexHalfNorm), sid);                \
00632   accum0.x *= C; accum0.y *= C;                                 \
00633   accum1.x *= C; accum1.y *= C;                                 \
00634   accum2.x *= C; accum2.y *= C;       
00635 
00636 #define READ_ST_ACCUM_DOUBLE(spinor)                               \
00637   double2 accum0 = spinor[sid + 0*(sp_stride)];                    \
00638   double2 accum1 = spinor[sid + 1*(sp_stride)];                    \
00639   double2 accum2 = spinor[sid + 2*(sp_stride)];   
00640 
00641 #define READ_ST_ACCUM_SINGLE(spinor)                            \
00642   float2 accum0 = spinor[sid + 0*(sp_stride)];                  \
00643   float2 accum1 = spinor[sid + 1*(sp_stride)];                  \
00644   float2 accum2 = spinor[sid + 2*(sp_stride)];     
00645 
00646 #define READ_ST_ACCUM_HALF(spinor)                                      \
00647   float2 accum0, accum1, accum2;                                        \
00648   {                                                                     \
00649     short2 S0 = x[sid + 0*sp_stride];                                   \
00650     short2 S1 = x[sid + 1*sp_stride];                                   \
00651     short2 S2 = x[sid + 2*sp_stride];                                   \
00652     float C = spinor##Norm[sid];                                        \
00653     accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y);       \
00654     accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y);       \
00655     accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y);       \
00656   }
00657 
00658 #define WRITE_SPINOR_SHARED_REAL(tx, ty, tz, reg)                       \
00659   extern __shared__ char s_data[];                                      \
00660   spinorFloat *sh = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
00661     ((tx+blockDim.x*(ty+blockDim.y*tz))/SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
00662   sh[0*SHARED_STRIDE] = reg##00_re;             \
00663   sh[1*SHARED_STRIDE] = reg##00_im;             \
00664   sh[2*SHARED_STRIDE] = reg##01_re;             \
00665   sh[3*SHARED_STRIDE] = reg##01_im;             \
00666   sh[4*SHARED_STRIDE] = reg##02_re;             \
00667   sh[5*SHARED_STRIDE] = reg##02_im;             \
00668   sh[6*SHARED_STRIDE] = reg##10_re;             \
00669   sh[7*SHARED_STRIDE] = reg##10_im;             \
00670   sh[8*SHARED_STRIDE] = reg##11_re;             \
00671   sh[9*SHARED_STRIDE] = reg##11_im;             \
00672   sh[10*SHARED_STRIDE] = reg##12_re;            \
00673   sh[11*SHARED_STRIDE] = reg##12_im;            \
00674   sh[12*SHARED_STRIDE] = reg##20_re;            \
00675   sh[13*SHARED_STRIDE] = reg##20_im;            \
00676   sh[14*SHARED_STRIDE] = reg##21_re;            \
00677   sh[15*SHARED_STRIDE] = reg##21_im;            \
00678   sh[16*SHARED_STRIDE] = reg##22_re;            \
00679   sh[17*SHARED_STRIDE] = reg##22_im;            \
00680   sh[18*SHARED_STRIDE] = reg##30_re;            \
00681   sh[19*SHARED_STRIDE] = reg##30_im;            \
00682   sh[20*SHARED_STRIDE] = reg##31_re;            \
00683   sh[21*SHARED_STRIDE] = reg##31_im;            \
00684   sh[22*SHARED_STRIDE] = reg##32_re;            \
00685   sh[23*SHARED_STRIDE] = reg##32_im;
00686 
00687 #define WRITE_SPINOR_SHARED_DOUBLE2 WRITE_SPINOR_SHARED_REAL
00688 
00689 #define READ_SPINOR_SHARED_DOUBLE2(tx, ty, tz)                          \
00690   extern __shared__ char s_data[];                                      \
00691   double *sh = (double*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
00692     ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
00693   double2 I0 = make_double2(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE]);  \
00694   double2 I1 = make_double2(sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]);  \
00695   double2 I2 = make_double2(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE]);  \
00696   double2 I3 = make_double2(sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]);  \
00697   double2 I4 = make_double2(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE]);  \
00698   double2 I5 = make_double2(sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
00699   double2 I6 = make_double2(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE]); \
00700   double2 I7 = make_double2(sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
00701   double2 I8 = make_double2(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE]); \
00702   double2 I9 = make_double2(sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
00703   double2 I10 = make_double2(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE]); \
00704   double2 I11 = make_double2(sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
00705 
00706 #ifndef SHARED_8_BYTE_WORD_SIZE // 4-byte shared memory access
00707 
00708 #define WRITE_SPINOR_SHARED_FLOAT4 WRITE_SPINOR_SHARED_REAL
00709 
00710 #define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz)                           \
00711   extern __shared__ char s_data[];                                      \
00712   float *sh = (float*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE* \
00713     ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
00714   float4 I0 = make_float4(sh[0*SHARED_STRIDE], sh[1*SHARED_STRIDE], sh[2*SHARED_STRIDE], sh[3*SHARED_STRIDE]); \
00715   float4 I1 = make_float4(sh[4*SHARED_STRIDE], sh[5*SHARED_STRIDE], sh[6*SHARED_STRIDE], sh[7*SHARED_STRIDE]); \
00716   float4 I2 = make_float4(sh[8*SHARED_STRIDE], sh[9*SHARED_STRIDE], sh[10*SHARED_STRIDE], sh[11*SHARED_STRIDE]); \
00717   float4 I3 = make_float4(sh[12*SHARED_STRIDE], sh[13*SHARED_STRIDE], sh[14*SHARED_STRIDE], sh[15*SHARED_STRIDE]); \
00718   float4 I4 = make_float4(sh[16*SHARED_STRIDE], sh[17*SHARED_STRIDE], sh[18*SHARED_STRIDE], sh[19*SHARED_STRIDE]); \
00719   float4 I5 = make_float4(sh[20*SHARED_STRIDE], sh[21*SHARED_STRIDE], sh[22*SHARED_STRIDE], sh[23*SHARED_STRIDE]);
00720 
00721 #else // 8-byte shared memory words
00722 
00723 #define WRITE_SPINOR_SHARED_FLOAT4(tx, ty, tz, reg)                     \
00724   extern __shared__ char s_data[];                                      \
00725   float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
00726     ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
00727   sh[0*SHARED_STRIDE] = make_float2(reg##00_re, reg##00_im);            \
00728   sh[1*SHARED_STRIDE] = make_float2(reg##01_re, reg##01_im);            \
00729   sh[2*SHARED_STRIDE] = make_float2(reg##02_re, reg##02_im);            \
00730   sh[3*SHARED_STRIDE] = make_float2(reg##10_re, reg##10_im);            \
00731   sh[4*SHARED_STRIDE] = make_float2(reg##11_re, reg##11_im);            \
00732   sh[5*SHARED_STRIDE] = make_float2(reg##12_re, reg##12_im);            \
00733   sh[6*SHARED_STRIDE] = make_float2(reg##20_re, reg##20_im);            \
00734   sh[7*SHARED_STRIDE] = make_float2(reg##21_re, reg##21_im);            \
00735   sh[8*SHARED_STRIDE] = make_float2(reg##22_re, reg##22_im);            \
00736   sh[9*SHARED_STRIDE] = make_float2(reg##30_re, reg##30_im);            \
00737   sh[10*SHARED_STRIDE] = make_float2(reg##31_re, reg##31_im);           \
00738   sh[11*SHARED_STRIDE] = make_float2(reg##32_re, reg##32_im);
00739 
00740 #define READ_SPINOR_SHARED_FLOAT4(tx, ty, tz)                           \
00741   extern __shared__ char s_data[];                                      \
00742   float2 *sh = (float2*)s_data + (DSLASH_SHARED_FLOATS_PER_THREAD/2)*SHARED_STRIDE* \
00743     ((tx+blockDim.x*(ty+blockDim.y*tz)) / SHARED_STRIDE) + ((tx+blockDim.x*(ty+blockDim.y*tz)) % SHARED_STRIDE); \
00744   float2 tmp1, tmp2;                                                    \
00745   tmp1 = sh[0*SHARED_STRIDE]; tmp2 = sh[1*SHARED_STRIDE]; float4 I0 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
00746   tmp1 = sh[2*SHARED_STRIDE]; tmp2 = sh[3*SHARED_STRIDE]; float4 I1 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
00747   tmp1 = sh[4*SHARED_STRIDE]; tmp2 = sh[5*SHARED_STRIDE]; float4 I2 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
00748   tmp1 = sh[6*SHARED_STRIDE]; tmp2 = sh[7*SHARED_STRIDE]; float4 I3 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
00749   tmp1 = sh[8*SHARED_STRIDE]; tmp2 = sh[9*SHARED_STRIDE]; float4 I4 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); \
00750   tmp1 = sh[10*SHARED_STRIDE]; tmp2 = sh[11*SHARED_STRIDE]; float4 I5 = make_float4(tmp1.x, tmp1.y, tmp2.x, tmp2.y); 
00751 
00752 #endif