QUDA: quda/lib/io_spinor.h Source File

QUDA v0.3.2
A library for QCD on GPUs
00001 #define READ_SPINOR_DOUBLE(spinor)                                \
00002   double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride));   \
00003   double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride));   \
00004   double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride));   \
00005   double2 I3 = fetch_double2((spinor), sp_idx + 3*(sp_stride));   \
00006   double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride));   \
00007   double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride));   \
00008   double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride));   \
00009   double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride));   \
00010   double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride));   \
00011   double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride));   \
00012   double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
00013   double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));
00014 
00015 #define READ_SPINOR_DOUBLE_UP(spinor)                             \
00016   double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride));   \
00017   double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride));   \
00018   double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride));   \
00019   double2 I3 = fetch_double2((spinor), sp_idx + 3*(sp_stride));   \
00020   double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride));   \
00021   double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride));
00022 
00023 #define READ_SPINOR_DOUBLE_DOWN(spinor)                           \
00024   double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride));   \
00025   double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride));   \
00026   double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride));   \
00027   double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride));   \
00028   double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
00029   double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));
00030 
00031 #define READ_SPINOR_SINGLE(spinor)                            \
00032   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
00033   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
00034   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
00035   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
00036   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
00037   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));
00038 
00039 #define READ_SPINOR_SINGLE_UP(spinor)                         \
00040   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
00041   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
00042   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
00043 
00044 #define READ_SPINOR_SINGLE_DOWN(spinor)                       \
00045   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
00046   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
00047   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));
00048 
00049 #define READ_SPINOR_HALF(spinor)                              \
00050   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
00051   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
00052   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
00053   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
00054   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
00055   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));   \
00056   float C = tex1Dfetch((spinorTexNorm), sp_idx);              \
00057   I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;                 \
00058   I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;                 \
00059   I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;                 \
00060   I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;                 \
00061   I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;                 \
00062   I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;                                        
00063 
00064 #define READ_SPINOR_HALF_UP(spinor)                           \
00065   float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
00066   float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
00067   float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
00068   float C = tex1Dfetch((spinorTexNorm), sp_idx);              \
00069   I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;                 \
00070   I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;                 \
00071   I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;                 \
00072 
00073 #define READ_SPINOR_HALF_DOWN(spinor)                         \
00074   float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
00075   float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
00076   float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));   \
00077   float C = tex1Dfetch((spinorTexNorm), sp_idx);              \
00078   I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;                 \
00079   I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;                 \
00080   I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;                                        
00081 
00082 #define READ_ACCUM_DOUBLE(spinor)                                  \
00083   double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride));   \
00084   double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride));   \
00085   double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));   \
00086   double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride));   \
00087   double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride));   \
00088   double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride));   \
00089   double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride));   \
00090   double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride));   \
00091   double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride));   \
00092   double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride));   \
00093   double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride)); \
00094   double2 accum11 = fetch_double2((spinor), sid + 11*(sp_stride));      
00095 
00096 #define READ_ACCUM_SINGLE(spinor)                                  \
00097   float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride));       \
00098   float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride));       \
00099   float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride));       \
00100   float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride));       \
00101   float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride));       \
00102   float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); 
00103 
00104 #define READ_ACCUM_HALF(spinor)                                    \
00105   float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride));       \
00106   float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride));       \
00107   float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride));       \
00108   float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride));       \
00109   float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride));       \
00110   float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride));       \
00111   float C = tex1Dfetch((accumTexNorm), sid);                       \
00112   accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C;      \
00113   accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C;      \
00114   accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C;      \
00115   accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C;      \
00116   accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C;      \
00117   accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;                                        
00118 
00119 
00120 #define WRITE_SPINOR_DOUBLE2()                                     \
00121   out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im);           \
00122   out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im);           \
00123   out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im);           \
00124   out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im);           \
00125   out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im);           \
00126   out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im);           \
00127   out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im);           \
00128   out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im);           \
00129   out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im);           \
00130   out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im);           \
00131   out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im);          \
00132   out[11*(sp_stride)+sid] = make_double2(o32_re, o32_im);                
00133 
00134 #define WRITE_SPINOR_FLOAT4()                                           \
00135   out[0*(sp_stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
00136   out[1*(sp_stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
00137   out[2*(sp_stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
00138   out[3*(sp_stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
00139   out[4*(sp_stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
00140   out[5*(sp_stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
00141 
00142 #define WRITE_SPINOR_SHORT4()                                           \
00143   float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));                       \
00144   float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im));                       \
00145   float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im));                       \
00146   float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im));                       \
00147   float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im));                       \
00148   float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im));                       \
00149   float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im));                       \
00150   float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im));                       \
00151   float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im));                       \
00152   float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im));                       \
00153   float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im));                      \
00154   float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im));                      \
00155   c0 = fmaxf(c0, c1);                                                   \
00156   c1 = fmaxf(c2, c3);                                                   \
00157   c2 = fmaxf(c4, c5);                                                   \
00158   c3 = fmaxf(c6, c7);                                                   \
00159   c4 = fmaxf(c8, c9);                                                   \
00160   c5 = fmaxf(c10, c11);                                                 \
00161   c0 = fmaxf(c0, c1);                                                   \
00162   c1 = fmaxf(c2, c3);                                                   \
00163   c2 = fmaxf(c4, c5);                                                   \
00164   c0 = fmaxf(c0, c1);                                                   \
00165   c0 = fmaxf(c0, c2);                                                   \
00166   outNorm[sid] = c0;                                                    \
00167   float scale = __fdividef(MAX_SHORT, c0);                              \
00168   o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale;   \
00169   o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale;   \
00170   o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale;   \
00171   o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale;   \
00172   o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale;   \
00173   o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale;   \
00174   out[sid+0*(sp_stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
00175   out[sid+1*(sp_stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
00176   out[sid+2*(sp_stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
00177   out[sid+3*(sp_stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
00178   out[sid+4*(sp_stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
00179   out[sid+5*(sp_stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
00180 
00181 /*
00182 #define WRITE_SPINOR_FLOAT1_SMEM() \
00183   int t = threadIdx.x; \
00184   int B = BLOCK_DIM; \
00185   int b = blockIdx.x; \
00186   int f = SHARED_FLOATS_PER_THREAD; \
00187   __syncthreads(); \
00188   for (int i = 0; i < 6; i++) for (int c = 0; c < 4; c++) \
00189       ((float*)out)[i*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
00190 
00191 // the alternative to writing float4's directly: almost as fast, a lot more confusing
00192 #define WRITE_SPINOR_FLOAT1_STAGGERED() \
00193   int t = threadIdx.x; \
00194   int B = BLOCK_DIM; \
00195   int b = blockIdx.x; \
00196   int f = SHARED_FLOATS_PER_THREAD; \
00197   __syncthreads(); \
00198   for (int i = 0; i < 4; i++) for (int c = 0; c < 4; c++) \
00199       ((float*)out)[i*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4]; \
00200   __syncthreads(); \
00201   s[0] = o22_re; \
00202   s[1] = o22_im; \
00203   s[2] = o30_re; \
00204   s[3] = o30_im; \
00205   s[4] = o31_re; \
00206   s[5] = o31_im; \
00207   s[6] = o32_re; \
00208   s[7] = o32_im; \
00209   __syncthreads(); \
00210   for (int i = 0; i < 2; i++) for (int c = 0; c < 4; c++) \
00211     ((float*)out)[(i+4)*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
00212 */
00213 
00214 
00215 /************* the following is used by staggered *****************/
00216 
00217 #define SHORT_LENGTH 65536
00218 #define SCALE_FLOAT ((SHORT_LENGTH-1) * 0.5)
00219 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
00220 #define REVERSE_SCALE_FLOAT (3.05180438e-5f)
00221 //#define short2float(a) ( __fdividef(a, SCALE_FLOAT) - SHIFT_FLOAT)
00222 #define short2float(a) (a*REVERSE_SCALE_FLOAT + 1.52590219e-5f)
00223 
00224 
00225 #ifndef DIRECT_ACCESS_SPINOR //spinor access control
00226 
00227 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride)       \
00228   float2 I0 = tex1Dfetch((spinor), idx + 0*mystride);           \
00229   float2 I1 = tex1Dfetch((spinor), idx + 1*mystride);           \
00230   float2 I2 = tex1Dfetch((spinor), idx + 2*mystride);
00231 
00232 #define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride)       \
00233   float2 T0 = tex1Dfetch((spinor), idx + 0*mystride);           \
00234   float2 T1 = tex1Dfetch((spinor), idx + 1*mystride);           \
00235   float2 T2 = tex1Dfetch((spinor), idx + 2*mystride);
00236 
00237 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride)       \
00238   double2 I0 = fetch_double2((spinor), idx + 0*mystride);       \
00239   double2 I1 = fetch_double2((spinor), idx + 1*mystride);       \
00240   double2 I2 = fetch_double2((spinor), idx + 2*mystride);
00241 
00242 #define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride)       \
00243   double2 T0 = fetch_double2((spinor), idx + 0*mystride);       \
00244   double2 T1 = fetch_double2((spinor), idx + 1*mystride);       \
00245   double2 T2 = fetch_double2((spinor), idx + 2*mystride);
00246 
00247 
00248 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride)                 \
00249   float2 I0 = tex1Dfetch((spinor), idx + 0*mystride);                   \
00250   float2 I1 = tex1Dfetch((spinor), idx + 1*mystride);                   \
00251   float2 I2 = tex1Dfetch((spinor), idx + 2*mystride);                   \
00252   {float C = tex1Dfetch((spinorTexNorm), idx);                          \
00253     I0.x *= C; I0.y *= C;                                               \
00254     I1.x *= C; I1.y *= C;                                               \
00255     I2.x *= C; I2.y *= C;}
00256 
00257 #define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride)                 \
00258   float2 T0 = tex1Dfetch((spinor), idx + 0*mystride);                   \
00259   float2 T1 = tex1Dfetch((spinor), idx + 1*mystride);                   \
00260   float2 T2 = tex1Dfetch((spinor), idx + 2*mystride);                   \
00261   {float C = tex1Dfetch((spinorTexNorm), idx);                          \
00262     T0.x *= C; T0.y *= C;                                               \
00263     T1.x *= C; T1.y *= C;                                               \
00264     T2.x *= C; T2.y *= C;}
00265 
00266 
00267 #define READ_ST_ACCUM_HALF(spinor)                              \
00268   float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride);      \
00269   float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride);      \
00270   float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride);      \
00271   float C = tex1Dfetch((accumTexNorm), sid);                    \
00272   accum0.x *= C; accum0.y *= C;                                 \
00273   accum1.x *= C; accum1.y *= C;                                 \
00274   accum2.x *= C; accum2.y *= C;       
00275 
00276 #else //spinor access control
00277 
00278 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride)       \
00279   float2 I0 = spinor[idx + 0*mystride];                         \
00280   float2 I1 = spinor[idx + 1*mystride];                         \
00281   float2 I2 = spinor[idx + 2*mystride];
00282 
00283 #define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride)       \
00284   float2 T0 = spinor[idx + 0*mystride];                         \
00285   float2 T1 = spinor[idx + 1*mystride];                         \
00286   float2 T2 = spinor[idx + 2*mystride];
00287 
00288 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride)       \
00289   double2 I0 = spinor[idx + 0*mystride];                        \
00290   double2 I1 = spinor[idx + 1*mystride];                        \
00291   double2 I2 = spinor[idx + 2*mystride];
00292 
00293 #define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride)       \
00294   double2 T0 = spinor[idx + 0*mystride];                        \
00295   double2 T1 = spinor[idx + 1*mystride];                        \
00296   double2 T2 = spinor[idx + 2*mystride];
00297 
00298 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride)                 \
00299   float2 I0, I1, I2;                                                    \
00300   {                                                                     \
00301     short2 S0 = in[idx + 0*mystride];                                   \
00302     short2 S1 = in[idx + 1*mystride];                                   \
00303     short2 S2 = in[idx + 2*mystride];                                   \
00304     float C = inNorm[idx];                                              \
00305     I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y);               \
00306     I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y);               \
00307     I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y);               \
00308   }
00309 
00310 #define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride)                 \
00311   float2 T0, T1, T2;                                                    \
00312   {                                                                     \
00313     short2 S0 = in[idx + 0*mystride];                                   \
00314     short2 S1 = in[idx + 1*mystride];                                   \
00315     short2 S2 = in[idx + 2*mystride];                                   \
00316     float C = inNorm[idx];                                              \
00317     T0.x =C*short2float(S0.x); T0.y =C*short2float(S0.y);               \
00318     T1.x =C*short2float(S1.x); T1.y =C*short2float(S1.y);               \
00319     T2.x =C*short2float(S2.x); T2.y =C*short2float(S2.y);               \
00320   }
00321 
00322 
00323 #define READ_ST_ACCUM_HALF(spinor)                                      \
00324   float2 accum0, accum1, accum2;                                        \
00325   {                                                                     \
00326     short2 S0 = x[sid + 0*sp_stride];                                   \
00327     short2 S1 = x[sid + 1*sp_stride];                                   \
00328     short2 S2 = x[sid + 2*sp_stride];                                   \
00329     float C = xNorm[sid];                                               \
00330     accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y);       \
00331     accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y);       \
00332     accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y);       \
00333   }
00334 
00335 #endif //spinor access control
00336 
00337 
00338 
00339 
00340 #define WRITE_ST_SPINOR_DOUBLE2()                               \
00341   g_out[0*sp_stride+sid] = make_double2(o00_re, o00_im);        \
00342   g_out[1*sp_stride+sid] = make_double2(o01_re, o01_im);        \
00343   g_out[2*sp_stride+sid] = make_double2(o02_re, o02_im);
00344 
00345 #define WRITE_ST_SPINOR_FLOAT2()                        \
00346   g_out[0*sp_stride+sid] = make_float2(o00_re, o00_im); \
00347   g_out[1*sp_stride+sid] = make_float2(o01_re, o01_im); \
00348   g_out[2*sp_stride+sid] = make_float2(o02_re, o02_im);
00349 
00350 #define WRITE_ST_SPINOR_SHORT2()                                        \
00351   float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));                       \
00352   float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im));                       \
00353   float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im));                       \
00354   c0 = fmaxf(c0, c1);                                                   \
00355   c0 = fmaxf(c0, c2);                                                   \
00356   outNorm[sid] = c0;                                                    \
00357   float scale = __fdividef(MAX_SHORT, c0);                              \
00358   o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale;   \
00359   o02_re *= scale; o02_im *= scale;                                     \
00360   g_out[sid+0*sp_stride] = make_short2((short)o00_re, (short)o00_im);   \
00361   g_out[sid+1*sp_stride] = make_short2((short)o01_re, (short)o01_im);   \
00362   g_out[sid+2*sp_stride] = make_short2((short)o02_re, (short)o02_im);
00363 
00364 #define READ_AND_SUM_ST_SPINOR()                                        \
00365   o00_re += g_out[0*sp_stride+sid].x; o00_im += g_out[0*sp_stride+sid].y; \
00366   o01_re += g_out[1*sp_stride+sid].x; o01_im += g_out[1*sp_stride+sid].y; \
00367   o02_re += g_out[2*sp_stride+sid].x; o02_im += g_out[2*sp_stride+sid].y; \
00368   
00369 
00370 
00371 #define READ_AND_SUM_ST_SPINOR_HALF()                   \
00372   float C = outNorm[sid];                               \
00373   o00_re += C*short2float(g_out[0*sp_stride + sid].x);  \
00374   o00_im += C*short2float(g_out[0*sp_stride + sid].y);  \
00375   o01_re += C*short2float(g_out[1*sp_stride + sid].x);  \
00376   o01_im += C*short2float(g_out[1*sp_stride + sid].y);  \
00377   o02_re += C*short2float(g_out[2*sp_stride + sid].x);  \
00378   o02_im += C*short2float(g_out[2*sp_stride + sid].y);  
00379   
00380 #define READ_ST_ACCUM_SINGLE(spinor)                            \
00381   float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride);      \
00382   float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride);      \
00383   float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride);     
00384 
00385 #define READ_ST_SPINOR_HALF(spinor)                             \
00386   float2 I0 = tex1Dfetch((spinor), sp_idx + 0*sp_stride);       \
00387   float2 I1 = tex1Dfetch((spinor), sp_idx + 1*sp_stride);       \
00388   float2 I2 = tex1Dfetch((spinor), sp_idx + 2*sp_stride);       \
00389   float C = tex1Dfetch((spinorTexNorm), sp_idx);                \
00390   I0.x *= C; I0.y *= C;                                         \
00391   I1.x *= C; I1.y *= C;                                         \
00392   I2.x *= C; I2.y *= C;                                  
00393 
00394