|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #define READ_SPINOR_DOUBLE(spinor) \ 00002 double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride)); \ 00003 double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride)); \ 00004 double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride)); \ 00005 double2 I3 = fetch_double2((spinor), sp_idx + 3*(sp_stride)); \ 00006 double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride)); \ 00007 double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride)); \ 00008 double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride)); \ 00009 double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride)); \ 00010 double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride)); \ 00011 double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride)); \ 00012 double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \ 00013 double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride)); 00014 00015 #define READ_SPINOR_DOUBLE_UP(spinor) \ 00016 double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride)); \ 00017 double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride)); \ 00018 double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride)); \ 00019 double2 I3 = fetch_double2((spinor), sp_idx + 3*(sp_stride)); \ 00020 double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride)); \ 00021 double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride)); 00022 00023 #define READ_SPINOR_DOUBLE_DOWN(spinor) \ 00024 double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride)); \ 00025 double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride)); \ 00026 double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride)); \ 00027 double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride)); \ 00028 double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \ 00029 double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride)); 00030 00031 #define READ_SPINOR_SINGLE(spinor) \ 00032 float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \ 00033 float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \ 00034 float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \ 00035 float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \ 00036 float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \ 00037 float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); 00038 00039 #define READ_SPINOR_SINGLE_UP(spinor) \ 00040 float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \ 00041 float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \ 00042 float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \ 00043 00044 #define READ_SPINOR_SINGLE_DOWN(spinor) \ 00045 float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \ 00046 float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \ 00047 float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); 00048 00049 #define READ_SPINOR_HALF(spinor) \ 00050 float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \ 00051 float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \ 00052 float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \ 00053 float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \ 00054 float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \ 00055 float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); \ 00056 float C = tex1Dfetch((spinorTexNorm), sp_idx); \ 00057 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \ 00058 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \ 00059 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \ 00060 I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \ 00061 I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \ 00062 I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C; 00063 00064 #define READ_SPINOR_HALF_UP(spinor) \ 00065 float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \ 00066 float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \ 00067 float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \ 00068 float C = tex1Dfetch((spinorTexNorm), sp_idx); \ 00069 I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \ 00070 I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \ 00071 I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \ 00072 00073 #define READ_SPINOR_HALF_DOWN(spinor) \ 00074 float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \ 00075 float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \ 00076 float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); \ 00077 float C = tex1Dfetch((spinorTexNorm), sp_idx); \ 00078 I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \ 00079 I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \ 00080 I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C; 00081 00082 #define READ_ACCUM_DOUBLE(spinor) \ 00083 double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \ 00084 double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \ 00085 double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride)); \ 00086 double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride)); \ 00087 double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride)); \ 00088 double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride)); \ 00089 double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride)); \ 00090 double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride)); \ 00091 double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride)); \ 00092 double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride)); \ 00093 double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride)); \ 00094 double2 accum11 = fetch_double2((spinor), sid + 11*(sp_stride)); 00095 00096 #define READ_ACCUM_SINGLE(spinor) \ 00097 float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \ 00098 float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \ 00099 float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \ 00100 float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \ 00101 float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \ 00102 float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); 00103 00104 #define READ_ACCUM_HALF(spinor) \ 00105 float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \ 00106 float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \ 00107 float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \ 00108 float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \ 00109 float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \ 00110 float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); \ 00111 float C = tex1Dfetch((accumTexNorm), sid); \ 00112 accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \ 00113 accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \ 00114 accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \ 00115 accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \ 00116 accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \ 00117 accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C; 00118 00119 00120 #define WRITE_SPINOR_DOUBLE2() \ 00121 out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im); \ 00122 out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im); \ 00123 out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im); \ 00124 out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im); \ 00125 out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im); \ 00126 out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im); \ 00127 out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im); \ 00128 out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im); \ 00129 out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im); \ 00130 out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im); \ 00131 out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im); \ 00132 out[11*(sp_stride)+sid] = make_double2(o32_re, o32_im); 00133 00134 #define WRITE_SPINOR_FLOAT4() \ 00135 out[0*(sp_stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \ 00136 out[1*(sp_stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \ 00137 out[2*(sp_stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \ 00138 out[3*(sp_stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \ 00139 out[4*(sp_stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \ 00140 out[5*(sp_stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im); 00141 00142 #define WRITE_SPINOR_SHORT4() \ 00143 float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \ 00144 float c1 = fmaxf(fabsf(o01_re), fabsf(o02_im)); \ 00145 float c2 = fmaxf(fabsf(o02_re), fabsf(o01_im)); \ 00146 float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im)); \ 00147 float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im)); \ 00148 float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im)); \ 00149 float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im)); \ 00150 float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im)); \ 00151 float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im)); \ 00152 float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im)); \ 00153 float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im)); \ 00154 float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im)); \ 00155 c0 = fmaxf(c0, c1); \ 00156 c1 = fmaxf(c2, c3); \ 00157 c2 = fmaxf(c4, c5); \ 00158 c3 = fmaxf(c6, c7); \ 00159 c4 = fmaxf(c8, c9); \ 00160 c5 = fmaxf(c10, c11); \ 00161 c0 = fmaxf(c0, c1); \ 00162 c1 = fmaxf(c2, c3); \ 00163 c2 = fmaxf(c4, c5); \ 00164 c0 = fmaxf(c0, c1); \ 00165 c0 = fmaxf(c0, c2); \ 00166 outNorm[sid] = c0; \ 00167 float scale = __fdividef(MAX_SHORT, c0); \ 00168 o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \ 00169 o02_re *= scale; o02_im *= scale; o10_re *= scale; o10_im *= scale; \ 00170 o11_re *= scale; o11_im *= scale; o12_re *= scale; o12_im *= scale; \ 00171 o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \ 00172 o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \ 00173 o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \ 00174 out[sid+0*(sp_stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \ 00175 out[sid+1*(sp_stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \ 00176 out[sid+2*(sp_stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \ 00177 out[sid+3*(sp_stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \ 00178 out[sid+4*(sp_stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \ 00179 out[sid+5*(sp_stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im); 00180 00181 /* 00182 #define WRITE_SPINOR_FLOAT1_SMEM() \ 00183 int t = threadIdx.x; \ 00184 int B = BLOCK_DIM; \ 00185 int b = blockIdx.x; \ 00186 int f = SHARED_FLOATS_PER_THREAD; \ 00187 __syncthreads(); \ 00188 for (int i = 0; i < 6; i++) for (int c = 0; c < 4; c++) \ 00189 ((float*)out)[i*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4]; 00190 00191 // the alternative to writing float4's directly: almost as fast, a lot more confusing 00192 #define WRITE_SPINOR_FLOAT1_STAGGERED() \ 00193 int t = threadIdx.x; \ 00194 int B = BLOCK_DIM; \ 00195 int b = blockIdx.x; \ 00196 int f = SHARED_FLOATS_PER_THREAD; \ 00197 __syncthreads(); \ 00198 for (int i = 0; i < 4; i++) for (int c = 0; c < 4; c++) \ 00199 ((float*)out)[i*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4]; \ 00200 __syncthreads(); \ 00201 s[0] = o22_re; \ 00202 s[1] = o22_im; \ 00203 s[2] = o30_re; \ 00204 s[3] = o30_im; \ 00205 s[4] = o31_re; \ 00206 s[5] = o31_im; \ 00207 s[6] = o32_re; \ 00208 s[7] = o32_im; \ 00209 __syncthreads(); \ 00210 for (int i = 0; i < 2; i++) for (int c = 0; c < 4; c++) \ 00211 ((float*)out)[(i+4)*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4]; 00212 */ 00213 00214 00215 /************* the following is used by staggered *****************/ 00216 00217 #define SHORT_LENGTH 65536 00218 #define SCALE_FLOAT ((SHORT_LENGTH-1) * 0.5) 00219 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1)) 00220 #define REVERSE_SCALE_FLOAT (3.05180438e-5f) 00221 //#define short2float(a) ( __fdividef(a, SCALE_FLOAT) - SHIFT_FLOAT) 00222 #define short2float(a) (a*REVERSE_SCALE_FLOAT + 1.52590219e-5f) 00223 00224 00225 #ifndef DIRECT_ACCESS_SPINOR //spinor access control 00226 00227 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \ 00228 float2 I0 = tex1Dfetch((spinor), idx + 0*mystride); \ 00229 float2 I1 = tex1Dfetch((spinor), idx + 1*mystride); \ 00230 float2 I2 = tex1Dfetch((spinor), idx + 2*mystride); 00231 00232 #define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride) \ 00233 float2 T0 = tex1Dfetch((spinor), idx + 0*mystride); \ 00234 float2 T1 = tex1Dfetch((spinor), idx + 1*mystride); \ 00235 float2 T2 = tex1Dfetch((spinor), idx + 2*mystride); 00236 00237 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \ 00238 double2 I0 = fetch_double2((spinor), idx + 0*mystride); \ 00239 double2 I1 = fetch_double2((spinor), idx + 1*mystride); \ 00240 double2 I2 = fetch_double2((spinor), idx + 2*mystride); 00241 00242 #define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \ 00243 double2 T0 = fetch_double2((spinor), idx + 0*mystride); \ 00244 double2 T1 = fetch_double2((spinor), idx + 1*mystride); \ 00245 double2 T2 = fetch_double2((spinor), idx + 2*mystride); 00246 00247 00248 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \ 00249 float2 I0 = tex1Dfetch((spinor), idx + 0*mystride); \ 00250 float2 I1 = tex1Dfetch((spinor), idx + 1*mystride); \ 00251 float2 I2 = tex1Dfetch((spinor), idx + 2*mystride); \ 00252 {float C = tex1Dfetch((spinorTexNorm), idx); \ 00253 I0.x *= C; I0.y *= C; \ 00254 I1.x *= C; I1.y *= C; \ 00255 I2.x *= C; I2.y *= C;} 00256 00257 #define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride) \ 00258 float2 T0 = tex1Dfetch((spinor), idx + 0*mystride); \ 00259 float2 T1 = tex1Dfetch((spinor), idx + 1*mystride); \ 00260 float2 T2 = tex1Dfetch((spinor), idx + 2*mystride); \ 00261 {float C = tex1Dfetch((spinorTexNorm), idx); \ 00262 T0.x *= C; T0.y *= C; \ 00263 T1.x *= C; T1.y *= C; \ 00264 T2.x *= C; T2.y *= C;} 00265 00266 00267 #define READ_ST_ACCUM_HALF(spinor) \ 00268 float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride); \ 00269 float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride); \ 00270 float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride); \ 00271 float C = tex1Dfetch((accumTexNorm), sid); \ 00272 accum0.x *= C; accum0.y *= C; \ 00273 accum1.x *= C; accum1.y *= C; \ 00274 accum2.x *= C; accum2.y *= C; 00275 00276 #else //spinor access control 00277 00278 #define READ_1ST_NBR_SPINOR_SINGLE(spinor, idx, mystride) \ 00279 float2 I0 = spinor[idx + 0*mystride]; \ 00280 float2 I1 = spinor[idx + 1*mystride]; \ 00281 float2 I2 = spinor[idx + 2*mystride]; 00282 00283 #define READ_3RD_NBR_SPINOR_SINGLE(spinor, idx, mystride) \ 00284 float2 T0 = spinor[idx + 0*mystride]; \ 00285 float2 T1 = spinor[idx + 1*mystride]; \ 00286 float2 T2 = spinor[idx + 2*mystride]; 00287 00288 #define READ_1ST_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \ 00289 double2 I0 = spinor[idx + 0*mystride]; \ 00290 double2 I1 = spinor[idx + 1*mystride]; \ 00291 double2 I2 = spinor[idx + 2*mystride]; 00292 00293 #define READ_3RD_NBR_SPINOR_DOUBLE(spinor, idx, mystride) \ 00294 double2 T0 = spinor[idx + 0*mystride]; \ 00295 double2 T1 = spinor[idx + 1*mystride]; \ 00296 double2 T2 = spinor[idx + 2*mystride]; 00297 00298 #define READ_1ST_NBR_SPINOR_HALF(spinor, idx, mystride) \ 00299 float2 I0, I1, I2; \ 00300 { \ 00301 short2 S0 = in[idx + 0*mystride]; \ 00302 short2 S1 = in[idx + 1*mystride]; \ 00303 short2 S2 = in[idx + 2*mystride]; \ 00304 float C = inNorm[idx]; \ 00305 I0.x =C*short2float(S0.x); I0.y =C*short2float(S0.y); \ 00306 I1.x =C*short2float(S1.x); I1.y =C*short2float(S1.y); \ 00307 I2.x =C*short2float(S2.x); I2.y =C*short2float(S2.y); \ 00308 } 00309 00310 #define READ_3RD_NBR_SPINOR_HALF(spinor, idx, mystride) \ 00311 float2 T0, T1, T2; \ 00312 { \ 00313 short2 S0 = in[idx + 0*mystride]; \ 00314 short2 S1 = in[idx + 1*mystride]; \ 00315 short2 S2 = in[idx + 2*mystride]; \ 00316 float C = inNorm[idx]; \ 00317 T0.x =C*short2float(S0.x); T0.y =C*short2float(S0.y); \ 00318 T1.x =C*short2float(S1.x); T1.y =C*short2float(S1.y); \ 00319 T2.x =C*short2float(S2.x); T2.y =C*short2float(S2.y); \ 00320 } 00321 00322 00323 #define READ_ST_ACCUM_HALF(spinor) \ 00324 float2 accum0, accum1, accum2; \ 00325 { \ 00326 short2 S0 = x[sid + 0*sp_stride]; \ 00327 short2 S1 = x[sid + 1*sp_stride]; \ 00328 short2 S2 = x[sid + 2*sp_stride]; \ 00329 float C = xNorm[sid]; \ 00330 accum0.x =C*short2float(S0.x); accum0.y =C*short2float(S0.y); \ 00331 accum1.x =C*short2float(S1.x); accum1.y =C*short2float(S1.y); \ 00332 accum2.x =C*short2float(S2.x); accum2.y =C*short2float(S2.y); \ 00333 } 00334 00335 #endif //spinor access control 00336 00337 00338 00339 00340 #define WRITE_ST_SPINOR_DOUBLE2() \ 00341 g_out[0*sp_stride+sid] = make_double2(o00_re, o00_im); \ 00342 g_out[1*sp_stride+sid] = make_double2(o01_re, o01_im); \ 00343 g_out[2*sp_stride+sid] = make_double2(o02_re, o02_im); 00344 00345 #define WRITE_ST_SPINOR_FLOAT2() \ 00346 g_out[0*sp_stride+sid] = make_float2(o00_re, o00_im); \ 00347 g_out[1*sp_stride+sid] = make_float2(o01_re, o01_im); \ 00348 g_out[2*sp_stride+sid] = make_float2(o02_re, o02_im); 00349 00350 #define WRITE_ST_SPINOR_SHORT2() \ 00351 float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \ 00352 float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im)); \ 00353 float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im)); \ 00354 c0 = fmaxf(c0, c1); \ 00355 c0 = fmaxf(c0, c2); \ 00356 outNorm[sid] = c0; \ 00357 float scale = __fdividef(MAX_SHORT, c0); \ 00358 o00_re *= scale; o00_im *= scale; o01_re *= scale; o01_im *= scale; \ 00359 o02_re *= scale; o02_im *= scale; \ 00360 g_out[sid+0*sp_stride] = make_short2((short)o00_re, (short)o00_im); \ 00361 g_out[sid+1*sp_stride] = make_short2((short)o01_re, (short)o01_im); \ 00362 g_out[sid+2*sp_stride] = make_short2((short)o02_re, (short)o02_im); 00363 00364 #define READ_AND_SUM_ST_SPINOR() \ 00365 o00_re += g_out[0*sp_stride+sid].x; o00_im += g_out[0*sp_stride+sid].y; \ 00366 o01_re += g_out[1*sp_stride+sid].x; o01_im += g_out[1*sp_stride+sid].y; \ 00367 o02_re += g_out[2*sp_stride+sid].x; o02_im += g_out[2*sp_stride+sid].y; \ 00368 00369 00370 00371 #define READ_AND_SUM_ST_SPINOR_HALF() \ 00372 float C = outNorm[sid]; \ 00373 o00_re += C*short2float(g_out[0*sp_stride + sid].x); \ 00374 o00_im += C*short2float(g_out[0*sp_stride + sid].y); \ 00375 o01_re += C*short2float(g_out[1*sp_stride + sid].x); \ 00376 o01_im += C*short2float(g_out[1*sp_stride + sid].y); \ 00377 o02_re += C*short2float(g_out[2*sp_stride + sid].x); \ 00378 o02_im += C*short2float(g_out[2*sp_stride + sid].y); 00379 00380 #define READ_ST_ACCUM_SINGLE(spinor) \ 00381 float2 accum0 = tex1Dfetch((spinor), sid + 0*sp_stride); \ 00382 float2 accum1 = tex1Dfetch((spinor), sid + 1*sp_stride); \ 00383 float2 accum2 = tex1Dfetch((spinor), sid + 2*sp_stride); 00384 00385 #define READ_ST_SPINOR_HALF(spinor) \ 00386 float2 I0 = tex1Dfetch((spinor), sp_idx + 0*sp_stride); \ 00387 float2 I1 = tex1Dfetch((spinor), sp_idx + 1*sp_stride); \ 00388 float2 I2 = tex1Dfetch((spinor), sp_idx + 2*sp_stride); \ 00389 float C = tex1Dfetch((spinorTexNorm), sp_idx); \ 00390 I0.x *= C; I0.y *= C; \ 00391 I1.x *= C; I1.y *= C; \ 00392 I2.x *= C; I2.y *= C; 00393 00394
1.7.3