QUDA v0.4.0
A library for QCD on GPUs
|
00001 //#include <half_quda.h> 00002 00003 // Performs complex addition 00004 #define COMPLEX_ADD_TO(a, b) \ 00005 a##_re += b##_re, \ 00006 a##_im += b##_im 00007 00008 #define COMPLEX_PRODUCT(a, b, c) \ 00009 a##_re = b##_re*c##_re; \ 00010 a##_re -= b##_im*c##_im; \ 00011 a##_im = b##_re*c##_im; \ 00012 a##_im += b##_im*c##_re 00013 00014 #define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \ 00015 a##_re = b##_re*c##_re; \ 00016 a##_re -= b##_im*c##_im; \ 00017 a##_im = -b##_re*c##_im; \ 00018 a##_im -= b##_im*c##_re 00019 00020 // Performs a complex dot product 00021 #define COMPLEX_DOT_PRODUCT(a, b, c) \ 00022 a##_re = b##_re*c##_re; \ 00023 a##_re += b##_im*c##_im; \ 00024 a##_im = b##_re*c##_im; \ 00025 a##_im -= b##_im*c##_re 00026 00027 // Performs a complex norm 00028 #define COMPLEX_NORM(a, b) \ 00029 a = b##_re*b##_re; \ 00030 a += b##_im*b##_im 00031 00032 #define ACC_COMPLEX_PROD(a, b, c) \ 00033 a##_re += b##_re*c##_re; \ 00034 a##_re -= b##_im*c##_im; \ 00035 a##_im += b##_re*c##_im; \ 00036 a##_im += b##_im*c##_re 00037 00038 // Performs the complex conjugated accumulation: a += b* c* 00039 #define ACC_CONJ_PROD(a, b, c) \ 00040 a##_re += b##_re * c##_re; \ 00041 a##_re -= b##_im * c##_im; \ 00042 a##_im -= b##_re * c##_im; \ 00043 a##_im -= b##_im * c##_re 00044 00045 #define READ_GAUGE_MATRIX_18_FLOAT2_TEX(G, gauge, dir, idx, stride) \ 00046 float2 G##0 = tex1Dfetch((gauge), idx + ((dir/2)*9+0)*stride); \ 00047 float2 G##1 = tex1Dfetch((gauge), idx + ((dir/2)*9+1)*stride); \ 00048 float2 G##2 = tex1Dfetch((gauge), idx + ((dir/2)*9+2)*stride); \ 00049 float2 G##3 = tex1Dfetch((gauge), idx + ((dir/2)*9+3)*stride); \ 00050 float2 G##4 = tex1Dfetch((gauge), idx + ((dir/2)*9+4)*stride); \ 00051 float2 G##5 = tex1Dfetch((gauge), idx + ((dir/2)*9+5)*stride); \ 00052 float2 G##6 = tex1Dfetch((gauge), idx + ((dir/2)*9+6)*stride); \ 00053 float2 G##7 = tex1Dfetch((gauge), idx + ((dir/2)*9+7)*stride); \ 00054 float2 G##8 = tex1Dfetch((gauge), idx + ((dir/2)*9+8)*stride); \ 00055 00056 #define READ_GAUGE_MATRIX_18_SHORT2_TEX(G, gauge, dir, idx, stride) \ 00057 float2 G##0 = tex1Dfetch((gauge), idx + ((dir/2)*9+0)*stride); \ 00058 float2 G##1 = tex1Dfetch((gauge), idx + ((dir/2)*9+1)*stride); \ 00059 float2 G##2 = tex1Dfetch((gauge), idx + ((dir/2)*9+2)*stride); \ 00060 float2 G##3 = tex1Dfetch((gauge), idx + ((dir/2)*9+3)*stride); \ 00061 float2 G##4 = tex1Dfetch((gauge), idx + ((dir/2)*9+4)*stride); \ 00062 float2 G##5 = tex1Dfetch((gauge), idx + ((dir/2)*9+5)*stride); \ 00063 float2 G##6 = tex1Dfetch((gauge), idx + ((dir/2)*9+6)*stride); \ 00064 float2 G##7 = tex1Dfetch((gauge), idx + ((dir/2)*9+7)*stride); \ 00065 float2 G##8 = tex1Dfetch((gauge), idx + ((dir/2)*9+8)*stride); \ 00066 00067 #define READ_GAUGE_MATRIX_12_FLOAT4_TEX(G, gauge, dir, idx, stride) \ 00068 float4 G##0 = tex1Dfetch((gauge), idx + ((dir/2)*3+0)*stride); \ 00069 float4 G##1 = tex1Dfetch((gauge), idx + ((dir/2)*3+1)*stride); \ 00070 float4 G##2 = tex1Dfetch((gauge), idx + ((dir/2)*3+2)*stride); \ 00071 float4 G##3 = make_float4(0,0,0,0); \ 00072 float4 G##4 = make_float4(0,0,0,0); 00073 00074 #define READ_GAUGE_MATRIX_12_SHORT4_TEX(G, gauge, dir, idx, stride) \ 00075 float4 G##0 = tex1Dfetch((gauge), idx + ((dir/2)*3+0)*stride); \ 00076 float4 G##1 = tex1Dfetch((gauge), idx + ((dir/2)*3+1)*stride); \ 00077 float4 G##2 = tex1Dfetch((gauge), idx + ((dir/2)*3+2)*stride); \ 00078 float4 G##3 = make_float4(0,0,0,0); \ 00079 float4 G##4 = make_float4(0,0,0,0); 00080 00081 // set A to be last components of G4 (otherwise unused) 00082 #define READ_GAUGE_MATRIX_8_FLOAT4_TEX(G, gauge, dir, idx, stride) \ 00083 float4 G##0 = tex1Dfetch((gauge), idx + ((dir/2)*2+0)*stride); \ 00084 float4 G##1 = tex1Dfetch((gauge), idx + ((dir/2)*2+1)*stride); \ 00085 float4 G##2 = make_float4(0,0,0,0); \ 00086 float4 G##3 = make_float4(0,0,0,0); \ 00087 float4 G##4 = make_float4(0,0,0,0); \ 00088 (G##3).z = (G##0).x; \ 00089 (G##3).w = (G##0).y; 00090 00091 #define READ_GAUGE_MATRIX_8_SHORT4_TEX(G, gauge, dir, idx, stride) \ 00092 float4 G##0 = tex1Dfetch((gauge), idx + ((dir/2)*2+0)*stride); \ 00093 float4 G##1 = tex1Dfetch((gauge), idx + ((dir/2)*2+1)*stride); \ 00094 float4 G##2 = make_float4(0,0,0,0); \ 00095 float4 G##3 = make_float4(0,0,0,0); \ 00096 float4 G##4 = make_float4(0,0,0,0); \ 00097 (G##3).z = (G##0).x = pi_f*(G##0).x; \ 00098 (G##3).w = (G##0).y = pi_f*(G##0).y; 00099 00100 #define READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) \ 00101 double2 G##0 = gauge[idx + ((dir/2)*9+0)*stride]; \ 00102 double2 G##1 = gauge[idx + ((dir/2)*9+1)*stride]; \ 00103 double2 G##2 = gauge[idx + ((dir/2)*9+2)*stride]; \ 00104 double2 G##3 = gauge[idx + ((dir/2)*9+3)*stride]; \ 00105 double2 G##4 = gauge[idx + ((dir/2)*9+4)*stride]; \ 00106 double2 G##5 = gauge[idx + ((dir/2)*9+5)*stride]; \ 00107 double2 G##6 = gauge[idx + ((dir/2)*9+6)*stride]; \ 00108 double2 G##7 = gauge[idx + ((dir/2)*9+7)*stride]; \ 00109 double2 G##8 = gauge[idx + ((dir/2)*9+8)*stride]; \ 00110 00111 #define READ_GAUGE_MATRIX_18_FLOAT2(G, gauge, dir, idx, stride) \ 00112 float2 G##0 = ((float2*)gauge)[idx + ((dir/2)*9+0)*stride]; \ 00113 float2 G##1 = ((float2*)gauge)[idx + ((dir/2)*9+1)*stride]; \ 00114 float2 G##2 = ((float2*)gauge)[idx + ((dir/2)*9+2)*stride]; \ 00115 float2 G##3 = ((float2*)gauge)[idx + ((dir/2)*9+3)*stride]; \ 00116 float2 G##4 = ((float2*)gauge)[idx + ((dir/2)*9+4)*stride]; \ 00117 float2 G##5 = ((float2*)gauge)[idx + ((dir/2)*9+5)*stride]; \ 00118 float2 G##6 = ((float2*)gauge)[idx + ((dir/2)*9+6)*stride]; \ 00119 float2 G##7 = ((float2*)gauge)[idx + ((dir/2)*9+7)*stride]; \ 00120 float2 G##8 = ((float2*)gauge)[idx + ((dir/2)*9+8)*stride]; \ 00121 00122 #define READ_GAUGE_MATRIX_18_SHORT2(G, gauge, dir, idx, stride) \ 00123 float2 G##0 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+0)*stride]); \ 00124 float2 G##1 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+1)*stride]); \ 00125 float2 G##2 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+2)*stride]); \ 00126 float2 G##3 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+3)*stride]); \ 00127 float2 G##4 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+4)*stride]); \ 00128 float2 G##5 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+5)*stride]); \ 00129 float2 G##6 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+6)*stride]); \ 00130 float2 G##7 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+7)*stride]); \ 00131 float2 G##8 = short22float2(((short2*)gauge)[idx + ((dir/2)*9+8)*stride]); \ 00132 00133 #define READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) \ 00134 double2 G##0 = gauge[idx + ((dir/2)*6+0)*stride]; \ 00135 double2 G##1 = gauge[idx + ((dir/2)*6+1)*stride]; \ 00136 double2 G##2 = gauge[idx + ((dir/2)*6+2)*stride]; \ 00137 double2 G##3 = gauge[idx + ((dir/2)*6+3)*stride]; \ 00138 double2 G##4 = gauge[idx + ((dir/2)*6+4)*stride]; \ 00139 double2 G##5 = gauge[idx + ((dir/2)*6+5)*stride]; \ 00140 double2 G##6 = make_double2(0,0); \ 00141 double2 G##7 = make_double2(0,0); \ 00142 double2 G##8 = make_double2(0,0); \ 00143 00144 #define READ_GAUGE_MATRIX_12_FLOAT4(G, gauge, dir, idx, stride) \ 00145 float4 G##0 = gauge[idx + ((dir/2)*3+0)*stride]; \ 00146 float4 G##1 = gauge[idx + ((dir/2)*3+1)*stride]; \ 00147 float4 G##2 = gauge[idx + ((dir/2)*3+2)*stride]; \ 00148 float4 G##3 = make_float4(0,0,0,0); \ 00149 float4 G##4 = make_float4(0,0,0,0); 00150 00151 #define READ_GAUGE_MATRIX_12_SHORT4(G, gauge, dir, idx, stride) \ 00152 float4 G##0 = short42float4(gauge[idx + ((dir/2)*3+0)*stride]); \ 00153 float4 G##1 = short42float4(gauge[idx + ((dir/2)*3+1)*stride]); \ 00154 float4 G##2 = short42float4(gauge[idx + ((dir/2)*3+2)*stride]); \ 00155 float4 G##3 = make_float4(0,0,0,0); \ 00156 float4 G##4 = make_float4(0,0,0,0); 00157 00158 // set A to be last components of G4 (otherwise unused) 00159 #define READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) \ 00160 double2 G##0 = gauge[idx + ((dir/2)*4+0)*stride]; \ 00161 double2 G##1 = gauge[idx + ((dir/2)*4+1)*stride]; \ 00162 double2 G##2 = gauge[idx + ((dir/2)*4+2)*stride]; \ 00163 double2 G##3 = gauge[idx + ((dir/2)*4+3)*stride]; \ 00164 double2 G##4 = make_double2(0,0); \ 00165 double2 G##5 = make_double2(0,0); \ 00166 double2 G##6 = make_double2(0,0); \ 00167 double2 G##7 = make_double2(0,0); \ 00168 double2 G##8 = make_double2(0,0); \ 00169 double2 G##9 = make_double2(0,0); \ 00170 (G##7).x = (G##0).x; \ 00171 (G##7).y = (G##0).y; 00172 00173 // set A to be last components of G4 (otherwise unused) 00174 #define READ_GAUGE_MATRIX_8_FLOAT4(G, gauge, dir, idx, stride) \ 00175 float4 G##0 = gauge[idx + ((dir/2)*2+0)*stride]; \ 00176 float4 G##1 = gauge[idx + ((dir/2)*2+1)*stride]; \ 00177 float4 G##2 = make_float4(0,0,0,0); \ 00178 float4 G##3 = make_float4(0,0,0,0); \ 00179 float4 G##4 = make_float4(0,0,0,0); \ 00180 (G##3).z = (G##0).x; \ 00181 (G##3).w = (G##0).y; 00182 00183 #define READ_GAUGE_MATRIX_8_SHORT4(G, gauge, dir, idx, stride) \ 00184 float4 G##0 = short42float4(gauge[idx + ((dir/2)*2+0)*stride]); \ 00185 float4 G##1 = short42float4(gauge[idx + ((dir/2)*2+1)*stride]); \ 00186 float4 G##2 = make_float4(0,0,0,0); \ 00187 float4 G##3 = make_float4(0,0,0,0); \ 00188 float4 G##4 = make_float4(0,0,0,0); \ 00189 (G##3).z = (G##0).x = pi_f*(G##0).x; \ 00190 (G##3).w = (G##0).y = pi_f*(G##0).y; 00191 00192 00193 #define RESCALE2(G, max) \ 00194 (G##0).x *= max; (G##0).y *= max; (G##1).x *= max; (G##1).y *= max; \ 00195 (G##2).x *= max; (G##2).y *= max; (G##3).x *= max; (G##3).y *= max; \ 00196 (G##4).x *= max; (G##4).y *= max; (G##5).x *= max; (G##5).y *= max; \ 00197 (G##6).x *= max; (G##6).y *= max; (G##7).x *= max; (G##7).y *= max; \ 00198 (G##8).x *= max; (G##8).y *= max; 00199 00200 // FIXME: merge staggered and Wilson reconstruct macros 00201 00202 #define RECONSTRUCT_MATRIX_18_DOUBLE(dir) 00203 #define RECONSTRUCT_MATRIX_18_SINGLE(dir) 00204 00205 #ifndef MULTI_GPU 00206 #define do_boundary ga_idx >= X4X3X2X1hmX3X2X1h 00207 #else 00208 #define do_boundary ( (Pt0 && (ga_idx >= Vh)) || ( PtNm1 && (ga_idx >= X4X3X2X1hmX3X2X1h) && (ga_idx < Vh) ) ) 00209 #endif 00210 00211 #define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \ 00212 ACC_CONJ_PROD(g20, +g01, +g12); \ 00213 ACC_CONJ_PROD(g20, -g02, +g11); \ 00214 ACC_CONJ_PROD(g21, +g02, +g10); \ 00215 ACC_CONJ_PROD(g21, -g00, +g12); \ 00216 ACC_CONJ_PROD(g22, +g00, +g11); \ 00217 ACC_CONJ_PROD(g22, -g01, +g10); \ 00218 double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \ 00219 G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0; 00220 00221 #define RECONSTRUCT_MATRIX_12_SINGLE(dir) \ 00222 ACC_CONJ_PROD(g20, +g01, +g12); \ 00223 ACC_CONJ_PROD(g20, -g02, +g11); \ 00224 ACC_CONJ_PROD(g21, +g02, +g10); \ 00225 ACC_CONJ_PROD(g21, -g00, +g12); \ 00226 ACC_CONJ_PROD(g22, +g00, +g11); \ 00227 ACC_CONJ_PROD(g22, -g01, +g10); \ 00228 float u0 = (dir < 6 ? anisotropy_f : (do_boundary ? t_boundary_f : 1)); \ 00229 G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0; 00230 00231 #define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \ 00232 double row_sum = g01_re*g01_re; \ 00233 row_sum += g01_im*g01_im; \ 00234 row_sum += g02_re*g02_re; \ 00235 row_sum += g02_im*g02_im; \ 00236 double u0 = (dir < 6 ? anisotropy : (do_boundary ? t_boundary : 1)); \ 00237 double u02_inv = 1.0 / (u0*u0); \ 00238 double column_sum = u02_inv - row_sum; \ 00239 double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \ 00240 sincos(g21_re, &g00_im, &g00_re); \ 00241 g00_re *= U00_mag; \ 00242 g00_im *= U00_mag; \ 00243 column_sum += g10_re*g10_re; \ 00244 column_sum += g10_im*g10_im; \ 00245 sincos(g21_im, &g20_im, &g20_re); \ 00246 double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \ 00247 g20_re *= U20_mag; \ 00248 g20_im *= U20_mag; \ 00249 double r_inv2 = 1.0 / (u0*row_sum); \ 00250 COMPLEX_DOT_PRODUCT(A, g00, g10); \ 00251 A_re *= u0; A_im *= u0; \ 00252 COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \ 00253 ACC_COMPLEX_PROD(g11, A, g01); \ 00254 g11_re *= -r_inv2; \ 00255 g11_im *= -r_inv2; \ 00256 COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \ 00257 ACC_COMPLEX_PROD(g12, -A, g02); \ 00258 g12_re *= r_inv2; \ 00259 g12_im *= r_inv2; \ 00260 COMPLEX_DOT_PRODUCT(A, g00, g20); \ 00261 A_re *= u0; A_im *= u0; \ 00262 COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \ 00263 ACC_COMPLEX_PROD(g21, -A, g01); \ 00264 g21_re *= r_inv2; \ 00265 g21_im *= r_inv2; \ 00266 COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \ 00267 ACC_COMPLEX_PROD(g22, A, g02); \ 00268 g22_re *= -r_inv2; \ 00269 g22_im *= -r_inv2; 00270 00271 #define RECONSTRUCT_MATRIX_8_SINGLE(dir) \ 00272 float row_sum = g01_re*g01_re; \ 00273 row_sum += g01_im*g01_im; \ 00274 row_sum += g02_re*g02_re; \ 00275 row_sum += g02_im*g02_im; \ 00276 __sincosf(g21_re, &g00_im, &g00_re); \ 00277 __sincosf(g21_im, &g20_im, &g20_re); \ 00278 float2 u0_2 = (dir < 6 ? An2 : (do_boundary ? TB2 : No2)); \ 00279 float column_sum = u0_2.y - row_sum; \ 00280 float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \ 00281 g00_re *= U00_mag; \ 00282 g00_im *= U00_mag; \ 00283 column_sum += g10_re*g10_re; \ 00284 column_sum += g10_im*g10_im; \ 00285 column_sum = u0_2.y - column_sum; \ 00286 float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14)); \ 00287 g20_re *= U20_mag; \ 00288 g20_im *= U20_mag; \ 00289 float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \ 00290 COMPLEX_DOT_PRODUCT(A, g00, g10); \ 00291 A_re *= u0_2.x; A_im *= u0_2.x; \ 00292 COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \ 00293 ACC_COMPLEX_PROD(g11, A, g01); \ 00294 g11_re *= -r_inv2; \ 00295 g11_im *= -r_inv2; \ 00296 COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \ 00297 ACC_COMPLEX_PROD(g12, -A, g02); \ 00298 g12_re *= r_inv2; \ 00299 g12_im *= r_inv2; \ 00300 COMPLEX_DOT_PRODUCT(A, g00, g20); \ 00301 A_re *= u0_2.x; A_im *= u0_2.x; \ 00302 COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \ 00303 ACC_COMPLEX_PROD(g21, -A, g01); \ 00304 g21_re *= r_inv2; \ 00305 g21_im *= r_inv2; \ 00306 COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \ 00307 ACC_COMPLEX_PROD(g22, A, g02); \ 00308 g22_re *= -r_inv2; \ 00309 g22_im *= -r_inv2; 00310 00311 00312 00313 /************* the following is added for staggered *********/ 00314 00315 #define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \ 00316 ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \ 00317 ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \ 00318 ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \ 00319 ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \ 00320 ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \ 00321 ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \ 00322 {float u0 = coeff_f*sign; \ 00323 gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \ 00324 gauge##22_re *=u0;gauge##22_im *=u0;} 00325 00326 #define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \ 00327 ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \ 00328 ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \ 00329 ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \ 00330 ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \ 00331 ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \ 00332 ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \ 00333 {double u0 = coeff* sign; \ 00334 gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \ 00335 gauge##22_re *=u0;gauge##22_im *=u0;} 00336 00337 #define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \ 00338 double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \ 00339 row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \ 00340 double u0 = coeff*sign; \ 00341 double u02_inv = 1.0 / (u0*u0); \ 00342 double column_sum = u02_inv - row_sum; \ 00343 double U00_mag = sqrt(column_sum); \ 00344 sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \ 00345 gauge##00_re *= U00_mag; \ 00346 gauge##00_im *= U00_mag; \ 00347 column_sum += gauge##10_re*gauge##10_re; \ 00348 column_sum += gauge##10_im*gauge##10_im; \ 00349 sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \ 00350 double U20_mag = sqrt(u02_inv - column_sum); \ 00351 gauge##20_re *= U20_mag; \ 00352 gauge##20_im *= U20_mag; \ 00353 double r_inv2 = 1.0 / (u0*row_sum); \ 00354 COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \ 00355 A_re *= u0; A_im *= u0; \ 00356 COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \ 00357 ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \ 00358 gauge##11_re *= -r_inv2; \ 00359 gauge##11_im *= -r_inv2; \ 00360 COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \ 00361 ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \ 00362 gauge##12_re *= r_inv2; \ 00363 gauge##12_im *= r_inv2; \ 00364 COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \ 00365 A_re *= u0; A_im *= u0; \ 00366 COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \ 00367 ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \ 00368 gauge##21_re *= r_inv2; \ 00369 gauge##21_im *= r_inv2; \ 00370 COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \ 00371 ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \ 00372 gauge##22_re *= -r_inv2; \ 00373 gauge##22_im *= -r_inv2; 00374 00375 #define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \ 00376 float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \ 00377 row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \ 00378 float u0 = coeff_f*sign; \ 00379 float u02_inv = __fdividef(1.f, u0*u0); \ 00380 float column_sum = u02_inv - row_sum; \ 00381 float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \ 00382 __sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \ 00383 gauge##00_re *= U00_mag; \ 00384 gauge##00_im *= U00_mag; \ 00385 column_sum += gauge##10_re*gauge##10_re; \ 00386 column_sum += gauge##10_im*gauge##10_im; \ 00387 __sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \ 00388 float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \ 00389 gauge##20_re *= U20_mag; \ 00390 gauge##20_im *= U20_mag; \ 00391 float r_inv2 = __fdividef(1.0f, u0*row_sum); \ 00392 COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \ 00393 A_re *= u0; A_im *= u0; \ 00394 COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \ 00395 ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \ 00396 gauge##11_re *= -r_inv2; \ 00397 gauge##11_im *= -r_inv2; \ 00398 COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \ 00399 ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \ 00400 gauge##12_re *= r_inv2; \ 00401 gauge##12_im *= r_inv2; \ 00402 COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \ 00403 A_re *= u0; A_im *= u0; \ 00404 COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \ 00405 ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \ 00406 gauge##21_re *= r_inv2; \ 00407 gauge##21_im *= r_inv2; \ 00408 COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \ 00409 ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \ 00410 gauge##22_re *= -r_inv2; \ 00411 gauge##22_im *= -r_inv2;} 00412 00413 // Fermi patch to disable double-precision texture reads 00414 #ifdef FERMI_NO_DBLE_TEX 00415 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \ 00416 READ_GAUGE_MATRIX_18_DOUBLE2(G, gauge, dir, idx, stride) 00417 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \ 00418 READ_GAUGE_MATRIX_12_DOUBLE2(G, gauge, dir, idx, stride) 00419 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \ 00420 READ_GAUGE_MATRIX_8_DOUBLE2(G, gauge, dir, idx, stride) 00421 #else 00422 #define READ_GAUGE_MATRIX_18_DOUBLE2_TEX(G, gauge, dir, idx, stride) \ 00423 double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*9+0)*stride); \ 00424 double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*9+1)*stride); \ 00425 double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*9+2)*stride); \ 00426 double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*9+3)*stride); \ 00427 double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*9+4)*stride); \ 00428 double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*9+5)*stride); \ 00429 double2 G##6 = fetch_double2((gauge), idx + ((dir/2)*9+6)*stride); \ 00430 double2 G##7 = fetch_double2((gauge), idx + ((dir/2)*9+7)*stride); \ 00431 double2 G##8 = fetch_double2((gauge), idx + ((dir/2)*9+8)*stride); \ 00432 00433 #define READ_GAUGE_MATRIX_12_DOUBLE2_TEX(G, gauge, dir, idx, stride) \ 00434 double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*6+0)*stride); \ 00435 double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*6+1)*stride); \ 00436 double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*6+2)*stride); \ 00437 double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*6+3)*stride); \ 00438 double2 G##4 = fetch_double2((gauge), idx + ((dir/2)*6+4)*stride); \ 00439 double2 G##5 = fetch_double2((gauge), idx + ((dir/2)*6+5)*stride); \ 00440 double2 G##6 = make_double2(0,0); \ 00441 double2 G##7 = make_double2(0,0); \ 00442 double2 G##8 = make_double2(0,0); \ 00443 00444 // set A to be last components of G4 (otherwise unused) 00445 #define READ_GAUGE_MATRIX_8_DOUBLE2_TEX(G, gauge, dir, idx, stride) \ 00446 double2 G##0 = fetch_double2((gauge), idx + ((dir/2)*4+0)*stride); \ 00447 double2 G##1 = fetch_double2((gauge), idx + ((dir/2)*4+1)*stride); \ 00448 double2 G##2 = fetch_double2((gauge), idx + ((dir/2)*4+2)*stride); \ 00449 double2 G##3 = fetch_double2((gauge), idx + ((dir/2)*4+3)*stride); \ 00450 double2 G##4 = make_double2(0,0); \ 00451 double2 G##5 = make_double2(0,0); \ 00452 double2 G##6 = make_double2(0,0); \ 00453 double2 G##7 = make_double2(0,0); \ 00454 double2 G##8 = make_double2(0,0); \ 00455 double2 G##9 = make_double2(0,0); \ 00456 (G##7).x = (G##0).x; \ 00457 (G##7).y = (G##0).y; 00458 00459 #endif 00460