// QUDA v0.3.2 -- A library for QCD on GPUs
/*
 * Macros for complex arithmetic on split real/imaginary variables and for
 * loading / reconstructing 3x3 link matrices.
 *
 * Conventions that can be read off the definitions below:
 *
 *  - A complex quantity "z" is a pair of lvalues named z_re and z_im.  The
 *    arithmetic macros build those names by token pasting, so their
 *    arguments must be bare identifiers, optionally preceded by a unary
 *    sign (e.g. ACC_COMPLEX_PROD(g12, -A, g02) expands to uses of -A_re).
 *
 *  - The READ_* macros *declare* local variables (G0.., FAT0.., LONG0..) in
 *    the scope where they are expanded; they therefore cannot be wrapped in
 *    a block and can be used at most once per scope.
 *
 *  - The matrix is stored as "dir/2" consecutive groups of N vector
 *    elements, element k of the group living at
 *        index + ((dir/2)*N + k) * stride.
 *    "18" variants load nine 2-component values, "12" variants load six
 *    2-component (or three 4-component) values, "8" variants load four
 *    2-component (or two 4-component) values; the remaining variables are
 *    zero-initialised.
 *
 *  - The following names are NOT defined here and must be supplied by the
 *    including code: g00_re .. g22_im, long00_re .. long21_im, A_re/A_im,
 *    ga_idx, ga_stride, fat_ga_stride, long_ga_stride, anisotropy(_f),
 *    t_boundary(_f), An2, TB2, No2, coeff(_f), pi_f, X4X3X2X1hmX3X2X1h,
 *    fetch_double2.
 *    NOTE(review): gXY_re/gXY_im are presumably accessor macros over the
 *    components of G0..G9 (and longXY_* over LONG0..) -- confirm against
 *    the including file.
 */

// Integer division a/b.  Arguments are parenthesised so that expression
// arguments such as FAST_INT_DIVIDE(x + 1, y) divide as intended.
#define FAST_INT_DIVIDE(a, b) ((a) / (b))

// Performs complex addition: a += b  (single expression, comma-separated)
#define COMPLEX_ADD_TO(a, b) \
  a##_re += b##_re, \
  a##_im += b##_im

// Complex product: a = b * c
#define COMPLEX_PRODUCT(a, b, c) \
  a##_re = b##_re*c##_re; \
  a##_re -= b##_im*c##_im; \
  a##_im = b##_re*c##_im; \
  a##_im += b##_im*c##_re

// Conjugated complex product: a = conj(b * c) = conj(b) * conj(c)
#define COMPLEX_CONJUGATE_PRODUCT(a, b, c) \
  a##_re = b##_re*c##_re; \
  a##_re -= b##_im*c##_im; \
  a##_im = -b##_re*c##_im; \
  a##_im -= b##_im*c##_re

// Performs a complex dot product: a = conj(b) * c
#define COMPLEX_DOT_PRODUCT(a, b, c) \
  a##_re = b##_re*c##_re; \
  a##_re += b##_im*c##_im; \
  a##_im = b##_re*c##_im; \
  a##_im -= b##_im*c##_re

// Performs a complex norm: a = |b|^2  (a is a real scalar)
#define COMPLEX_NORM(a, b) \
  a = b##_re*b##_re; \
  a += b##_im*b##_im

// Accumulated complex product: a += b * c
#define ACC_COMPLEX_PROD(a, b, c) \
  a##_re += b##_re*c##_re; \
  a##_re -= b##_im*c##_im; \
  a##_im += b##_re*c##_im; \
  a##_im += b##_im*c##_re

// Performs the complex conjugated accumulation: a += b* c*
#define ACC_CONJ_PROD(a, b, c) \
  a##_re += b##_re * c##_re; \
  a##_re -= b##_im * c##_im; \
  a##_im -= b##_re * c##_im; \
  a##_im -= b##_im * c##_re

// Load all nine double2 elements of link "dir/2" into G0..G8; G9 is zeroed.
#define READ_GAUGE_MATRIX_18_DOUBLE(gauge, dir) \
  double2 G0 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+0)*ga_stride); \
  double2 G1 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+1)*ga_stride); \
  double2 G2 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+2)*ga_stride); \
  double2 G3 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+3)*ga_stride); \
  double2 G4 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+4)*ga_stride); \
  double2 G5 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+5)*ga_stride); \
  double2 G6 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+6)*ga_stride); \
  double2 G7 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+7)*ga_stride); \
  double2 G8 = fetch_double2((gauge), ga_idx + (((dir)/2)*9+8)*ga_stride); \
  double2 G9 = make_double2(0,0);

// Load all nine float2 elements of link "dir/2" into G0..G8; G9 is zeroed.
#define READ_GAUGE_MATRIX_18_SINGLE(gauge, dir) \
  float2 G0 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+0)*ga_stride); \
  float2 G1 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+1)*ga_stride); \
  float2 G2 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+2)*ga_stride); \
  float2 G3 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+3)*ga_stride); \
  float2 G4 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+4)*ga_stride); \
  float2 G5 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+5)*ga_stride); \
  float2 G6 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+6)*ga_stride); \
  float2 G7 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+7)*ga_stride); \
  float2 G8 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*9+8)*ga_stride); \
  float2 G9 = make_float2(0,0);

// Nothing to reconstruct when all 18 reals were loaded: both expand to nothing.
#define RECONSTRUCT_MATRIX_18_DOUBLE(dir)

#define RECONSTRUCT_MATRIX_18_SINGLE(dir)

// Load six double2 elements into G0..G5; G6..G9 are zeroed so that
// RECONSTRUCT_MATRIX_12_DOUBLE can accumulate into them.
#define READ_GAUGE_MATRIX_12_DOUBLE(gauge, dir) \
  double2 G0 = fetch_double2((gauge), ga_idx + (((dir)/2)*6+0)*ga_stride); \
  double2 G1 = fetch_double2((gauge), ga_idx + (((dir)/2)*6+1)*ga_stride); \
  double2 G2 = fetch_double2((gauge), ga_idx + (((dir)/2)*6+2)*ga_stride); \
  double2 G3 = fetch_double2((gauge), ga_idx + (((dir)/2)*6+3)*ga_stride); \
  double2 G4 = fetch_double2((gauge), ga_idx + (((dir)/2)*6+4)*ga_stride); \
  double2 G5 = fetch_double2((gauge), ga_idx + (((dir)/2)*6+5)*ga_stride); \
  double2 G6 = make_double2(0,0); \
  double2 G7 = make_double2(0,0); \
  double2 G8 = make_double2(0,0); \
  double2 G9 = make_double2(0,0);

// Load three float4 elements into G0..G2; G3 and G4 are zeroed.
#define READ_GAUGE_MATRIX_12_SINGLE(gauge, dir) \
  float4 G0 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*3+0)*ga_stride); \
  float4 G1 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*3+1)*ga_stride); \
  float4 G2 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*3+2)*ga_stride); \
  float4 G3 = make_float4(0,0,0,0); \
  float4 G4 = make_float4(0,0,0,0);

// Accumulate g2k += conj(g0i*g1j - g0j*g1i) for the three cyclic (i,j,k),
// then scale G6..G8 by u0.  u0 is "anisotropy" for dir < 6, otherwise
// "t_boundary" when ga_idx >= X4X3X2X1hmX3X2X1h and 1 below that index.
// Declares the local "u0".
#define RECONSTRUCT_MATRIX_12_DOUBLE(dir) \
  ACC_CONJ_PROD(g20, +g01, +g12); \
  ACC_CONJ_PROD(g20, -g02, +g11); \
  ACC_CONJ_PROD(g21, +g02, +g10); \
  ACC_CONJ_PROD(g21, -g00, +g12); \
  ACC_CONJ_PROD(g22, +g00, +g11); \
  ACC_CONJ_PROD(g22, -g01, +g10); \
  double u0 = ((dir) < 6 ? anisotropy : (ga_idx >= X4X3X2X1hmX3X2X1h ? t_boundary : 1)); \
  G6.x*=u0; G6.y*=u0; G7.x*=u0; G7.y*=u0; G8.x*=u0; G8.y*=u0;

// Single-precision counterpart of RECONSTRUCT_MATRIX_12_DOUBLE; the scaled
// components are G3.x..G3.w and G4.x, G4.y.  Declares the local "u0".
#define RECONSTRUCT_MATRIX_12_SINGLE(dir) \
  ACC_CONJ_PROD(g20, +g01, +g12); \
  ACC_CONJ_PROD(g20, -g02, +g11); \
  ACC_CONJ_PROD(g21, +g02, +g10); \
  ACC_CONJ_PROD(g21, -g00, +g12); \
  ACC_CONJ_PROD(g22, +g00, +g11); \
  ACC_CONJ_PROD(g22, -g01, +g10); \
  float u0 = ((dir) < 6 ? anisotropy_f : (ga_idx >= X4X3X2X1hmX3X2X1h ? t_boundary_f : 1)); \
  G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;


// set A to be last components of G4 (otherwise unused)
// Loads four double2 elements into G0..G3, zeroes G4..G9 and copies the
// loaded g00 pair into g21, where RECONSTRUCT_MATRIX_8_DOUBLE reads it as
// the two angles passed to sincos().
#define READ_GAUGE_MATRIX_8_DOUBLE(gauge, dir) \
  double2 G0 = fetch_double2((gauge), ga_idx + (((dir)/2)*4+0)*ga_stride); \
  double2 G1 = fetch_double2((gauge), ga_idx + (((dir)/2)*4+1)*ga_stride); \
  double2 G2 = fetch_double2((gauge), ga_idx + (((dir)/2)*4+2)*ga_stride); \
  double2 G3 = fetch_double2((gauge), ga_idx + (((dir)/2)*4+3)*ga_stride); \
  double2 G4 = make_double2(0,0); \
  double2 G5 = make_double2(0,0); \
  double2 G6 = make_double2(0,0); \
  double2 G7 = make_double2(0,0); \
  double2 G8 = make_double2(0,0); \
  double2 G9 = make_double2(0,0); \
  g21_re = g00_re; \
  g21_im = g00_im;

// set A to be last components of G4 (otherwise unused)
// Loads two float4 elements into G0, G1, zeroes G2..G4 and copies the
// loaded g00 pair into g21 for RECONSTRUCT_MATRIX_8_SINGLE.
#define READ_GAUGE_MATRIX_8_SINGLE(gauge, dir) \
  float4 G0 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*2+0)*ga_stride); \
  float4 G1 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*2+1)*ga_stride); \
  float4 G2 = make_float4(0,0,0,0); \
  float4 G3 = make_float4(0,0,0,0); \
  float4 G4 = make_float4(0,0,0,0); \
  g21_re = g00_re; \
  g21_im = g00_im;

// As READ_GAUGE_MATRIX_8_SINGLE, but the value copied into g21 is the
// loaded g00 pair multiplied by pi_f.
#define READ_GAUGE_MATRIX_8_HALF(gauge, dir) \
  float4 G0 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*2+0)*ga_stride); \
  float4 G1 = tex1Dfetch((gauge), ga_idx + (((dir)/2)*2+1)*ga_stride); \
  float4 G2 = make_float4(0,0,0,0); \
  float4 G3 = make_float4(0,0,0,0); \
  float4 G4 = make_float4(0,0,0,0); \
  g21_re = pi_f*g00_re; \
  g21_im = pi_f*g00_im;

// Rebuild the remaining matrix elements from g01, g02, g10 and the two
// angles held in g21_re / g21_im:
//   - |g00| and |g20| are obtained from 1/u0^2 minus the squared moduli
//     already known for that row / column (clamped at 0 before sqrt);
//   - g00 and g20 get their phases from sincos() of the two angles;
//   - g11, g12, g21, g22 are then formed from conjugate products and
//     scaled by +-1/(u0*row_sum).
// Statement order matters: g21 is read as an angle before it is overwritten.
// Declares the locals row_sum, u0, u02_inv, column_sum, U00_mag, U20_mag,
// r_inv2 and uses the scratch pair A_re/A_im.
#define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
  double row_sum = g01_re*g01_re; \
  row_sum += g01_im*g01_im; \
  row_sum += g02_re*g02_re; \
  row_sum += g02_im*g02_im; \
  double u0 = ((dir) < 6 ? anisotropy : (ga_idx >= X4X3X2X1hmX3X2X1h ? t_boundary : 1)); \
  double u02_inv = 1.0 / (u0*u0); \
  double column_sum = u02_inv - row_sum; \
  double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
  sincos(g21_re, &g00_im, &g00_re); \
  g00_re *= U00_mag; \
  g00_im *= U00_mag; \
  column_sum += g10_re*g10_re; \
  column_sum += g10_im*g10_im; \
  sincos(g21_im, &g20_im, &g20_re); \
  double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
  g20_re *= U20_mag; \
  g20_im *= U20_mag; \
  double r_inv2 = 1.0 / (u0*row_sum); \
  COMPLEX_DOT_PRODUCT(A, g00, g10); \
  A_re *= u0; A_im *= u0; \
  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
  ACC_COMPLEX_PROD(g11, A, g01); \
  g11_re *= -r_inv2; \
  g11_im *= -r_inv2; \
  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
  ACC_COMPLEX_PROD(g12, -A, g02); \
  g12_re *= r_inv2; \
  g12_im *= r_inv2; \
  COMPLEX_DOT_PRODUCT(A, g00, g20); \
  A_re *= u0; A_im *= u0; \
  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
  ACC_COMPLEX_PROD(g21, -A, g01); \
  g21_re *= r_inv2; \
  g21_im *= r_inv2; \
  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
  ACC_COMPLEX_PROD(g22, A, g02); \
  g22_re *= -r_inv2; \
  g22_im *= -r_inv2;


// use __saturate ?
// float U00_mag = sqrtf(__saturatef(column_sum));
// float U20_mag = sqrtf(__saturatef(column_sum));

// Single-precision counterpart of RECONSTRUCT_MATRIX_8_DOUBLE.
// u0_2 is a float2 chosen from An2 / TB2 / No2 with the same dir / ga_idx
// test; its .x is used where the double version uses u0 and its .y where
// the double version uses 1/u0^2.
// The magnitudes are computed as x * rsqrtf(x); when x <= 0 the rsqrtf
// argument is replaced by 1e14f, so the result is x * 1e-7 instead of a
// NaN.  (The literal carries an 'f' suffix so the selection stays in
// single precision; the value handed to rsqrtf is unchanged.)
#define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
  float row_sum = g01_re*g01_re; \
  row_sum += g01_im*g01_im; \
  row_sum += g02_re*g02_re; \
  row_sum += g02_im*g02_im; \
  __sincosf(g21_re, &g00_im, &g00_re); \
  __sincosf(g21_im, &g20_im, &g20_re); \
  float2 u0_2 = ((dir) < 6 ? An2 : (ga_idx >= X4X3X2X1hmX3X2X1h ? TB2 : No2)); \
  float column_sum = u0_2.y - row_sum; \
  float U00_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14f)); \
  g00_re *= U00_mag; \
  g00_im *= U00_mag; \
  column_sum += g10_re*g10_re; \
  column_sum += g10_im*g10_im; \
  column_sum = u0_2.y - column_sum; \
  float U20_mag = column_sum * rsqrtf((column_sum > 0 ? column_sum : 1e14f)); \
  g20_re *= U20_mag; \
  g20_im *= U20_mag; \
  float r_inv2 = __fdividef(1.0f, u0_2.x*row_sum); \
  COMPLEX_DOT_PRODUCT(A, g00, g10); \
  A_re *= u0_2.x; A_im *= u0_2.x; \
  COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
  ACC_COMPLEX_PROD(g11, A, g01); \
  g11_re *= -r_inv2; \
  g11_im *= -r_inv2; \
  COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
  ACC_COMPLEX_PROD(g12, -A, g02); \
  g12_re *= r_inv2; \
  g12_im *= r_inv2; \
  COMPLEX_DOT_PRODUCT(A, g00, g20); \
  A_re *= u0_2.x; A_im *= u0_2.x; \
  COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
  ACC_COMPLEX_PROD(g21, -A, g01); \
  g21_re *= r_inv2; \
  g21_im *= r_inv2; \
  COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
  ACC_COMPLEX_PROD(g22, A, g02); \
  g22_re *= -r_inv2; \
  g22_im *= -r_inv2;



/************* the following is added for staggered *********/

// Same reconstruction as RECONSTRUCT_MATRIX_8_DOUBLE, but the element names
// are built from the "gauge" prefix (gauge##00_re, ...) and u0 = coeff*sign.
// "dir" and "idx" are accepted but not used by the expansion.
// Both square roots clamp their argument at 0, matching the other three
// 8-parameter reconstruction macros, so a slightly negative rounding result
// yields 0 rather than NaN.
// Not wrapped in braces: declares row_sum, u0, u02_inv, column_sum,
// U00_mag, U20_mag, r_inv2 in the expanding scope.
#define RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE(dir, gauge, idx, sign) \
  double row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
  row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
  double u0 = coeff*(sign); \
  double u02_inv = 1.0 / (u0*u0); \
  double column_sum = u02_inv - row_sum; \
  double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
  sincos(gauge##21_re, &gauge##00_im, &gauge##00_re); \
  gauge##00_re *= U00_mag; \
  gauge##00_im *= U00_mag; \
  column_sum += gauge##10_re*gauge##10_re; \
  column_sum += gauge##10_im*gauge##10_im; \
  sincos(gauge##21_im, &gauge##20_im, &gauge##20_re); \
  double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv - column_sum) : 0)); \
  gauge##20_re *= U20_mag; \
  gauge##20_im *= U20_mag; \
  double r_inv2 = 1.0 / (u0*row_sum); \
  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
  A_re *= u0; A_im *= u0; \
  COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
  ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
  gauge##11_re *= -r_inv2; \
  gauge##11_im *= -r_inv2; \
  COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
  ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
  gauge##12_re *= r_inv2; \
  gauge##12_im *= r_inv2; \
  COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
  A_re *= u0; A_im *= u0; \
  COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
  ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
  gauge##21_re *= r_inv2; \
  gauge##21_im *= r_inv2; \
  COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
  ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
  gauge##22_re *= -r_inv2; \
  gauge##22_im *= -r_inv2;


// Accumulate the conjugated cross product of the first two rows into
// gauge##20..22, then scale those three elements by u0 = coeff_f*sign.
// u0 lives in its own block, so nothing is declared in the caller's scope.
#define RECONSTRUCT_GAUGE_MATRIX_12_SINGLE(dir, gauge, idx, sign) \
  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
  {float u0 = coeff_f*(sign); \
    gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
    gauge##22_re *=u0;gauge##22_im *=u0;}

// Double-precision counterpart of RECONSTRUCT_GAUGE_MATRIX_12_SINGLE
// (u0 = coeff*sign).
#define RECONSTRUCT_GAUGE_MATRIX_12_DOUBLE(dir, gauge, idx, sign) \
  ACC_CONJ_PROD(gauge##20, +gauge##01, +gauge##12); \
  ACC_CONJ_PROD(gauge##20, -gauge##02, +gauge##11); \
  ACC_CONJ_PROD(gauge##21, +gauge##02, +gauge##10); \
  ACC_CONJ_PROD(gauge##21, -gauge##00, +gauge##12); \
  ACC_CONJ_PROD(gauge##22, +gauge##00, +gauge##11); \
  ACC_CONJ_PROD(gauge##22, -gauge##01, +gauge##10); \
  {double u0 = coeff*(sign); \
    gauge##20_re *=u0;gauge##20_im *=u0; gauge##21_re *=u0; gauge##21_im *=u0; \
    gauge##22_re *=u0;gauge##22_im *=u0;}


// Single-precision counterpart of RECONSTRUCT_GAUGE_MATRIX_8_DOUBLE
// (u0 = coeff_f*sign).  The whole expansion is one braced block, so its
// temporaries do not leak into the caller's scope.
#define RECONSTRUCT_GAUGE_MATRIX_8_SINGLE(dir, gauge, idx, sign) { \
    float row_sum = gauge##01_re*gauge##01_re + gauge##01_im*gauge##01_im; \
    row_sum += gauge##02_re*gauge##02_re + gauge##02_im*gauge##02_im; \
    float u0 = coeff_f*(sign); \
    float u02_inv = __fdividef(1.f, u0*u0); \
    float column_sum = u02_inv - row_sum; \
    float U00_mag = sqrtf(column_sum > 0 ?column_sum:0); \
    __sincosf(gauge##21_re, &gauge##00_im, &gauge##00_re); \
    gauge##00_re *= U00_mag; \
    gauge##00_im *= U00_mag; \
    column_sum += gauge##10_re*gauge##10_re; \
    column_sum += gauge##10_im*gauge##10_im; \
    __sincosf(gauge##21_im, &gauge##20_im, &gauge##20_re); \
    float U20_mag = sqrtf( (u02_inv - column_sum)>0? (u02_inv - column_sum): 0); \
    gauge##20_re *= U20_mag; \
    gauge##20_im *= U20_mag; \
    float r_inv2 = __fdividef(1.0f, u0*row_sum); \
    COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##10); \
    A_re *= u0; A_im *= u0; \
    COMPLEX_CONJUGATE_PRODUCT(gauge##11, gauge##20, gauge##02); \
    ACC_COMPLEX_PROD(gauge##11, A, gauge##01); \
    gauge##11_re *= -r_inv2; \
    gauge##11_im *= -r_inv2; \
    COMPLEX_CONJUGATE_PRODUCT(gauge##12, gauge##20, gauge##01); \
    ACC_COMPLEX_PROD(gauge##12, -A, gauge##02); \
    gauge##12_re *= r_inv2; \
    gauge##12_im *= r_inv2; \
    COMPLEX_DOT_PRODUCT(A, gauge##00, gauge##20); \
    A_re *= u0; A_im *= u0; \
    COMPLEX_CONJUGATE_PRODUCT(gauge##21, gauge##10, gauge##02); \
    ACC_COMPLEX_PROD(gauge##21, -A, gauge##01); \
    gauge##21_re *= r_inv2; \
    gauge##21_im *= r_inv2; \
    COMPLEX_CONJUGATE_PRODUCT(gauge##22, gauge##10, gauge##01); \
    ACC_COMPLEX_PROD(gauge##22, A, gauge##02); \
    gauge##22_re *= -r_inv2; \
    gauge##22_im *= -r_inv2;}

// Fat-link loaders: nine 2-component values into FAT0..FAT8.
// Without DIRECT_ACCESS_FAT_LINK the values come from tex1Dfetch /
// fetch_double2; with it, "gauge" is indexed directly as an array.
#ifndef DIRECT_ACCESS_FAT_LINK
#define READ_FAT_MATRIX_18_SINGLE(gauge, dir, idx) \
  float2 FAT0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+0)*fat_ga_stride); \
  float2 FAT1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+1)*fat_ga_stride); \
  float2 FAT2 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+2)*fat_ga_stride); \
  float2 FAT3 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+3)*fat_ga_stride); \
  float2 FAT4 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+4)*fat_ga_stride); \
  float2 FAT5 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+5)*fat_ga_stride); \
  float2 FAT6 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+6)*fat_ga_stride); \
  float2 FAT7 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+7)*fat_ga_stride); \
  float2 FAT8 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+8)*fat_ga_stride);



#define READ_FAT_MATRIX_18_DOUBLE(gauge, dir, idx) \
  double2 FAT0 = fetch_double2((gauge), (idx) + (((dir)/2)*9+0)*fat_ga_stride); \
  double2 FAT1 = fetch_double2((gauge), (idx) + (((dir)/2)*9+1)*fat_ga_stride); \
  double2 FAT2 = fetch_double2((gauge), (idx) + (((dir)/2)*9+2)*fat_ga_stride); \
  double2 FAT3 = fetch_double2((gauge), (idx) + (((dir)/2)*9+3)*fat_ga_stride); \
  double2 FAT4 = fetch_double2((gauge), (idx) + (((dir)/2)*9+4)*fat_ga_stride); \
  double2 FAT5 = fetch_double2((gauge), (idx) + (((dir)/2)*9+5)*fat_ga_stride); \
  double2 FAT6 = fetch_double2((gauge), (idx) + (((dir)/2)*9+6)*fat_ga_stride); \
  double2 FAT7 = fetch_double2((gauge), (idx) + (((dir)/2)*9+7)*fat_ga_stride); \
  double2 FAT8 = fetch_double2((gauge), (idx) + (((dir)/2)*9+8)*fat_ga_stride);



#else
#define READ_FAT_MATRIX_18_SINGLE(gauge, dir, idx) \
  float2 FAT0 = (gauge)[(idx) + (((dir)/2)*9+0)*fat_ga_stride]; \
  float2 FAT1 = (gauge)[(idx) + (((dir)/2)*9+1)*fat_ga_stride]; \
  float2 FAT2 = (gauge)[(idx) + (((dir)/2)*9+2)*fat_ga_stride]; \
  float2 FAT3 = (gauge)[(idx) + (((dir)/2)*9+3)*fat_ga_stride]; \
  float2 FAT4 = (gauge)[(idx) + (((dir)/2)*9+4)*fat_ga_stride]; \
  float2 FAT5 = (gauge)[(idx) + (((dir)/2)*9+5)*fat_ga_stride]; \
  float2 FAT6 = (gauge)[(idx) + (((dir)/2)*9+6)*fat_ga_stride]; \
  float2 FAT7 = (gauge)[(idx) + (((dir)/2)*9+7)*fat_ga_stride]; \
  float2 FAT8 = (gauge)[(idx) + (((dir)/2)*9+8)*fat_ga_stride];


#define READ_FAT_MATRIX_18_DOUBLE(gauge, dir, idx) \
  double2 FAT0 = (gauge)[(idx) + (((dir)/2)*9+0)*fat_ga_stride]; \
  double2 FAT1 = (gauge)[(idx) + (((dir)/2)*9+1)*fat_ga_stride]; \
  double2 FAT2 = (gauge)[(idx) + (((dir)/2)*9+2)*fat_ga_stride]; \
  double2 FAT3 = (gauge)[(idx) + (((dir)/2)*9+3)*fat_ga_stride]; \
  double2 FAT4 = (gauge)[(idx) + (((dir)/2)*9+4)*fat_ga_stride]; \
  double2 FAT5 = (gauge)[(idx) + (((dir)/2)*9+5)*fat_ga_stride]; \
  double2 FAT6 = (gauge)[(idx) + (((dir)/2)*9+6)*fat_ga_stride]; \
  double2 FAT7 = (gauge)[(idx) + (((dir)/2)*9+7)*fat_ga_stride]; \
  double2 FAT8 = (gauge)[(idx) + (((dir)/2)*9+8)*fat_ga_stride];

#endif


// The HALF fat-link loader always goes through tex1Dfetch, regardless of
// DIRECT_ACCESS_FAT_LINK.
#define READ_FAT_MATRIX_18_HALF(gauge, dir, idx) \
  float2 FAT0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+0)*fat_ga_stride); \
  float2 FAT1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+1)*fat_ga_stride); \
  float2 FAT2 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+2)*fat_ga_stride); \
  float2 FAT3 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+3)*fat_ga_stride); \
  float2 FAT4 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+4)*fat_ga_stride); \
  float2 FAT5 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+5)*fat_ga_stride); \
  float2 FAT6 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+6)*fat_ga_stride); \
  float2 FAT7 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+7)*fat_ga_stride); \
  float2 FAT8 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+8)*fat_ga_stride);


// Long-link loaders into LONG0...  Layouts mirror the READ_GAUGE_MATRIX_*
// macros (12: six 2-component or three 4-component values; 8: four
// 2-component or two 4-component values plus a copy of long00 into long21;
// 18: nine 2-component values), with "idx" and long_ga_stride as the
// addressing inputs.
#ifndef DIRECT_ACCESS_LONG_LINK //longlink access

#define READ_LONG_MATRIX_12_SINGLE(gauge, dir, idx) \
  float4 LONG0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*3+0)*long_ga_stride); \
  float4 LONG1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*3+1)*long_ga_stride); \
  float4 LONG2 = tex1Dfetch((gauge), (idx) + (((dir)/2)*3+2)*long_ga_stride); \
  float4 LONG3 = make_float4(0,0,0,0); \
  float4 LONG4 = make_float4(0,0,0,0);
#define READ_LONG_MATRIX_8_SINGLE(gauge, dir, idx) \
  float4 LONG0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*2+0)*long_ga_stride); \
  float4 LONG1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*2+1)*long_ga_stride); \
  float4 LONG2 = make_float4(0,0,0,0); \
  float4 LONG3 = make_float4(0,0,0,0); \
  float4 LONG4 = make_float4(0,0,0,0); \
  long21_re = long00_re; \
  long21_im = long00_im;
#define READ_LONG_MATRIX_18_SINGLE(gauge, dir, idx) \
  float2 LONG0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+0)*long_ga_stride); \
  float2 LONG1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+1)*long_ga_stride); \
  float2 LONG2 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+2)*long_ga_stride); \
  float2 LONG3 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+3)*long_ga_stride); \
  float2 LONG4 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+4)*long_ga_stride); \
  float2 LONG5 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+5)*long_ga_stride); \
  float2 LONG6 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+6)*long_ga_stride); \
  float2 LONG7 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+7)*long_ga_stride); \
  float2 LONG8 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+8)*long_ga_stride);

#define READ_LONG_MATRIX_12_DOUBLE(gauge, dir, idx) \
  double2 LONG0 = fetch_double2((gauge), (idx) + (((dir)/2)*6+0)*long_ga_stride); \
  double2 LONG1 = fetch_double2((gauge), (idx) + (((dir)/2)*6+1)*long_ga_stride); \
  double2 LONG2 = fetch_double2((gauge), (idx) + (((dir)/2)*6+2)*long_ga_stride); \
  double2 LONG3 = fetch_double2((gauge), (idx) + (((dir)/2)*6+3)*long_ga_stride); \
  double2 LONG4 = fetch_double2((gauge), (idx) + (((dir)/2)*6+4)*long_ga_stride); \
  double2 LONG5 = fetch_double2((gauge), (idx) + (((dir)/2)*6+5)*long_ga_stride); \
  double2 LONG6 = make_double2(0,0); \
  double2 LONG7 = make_double2(0,0); \
  double2 LONG8 = make_double2(0,0); \
  double2 LONG9 = make_double2(0,0);

#define READ_LONG_MATRIX_8_DOUBLE(gauge, dir, idx) \
  double2 LONG0 = fetch_double2((gauge), (idx) + (((dir)/2)*4+0)*long_ga_stride); \
  double2 LONG1 = fetch_double2((gauge), (idx) + (((dir)/2)*4+1)*long_ga_stride); \
  double2 LONG2 = fetch_double2((gauge), (idx) + (((dir)/2)*4+2)*long_ga_stride); \
  double2 LONG3 = fetch_double2((gauge), (idx) + (((dir)/2)*4+3)*long_ga_stride); \
  double2 LONG4 = make_double2(0,0); \
  double2 LONG5 = make_double2(0,0); \
  double2 LONG6 = make_double2(0,0); \
  double2 LONG7 = make_double2(0,0); \
  double2 LONG8 = make_double2(0,0); \
  double2 LONG9 = make_double2(0,0); \
  long21_re = long00_re; \
  long21_im = long00_im;

#define READ_LONG_MATRIX_18_DOUBLE(gauge, dir, idx) \
  double2 LONG0 = fetch_double2((gauge), (idx) + (((dir)/2)*9+0)*long_ga_stride); \
  double2 LONG1 = fetch_double2((gauge), (idx) + (((dir)/2)*9+1)*long_ga_stride); \
  double2 LONG2 = fetch_double2((gauge), (idx) + (((dir)/2)*9+2)*long_ga_stride); \
  double2 LONG3 = fetch_double2((gauge), (idx) + (((dir)/2)*9+3)*long_ga_stride); \
  double2 LONG4 = fetch_double2((gauge), (idx) + (((dir)/2)*9+4)*long_ga_stride); \
  double2 LONG5 = fetch_double2((gauge), (idx) + (((dir)/2)*9+5)*long_ga_stride); \
  double2 LONG6 = fetch_double2((gauge), (idx) + (((dir)/2)*9+6)*long_ga_stride); \
  double2 LONG7 = fetch_double2((gauge), (idx) + (((dir)/2)*9+7)*long_ga_stride); \
  double2 LONG8 = fetch_double2((gauge), (idx) + (((dir)/2)*9+8)*long_ga_stride);


#else //longlink access

#define READ_LONG_MATRIX_12_SINGLE(gauge, dir, idx) \
  float4 LONG0 = (gauge)[(idx) + (((dir)/2)*3+0)*long_ga_stride]; \
  float4 LONG1 = (gauge)[(idx) + (((dir)/2)*3+1)*long_ga_stride]; \
  float4 LONG2 = (gauge)[(idx) + (((dir)/2)*3+2)*long_ga_stride]; \
  float4 LONG3 = make_float4(0,0,0,0); \
  float4 LONG4 = make_float4(0,0,0,0);
#define READ_LONG_MATRIX_8_SINGLE(gauge, dir, idx) \
  float4 LONG0 = (gauge)[(idx) + (((dir)/2)*2+0)*long_ga_stride]; \
  float4 LONG1 = (gauge)[(idx) + (((dir)/2)*2+1)*long_ga_stride]; \
  float4 LONG2 = make_float4(0,0,0,0); \
  float4 LONG3 = make_float4(0,0,0,0); \
  float4 LONG4 = make_float4(0,0,0,0); \
  long21_re = long00_re; \
  long21_im = long00_im;
#define READ_LONG_MATRIX_18_SINGLE(gauge, dir, idx) \
  float2 LONG0 = (gauge)[(idx) + (((dir)/2)*9+0)*long_ga_stride]; \
  float2 LONG1 = (gauge)[(idx) + (((dir)/2)*9+1)*long_ga_stride]; \
  float2 LONG2 = (gauge)[(idx) + (((dir)/2)*9+2)*long_ga_stride]; \
  float2 LONG3 = (gauge)[(idx) + (((dir)/2)*9+3)*long_ga_stride]; \
  float2 LONG4 = (gauge)[(idx) + (((dir)/2)*9+4)*long_ga_stride]; \
  float2 LONG5 = (gauge)[(idx) + (((dir)/2)*9+5)*long_ga_stride]; \
  float2 LONG6 = (gauge)[(idx) + (((dir)/2)*9+6)*long_ga_stride]; \
  float2 LONG7 = (gauge)[(idx) + (((dir)/2)*9+7)*long_ga_stride]; \
  float2 LONG8 = (gauge)[(idx) + (((dir)/2)*9+8)*long_ga_stride];

#define READ_LONG_MATRIX_12_DOUBLE(gauge, dir, idx) \
  double2 LONG0 = (gauge)[(idx) + (((dir)/2)*6+0)*long_ga_stride]; \
  double2 LONG1 = (gauge)[(idx) + (((dir)/2)*6+1)*long_ga_stride]; \
  double2 LONG2 = (gauge)[(idx) + (((dir)/2)*6+2)*long_ga_stride]; \
  double2 LONG3 = (gauge)[(idx) + (((dir)/2)*6+3)*long_ga_stride]; \
  double2 LONG4 = (gauge)[(idx) + (((dir)/2)*6+4)*long_ga_stride]; \
  double2 LONG5 = (gauge)[(idx) + (((dir)/2)*6+5)*long_ga_stride]; \
  double2 LONG6 = make_double2(0,0); \
  double2 LONG7 = make_double2(0,0); \
  double2 LONG8 = make_double2(0,0); \
  double2 LONG9 = make_double2(0,0);

#define READ_LONG_MATRIX_8_DOUBLE(gauge, dir, idx) \
  double2 LONG0 = (gauge)[(idx) + (((dir)/2)*4+0)*long_ga_stride]; \
  double2 LONG1 = (gauge)[(idx) + (((dir)/2)*4+1)*long_ga_stride]; \
  double2 LONG2 = (gauge)[(idx) + (((dir)/2)*4+2)*long_ga_stride]; \
  double2 LONG3 = (gauge)[(idx) + (((dir)/2)*4+3)*long_ga_stride]; \
  double2 LONG4 = make_double2(0,0); \
  double2 LONG5 = make_double2(0,0); \
  double2 LONG6 = make_double2(0,0); \
  double2 LONG7 = make_double2(0,0); \
  double2 LONG8 = make_double2(0,0); \
  double2 LONG9 = make_double2(0,0); \
  long21_re = long00_re; \
  long21_im = long00_im;

#define READ_LONG_MATRIX_18_DOUBLE(gauge, dir, idx) \
  double2 LONG0 = (gauge)[(idx) + (((dir)/2)*9+0)*long_ga_stride]; \
  double2 LONG1 = (gauge)[(idx) + (((dir)/2)*9+1)*long_ga_stride]; \
  double2 LONG2 = (gauge)[(idx) + (((dir)/2)*9+2)*long_ga_stride]; \
  double2 LONG3 = (gauge)[(idx) + (((dir)/2)*9+3)*long_ga_stride]; \
  double2 LONG4 = (gauge)[(idx) + (((dir)/2)*9+4)*long_ga_stride]; \
  double2 LONG5 = (gauge)[(idx) + (((dir)/2)*9+5)*long_ga_stride]; \
  double2 LONG6 = (gauge)[(idx) + (((dir)/2)*9+6)*long_ga_stride]; \
  double2 LONG7 = (gauge)[(idx) + (((dir)/2)*9+7)*long_ga_stride]; \
  double2 LONG8 = (gauge)[(idx) + (((dir)/2)*9+8)*long_ga_stride];


#endif //longlink access



// The HALF long-link loaders always go through tex1Dfetch, regardless of
// DIRECT_ACCESS_LONG_LINK.
//
// Unlike READ_GAUGE_MATRIX_8_HALF, the 8-parameter variant writes the
// pi_f-scaled value back into long00 as well as into long21.
#define READ_LONG_MATRIX_8_HALF(gauge, dir, idx) \
  float4 LONG0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*2+0)*long_ga_stride); \
  float4 LONG1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*2+1)*long_ga_stride); \
  float4 LONG2 = make_float4(0,0,0,0); \
  float4 LONG3 = make_float4(0,0,0,0); \
  float4 LONG4 = make_float4(0,0,0,0); \
  long00_re=long21_re = pi_f*long00_re; \
  long00_im=long21_im = pi_f*long00_im;


#define READ_LONG_MATRIX_12_HALF(gauge, dir, idx) \
  float4 LONG0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*3+0)*long_ga_stride); \
  float4 LONG1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*3+1)*long_ga_stride); \
  float4 LONG2 = tex1Dfetch((gauge), (idx) + (((dir)/2)*3+2)*long_ga_stride); \
  float4 LONG3 = make_float4(0,0,0,0); \
  float4 LONG4 = make_float4(0,0,0,0);



#define READ_LONG_MATRIX_18_HALF(gauge, dir, idx) \
  float2 LONG0 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+0)*long_ga_stride); \
  float2 LONG1 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+1)*long_ga_stride); \
  float2 LONG2 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+2)*long_ga_stride); \
  float2 LONG3 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+3)*long_ga_stride); \
  float2 LONG4 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+4)*long_ga_stride); \
  float2 LONG5 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+5)*long_ga_stride); \
  float2 LONG6 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+6)*long_ga_stride); \
  float2 LONG7 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+7)*long_ga_stride); \
  float2 LONG8 = tex1Dfetch((gauge), (idx) + (((dir)/2)*9+8)*long_ga_stride);
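// 1.7.3  (stray version string from the extracted listing footer; presumably the documentation generator's version -- confirm or remove)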