QUDA v0.4.0
A library for QCD on GPUs
|
/*
 * Common preprocessor macros shared by GPU kernels that work on 3x3 complex
 * (SU(3)) matrices: direction codes, per-element aliases for link/staple
 * registers, gauge-field load helpers, matrix algebra, anti-Hermitian
 * packing, and neighbour-index computation.
 *
 * Conventions used throughout:
 *   - A matrix is addressed through a name prefix; element (i,j) is the pair
 *     <prefix>ij_re / <prefix>ij_im.
 *   - All macros paste tokens onto the prefix, so the prefix must be a plain
 *     identifier, never an expression.
 *
 * The guard name (including its "COMMOM" spelling) is kept unchanged so that
 * any existing test of it keeps working.
 */
#ifndef __KERNEL_COMMOM_MACRO_H__
#define __KERNEL_COMMOM_MACRO_H__

/* ------------------------------------------------------------------------
 * Direction codes.  Forward directions are 0..3, backward directions 4..7,
 * arranged so that the opposite of direction d is 7-d.
 * ---------------------------------------------------------------------- */
#define XUP 0
#define YUP 1
#define ZUP 2
#define TUP 3
#define TDOWN 4
#define ZDOWN 5
#define YDOWN 6
#define XDOWN 7
#define OPP_DIR(dir) (7-(dir))
/* The argument is parenthesized so that expression arguments such as
 * `a & 7` or `c ? x : y` are compared as a whole. */
#define GOES_FORWARDS(dir) ((dir) <= 3)
#define GOES_BACKWARDS(dir) ((dir) > 3)

/* ------------------------------------------------------------------------
 * Adjoint (conjugate-transpose) aliases.
 * <name>Tij is element (i,j) of the adjoint: the real part of element (j,i)
 * and the negated imaginary part of element (j,i).
 * The plain linkaIJ_* / linkbIJ_* names are not defined in this header and
 * must be provided by the including code.
 * ---------------------------------------------------------------------- */
#define linkaT00_re (+linka00_re)
#define linkaT00_im (-linka00_im)
#define linkaT01_re (+linka10_re)
#define linkaT01_im (-linka10_im)
#define linkaT02_re (+linka20_re)
#define linkaT02_im (-linka20_im)
#define linkaT10_re (+linka01_re)
#define linkaT10_im (-linka01_im)
#define linkaT11_re (+linka11_re)
#define linkaT11_im (-linka11_im)
#define linkaT12_re (+linka21_re)
#define linkaT12_im (-linka21_im)
#define linkaT20_re (+linka02_re)
#define linkaT20_im (-linka02_im)
#define linkaT21_re (+linka12_re)
#define linkaT21_im (-linka12_im)
#define linkaT22_re (+linka22_re)
#define linkaT22_im (-linka22_im)

#define linkbT00_re (+linkb00_re)
#define linkbT00_im (-linkb00_im)
#define linkbT01_re (+linkb10_re)
#define linkbT01_im (-linkb10_im)
#define linkbT02_re (+linkb20_re)
#define linkbT02_im (-linkb20_im)
#define linkbT10_re (+linkb01_re)
#define linkbT10_im (-linkb01_im)
#define linkbT11_re (+linkb11_re)
#define linkbT11_im (-linkb11_im)
#define linkbT12_re (+linkb21_re)
#define linkbT12_im (-linkb21_im)
#define linkbT20_re (+linkb02_re)
#define linkbT20_im (-linkb02_im)
#define linkbT21_re (+linkb12_re)
#define linkbT21_im (-linkb12_im)
#define linkbT22_re (+linkb22_re)
#define linkbT22_im (-linkb22_im)

/* ------------------------------------------------------------------------
 * "linkc" element aliases: the 18 real numbers are taken from the
 * components of five four-component variables LINKC0..LINKC4 (only .x/.y of
 * LINKC4 are used).  LINKC0..LINKC4 must be in scope at the point of use.
 * ---------------------------------------------------------------------- */
#define linkc00_re LINKC0.x
#define linkc00_im LINKC0.y
#define linkc01_re LINKC0.z
#define linkc01_im LINKC0.w
#define linkc02_re LINKC1.x
#define linkc02_im LINKC1.y
#define linkc10_re LINKC1.z
#define linkc10_im LINKC1.w
#define linkc11_re LINKC2.x
#define linkc11_im LINKC2.y
#define linkc12_re LINKC2.z
#define linkc12_im LINKC2.w
#define linkc20_re LINKC3.x
#define linkc20_im LINKC3.y
#define linkc21_re LINKC3.z
#define linkc21_im LINKC3.w
#define linkc22_re LINKC4.x
#define linkc22_im LINKC4.y

#define linkcT00_re (+linkc00_re)
#define linkcT00_im (-linkc00_im)
#define linkcT01_re (+linkc10_re)
#define linkcT01_im (-linkc10_im)
#define linkcT02_re (+linkc20_re)
#define linkcT02_im (-linkc20_im)
#define linkcT10_re (+linkc01_re)
#define linkcT10_im (-linkc01_im)
#define linkcT11_re (+linkc11_re)
#define linkcT11_im (-linkc11_im)
#define linkcT12_re (+linkc21_re)
#define linkcT12_im (-linkc21_im)
#define linkcT20_re (+linkc02_re)
#define linkcT20_im (-linkc02_im)
#define linkcT21_re (+linkc12_re)
#define linkcT21_im (-linkc12_im)
#define linkcT22_re (+linkc22_re)
#define linkcT22_im (-linkc22_im)

/* ------------------------------------------------------------------------
 * "staple" element aliases: one two-component variable per matrix element,
 * STAPLE0..STAPLE8 in row-major order (.x = real part, .y = imaginary part).
 * ---------------------------------------------------------------------- */
#define staple00_re STAPLE0.x
#define staple00_im STAPLE0.y
#define staple01_re STAPLE1.x
#define staple01_im STAPLE1.y
#define staple02_re STAPLE2.x
#define staple02_im STAPLE2.y
#define staple10_re STAPLE3.x
#define staple10_im STAPLE3.y
#define staple11_re STAPLE4.x
#define staple11_im STAPLE4.y
#define staple12_re STAPLE5.x
#define staple12_im STAPLE5.y
#define staple20_re STAPLE6.x
#define staple20_im STAPLE6.y
#define staple21_re STAPLE7.x
#define staple21_im STAPLE7.y
#define staple22_re STAPLE8.x
#define staple22_im STAPLE8.y

#define stapleT00_re (+staple00_re)
#define stapleT00_im (-staple00_im)
#define stapleT01_re (+staple10_re)
#define stapleT01_im (-staple10_im)
#define stapleT02_re (+staple20_re)
#define stapleT02_im (-staple20_im)
#define stapleT10_re (+staple01_re)
#define stapleT10_im (-staple01_im)
#define stapleT11_re (+staple11_re)
#define stapleT11_im (-staple11_im)
#define stapleT12_re (+staple21_re)
#define stapleT12_im (-staple21_im)
#define stapleT20_re (+staple02_re)
#define stapleT20_im (-staple02_im)
#define stapleT21_re (+staple12_re)
#define stapleT21_im (-staple12_im)
#define stapleT22_re (+staple22_re)
#define stapleT22_im (-staple22_im)

/* ------------------------------------------------------------------------
 * Double-precision read helper.
 * With FERMI_DBLE_NO_TEX defined the element is read straight from the
 * array x; otherwise it is fetched through fetch_double2() (declared
 * elsewhere) from the texture x_tex.
 * ---------------------------------------------------------------------- */
#ifdef FERMI_DBLE_NO_TEX
#define READ_DOUBLE2_TEXTURE(x_tex, x, i) (x)[i]
#else
#define READ_DOUBLE2_TEXTURE(x_tex, x, i) fetch_double2(x_tex, i)
#endif

/* ------------------------------------------------------------------------
 * Gauge-field loaders.
 *
 * Each loader fills var0, var1, ... (var is a name prefix) with consecutive
 * elements of one matrix.  Element k of direction `dir` at site `idx` lives
 * at   idx + dir*stride*N + k*stride   where N is the number of elements
 * stored per matrix (3, 6 or 9 below).
 *
 * idx, dir and stride are parenthesized, so they may be arbitrary integer
 * expressions; each is evaluated several times, so they must be free of
 * side effects.
 * ---------------------------------------------------------------------- */

/* 3 elements per matrix, direct array read. */
#define LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride) do {            \
    var##0 = (gauge)[(idx) + (dir)*(stride)*3];                             \
    var##1 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)];                  \
    var##2 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)*2];                \
  } while (0)

/* 3 elements per matrix, read through tex1Dfetch(). */
#define LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride) do {        \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3);                   \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride));        \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride)*2);      \
  } while (0)

/* 6 elements per matrix, direct array read. */
#define LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride) do {            \
    var##0 = (gauge)[(idx) + (dir)*(stride)*6];                             \
    var##1 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)];                  \
    var##2 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*2];                \
    var##3 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*3];                \
    var##4 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*4];                \
    var##5 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*5];                \
  } while (0)

/* 6 elements per matrix, read through READ_DOUBLE2_TEXTURE(). */
#define LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do { \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6);  \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride));   \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*2); \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*3); \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*4); \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*5); \
  } while (0)

/* 9 elements per matrix, direct array read. */
#define LOAD_MATRIX_18(gauge, dir, idx, var, stride) do {                   \
    var##0 = (gauge)[(idx) + (dir)*(stride)*9];                             \
    var##1 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)];                  \
    var##2 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*2];                \
    var##3 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*3];                \
    var##4 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*4];                \
    var##5 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*5];                \
    var##6 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*6];                \
    var##7 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*7];                \
    var##8 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*8];                \
  } while (0)

/* 9 elements per matrix, read through tex1Dfetch(). */
#define LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride) do {        \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9);                   \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride));        \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*2);      \
    var##3 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*3);      \
    var##4 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*4);      \
    var##5 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*5);      \
    var##6 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*6);      \
    var##7 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*7);      \
    var##8 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*8);      \
  } while (0)

/* 9 elements per matrix, read through READ_DOUBLE2_TEXTURE(). */
#define LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do { \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9);  \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride));   \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*2); \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*3); \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*4); \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*5); \
    var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*6); \
    var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*7); \
    var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*8); \
  } while (0)

/* ------------------------------------------------------------------------
 * 3x3 complex matrix products.
 *
 *   MULT_SU3_NN : mc = ma       * mb
 *   MULT_SU3_NA : mc = ma       * adjoint(mb)   (uses the mb##Tij aliases)
 *   MULT_SU3_AN : mc = adjoint(ma) * mb         (uses the ma##Tij aliases)
 *
 * Caveats:
 *   - mc must be a different matrix from ma and mb: elements of mc are
 *     written before all elements of the operands have been read.
 *   - These expand to a sequence of statements that is NOT wrapped in
 *     do { } while (0) and already ends with ';'.  Always use them inside
 *     braces, never as the sole body of an unbraced if/else/for.
 * ---------------------------------------------------------------------- */
#define MULT_SU3_NN(ma, mb, mc)                                   \
  mc##00_re =                                                     \
    ma##00_re * mb##00_re - ma##00_im * mb##00_im +               \
    ma##01_re * mb##10_re - ma##01_im * mb##10_im +               \
    ma##02_re * mb##20_re - ma##02_im * mb##20_im;                \
  mc##00_im =                                                     \
    ma##00_re * mb##00_im + ma##00_im * mb##00_re +               \
    ma##01_re * mb##10_im + ma##01_im * mb##10_re +               \
    ma##02_re * mb##20_im + ma##02_im * mb##20_re;                \
  mc##10_re =                                                     \
    ma##10_re * mb##00_re - ma##10_im * mb##00_im +               \
    ma##11_re * mb##10_re - ma##11_im * mb##10_im +               \
    ma##12_re * mb##20_re - ma##12_im * mb##20_im;                \
  mc##10_im =                                                     \
    ma##10_re * mb##00_im + ma##10_im * mb##00_re +               \
    ma##11_re * mb##10_im + ma##11_im * mb##10_re +               \
    ma##12_re * mb##20_im + ma##12_im * mb##20_re;                \
  mc##20_re =                                                     \
    ma##20_re * mb##00_re - ma##20_im * mb##00_im +               \
    ma##21_re * mb##10_re - ma##21_im * mb##10_im +               \
    ma##22_re * mb##20_re - ma##22_im * mb##20_im;                \
  mc##20_im =                                                     \
    ma##20_re * mb##00_im + ma##20_im * mb##00_re +               \
    ma##21_re * mb##10_im + ma##21_im * mb##10_re +               \
    ma##22_re * mb##20_im + ma##22_im * mb##20_re;                \
  mc##01_re =                                                     \
    ma##00_re * mb##01_re - ma##00_im * mb##01_im +               \
    ma##01_re * mb##11_re - ma##01_im * mb##11_im +               \
    ma##02_re * mb##21_re - ma##02_im * mb##21_im;                \
  mc##01_im =                                                     \
    ma##00_re * mb##01_im + ma##00_im * mb##01_re +               \
    ma##01_re * mb##11_im + ma##01_im * mb##11_re +               \
    ma##02_re * mb##21_im + ma##02_im * mb##21_re;                \
  mc##11_re =                                                     \
    ma##10_re * mb##01_re - ma##10_im * mb##01_im +               \
    ma##11_re * mb##11_re - ma##11_im * mb##11_im +               \
    ma##12_re * mb##21_re - ma##12_im * mb##21_im;                \
  mc##11_im =                                                     \
    ma##10_re * mb##01_im + ma##10_im * mb##01_re +               \
    ma##11_re * mb##11_im + ma##11_im * mb##11_re +               \
    ma##12_re * mb##21_im + ma##12_im * mb##21_re;                \
  mc##21_re =                                                     \
    ma##20_re * mb##01_re - ma##20_im * mb##01_im +               \
    ma##21_re * mb##11_re - ma##21_im * mb##11_im +               \
    ma##22_re * mb##21_re - ma##22_im * mb##21_im;                \
  mc##21_im =                                                     \
    ma##20_re * mb##01_im + ma##20_im * mb##01_re +               \
    ma##21_re * mb##11_im + ma##21_im * mb##11_re +               \
    ma##22_re * mb##21_im + ma##22_im * mb##21_re;                \
  mc##02_re =                                                     \
    ma##00_re * mb##02_re - ma##00_im * mb##02_im +               \
    ma##01_re * mb##12_re - ma##01_im * mb##12_im +               \
    ma##02_re * mb##22_re - ma##02_im * mb##22_im;                \
  mc##02_im =                                                     \
    ma##00_re * mb##02_im + ma##00_im * mb##02_re +               \
    ma##01_re * mb##12_im + ma##01_im * mb##12_re +               \
    ma##02_re * mb##22_im + ma##02_im * mb##22_re;                \
  mc##12_re =                                                     \
    ma##10_re * mb##02_re - ma##10_im * mb##02_im +               \
    ma##11_re * mb##12_re - ma##11_im * mb##12_im +               \
    ma##12_re * mb##22_re - ma##12_im * mb##22_im;                \
  mc##12_im =                                                     \
    ma##10_re * mb##02_im + ma##10_im * mb##02_re +               \
    ma##11_re * mb##12_im + ma##11_im * mb##12_re +               \
    ma##12_re * mb##22_im + ma##12_im * mb##22_re;                \
  mc##22_re =                                                     \
    ma##20_re * mb##02_re - ma##20_im * mb##02_im +               \
    ma##21_re * mb##12_re - ma##21_im * mb##12_im +               \
    ma##22_re * mb##22_re - ma##22_im * mb##22_im;                \
  mc##22_im =                                                     \
    ma##20_re * mb##02_im + ma##20_im * mb##02_re +               \
    ma##21_re * mb##12_im + ma##21_im * mb##12_re +               \
    ma##22_re * mb##22_im + ma##22_im * mb##22_re;

#define MULT_SU3_NA(ma, mb, mc)                                   \
  mc##00_re =                                                     \
    ma##00_re * mb##T00_re - ma##00_im * mb##T00_im +             \
    ma##01_re * mb##T10_re - ma##01_im * mb##T10_im +             \
    ma##02_re * mb##T20_re - ma##02_im * mb##T20_im;              \
  mc##00_im =                                                     \
    ma##00_re * mb##T00_im + ma##00_im * mb##T00_re +             \
    ma##01_re * mb##T10_im + ma##01_im * mb##T10_re +             \
    ma##02_re * mb##T20_im + ma##02_im * mb##T20_re;              \
  mc##10_re =                                                     \
    ma##10_re * mb##T00_re - ma##10_im * mb##T00_im +             \
    ma##11_re * mb##T10_re - ma##11_im * mb##T10_im +             \
    ma##12_re * mb##T20_re - ma##12_im * mb##T20_im;              \
  mc##10_im =                                                     \
    ma##10_re * mb##T00_im + ma##10_im * mb##T00_re +             \
    ma##11_re * mb##T10_im + ma##11_im * mb##T10_re +             \
    ma##12_re * mb##T20_im + ma##12_im * mb##T20_re;              \
  mc##20_re =                                                     \
    ma##20_re * mb##T00_re - ma##20_im * mb##T00_im +             \
    ma##21_re * mb##T10_re - ma##21_im * mb##T10_im +             \
    ma##22_re * mb##T20_re - ma##22_im * mb##T20_im;              \
  mc##20_im =                                                     \
    ma##20_re * mb##T00_im + ma##20_im * mb##T00_re +             \
    ma##21_re * mb##T10_im + ma##21_im * mb##T10_re +             \
    ma##22_re * mb##T20_im + ma##22_im * mb##T20_re;              \
  mc##01_re =                                                     \
    ma##00_re * mb##T01_re - ma##00_im * mb##T01_im +             \
    ma##01_re * mb##T11_re - ma##01_im * mb##T11_im +             \
    ma##02_re * mb##T21_re - ma##02_im * mb##T21_im;              \
  mc##01_im =                                                     \
    ma##00_re * mb##T01_im + ma##00_im * mb##T01_re +             \
    ma##01_re * mb##T11_im + ma##01_im * mb##T11_re +             \
    ma##02_re * mb##T21_im + ma##02_im * mb##T21_re;              \
  mc##11_re =                                                     \
    ma##10_re * mb##T01_re - ma##10_im * mb##T01_im +             \
    ma##11_re * mb##T11_re - ma##11_im * mb##T11_im +             \
    ma##12_re * mb##T21_re - ma##12_im * mb##T21_im;              \
  mc##11_im =                                                     \
    ma##10_re * mb##T01_im + ma##10_im * mb##T01_re +             \
    ma##11_re * mb##T11_im + ma##11_im * mb##T11_re +             \
    ma##12_re * mb##T21_im + ma##12_im * mb##T21_re;              \
  mc##21_re =                                                     \
    ma##20_re * mb##T01_re - ma##20_im * mb##T01_im +             \
    ma##21_re * mb##T11_re - ma##21_im * mb##T11_im +             \
    ma##22_re * mb##T21_re - ma##22_im * mb##T21_im;              \
  mc##21_im =                                                     \
    ma##20_re * mb##T01_im + ma##20_im * mb##T01_re +             \
    ma##21_re * mb##T11_im + ma##21_im * mb##T11_re +             \
    ma##22_re * mb##T21_im + ma##22_im * mb##T21_re;              \
  mc##02_re =                                                     \
    ma##00_re * mb##T02_re - ma##00_im * mb##T02_im +             \
    ma##01_re * mb##T12_re - ma##01_im * mb##T12_im +             \
    ma##02_re * mb##T22_re - ma##02_im * mb##T22_im;              \
  mc##02_im =                                                     \
    ma##00_re * mb##T02_im + ma##00_im * mb##T02_re +             \
    ma##01_re * mb##T12_im + ma##01_im * mb##T12_re +             \
    ma##02_re * mb##T22_im + ma##02_im * mb##T22_re;              \
  mc##12_re =                                                     \
    ma##10_re * mb##T02_re - ma##10_im * mb##T02_im +             \
    ma##11_re * mb##T12_re - ma##11_im * mb##T12_im +             \
    ma##12_re * mb##T22_re - ma##12_im * mb##T22_im;              \
  mc##12_im =                                                     \
    ma##10_re * mb##T02_im + ma##10_im * mb##T02_re +             \
    ma##11_re * mb##T12_im + ma##11_im * mb##T12_re +             \
    ma##12_re * mb##T22_im + ma##12_im * mb##T22_re;              \
  mc##22_re =                                                     \
    ma##20_re * mb##T02_re - ma##20_im * mb##T02_im +             \
    ma##21_re * mb##T12_re - ma##21_im * mb##T12_im +             \
    ma##22_re * mb##T22_re - ma##22_im * mb##T22_im;              \
  mc##22_im =                                                     \
    ma##20_re * mb##T02_im + ma##20_im * mb##T02_re +             \
    ma##21_re * mb##T12_im + ma##21_im * mb##T12_re +             \
    ma##22_re * mb##T22_im + ma##22_im * mb##T22_re;

#define MULT_SU3_AN(ma, mb, mc)                                   \
  mc##00_re =                                                     \
    ma##T00_re * mb##00_re - ma##T00_im * mb##00_im +             \
    ma##T01_re * mb##10_re - ma##T01_im * mb##10_im +             \
    ma##T02_re * mb##20_re - ma##T02_im * mb##20_im;              \
  mc##00_im =                                                     \
    ma##T00_re * mb##00_im + ma##T00_im * mb##00_re +             \
    ma##T01_re * mb##10_im + ma##T01_im * mb##10_re +             \
    ma##T02_re * mb##20_im + ma##T02_im * mb##20_re;              \
  mc##10_re =                                                     \
    ma##T10_re * mb##00_re - ma##T10_im * mb##00_im +             \
    ma##T11_re * mb##10_re - ma##T11_im * mb##10_im +             \
    ma##T12_re * mb##20_re - ma##T12_im * mb##20_im;              \
  mc##10_im =                                                     \
    ma##T10_re * mb##00_im + ma##T10_im * mb##00_re +             \
    ma##T11_re * mb##10_im + ma##T11_im * mb##10_re +             \
    ma##T12_re * mb##20_im + ma##T12_im * mb##20_re;              \
  mc##20_re =                                                     \
    ma##T20_re * mb##00_re - ma##T20_im * mb##00_im +             \
    ma##T21_re * mb##10_re - ma##T21_im * mb##10_im +             \
    ma##T22_re * mb##20_re - ma##T22_im * mb##20_im;              \
  mc##20_im =                                                     \
    ma##T20_re * mb##00_im + ma##T20_im * mb##00_re +             \
    ma##T21_re * mb##10_im + ma##T21_im * mb##10_re +             \
    ma##T22_re * mb##20_im + ma##T22_im * mb##20_re;              \
  mc##01_re =                                                     \
    ma##T00_re * mb##01_re - ma##T00_im * mb##01_im +             \
    ma##T01_re * mb##11_re - ma##T01_im * mb##11_im +             \
    ma##T02_re * mb##21_re - ma##T02_im * mb##21_im;              \
  mc##01_im =                                                     \
    ma##T00_re * mb##01_im + ma##T00_im * mb##01_re +             \
    ma##T01_re * mb##11_im + ma##T01_im * mb##11_re +             \
    ma##T02_re * mb##21_im + ma##T02_im * mb##21_re;              \
  mc##11_re =                                                     \
    ma##T10_re * mb##01_re - ma##T10_im * mb##01_im +             \
    ma##T11_re * mb##11_re - ma##T11_im * mb##11_im +             \
    ma##T12_re * mb##21_re - ma##T12_im * mb##21_im;              \
  mc##11_im =                                                     \
    ma##T10_re * mb##01_im + ma##T10_im * mb##01_re +             \
    ma##T11_re * mb##11_im + ma##T11_im * mb##11_re +             \
    ma##T12_re * mb##21_im + ma##T12_im * mb##21_re;              \
  mc##21_re =                                                     \
    ma##T20_re * mb##01_re - ma##T20_im * mb##01_im +             \
    ma##T21_re * mb##11_re - ma##T21_im * mb##11_im +             \
    ma##T22_re * mb##21_re - ma##T22_im * mb##21_im;              \
  mc##21_im =                                                     \
    ma##T20_re * mb##01_im + ma##T20_im * mb##01_re +             \
    ma##T21_re * mb##11_im + ma##T21_im * mb##11_re +             \
    ma##T22_re * mb##21_im + ma##T22_im * mb##21_re;              \
  mc##02_re =                                                     \
    ma##T00_re * mb##02_re - ma##T00_im * mb##02_im +             \
    ma##T01_re * mb##12_re - ma##T01_im * mb##12_im +             \
    ma##T02_re * mb##22_re - ma##T02_im * mb##22_im;              \
  mc##02_im =                                                     \
    ma##T00_re * mb##02_im + ma##T00_im * mb##02_re +             \
    ma##T01_re * mb##12_im + ma##T01_im * mb##12_re +             \
    ma##T02_re * mb##22_im + ma##T02_im * mb##22_re;              \
  mc##12_re =                                                     \
    ma##T10_re * mb##02_re - ma##T10_im * mb##02_im +             \
    ma##T11_re * mb##12_re - ma##T11_im * mb##12_im +             \
    ma##T12_re * mb##22_re - ma##T12_im * mb##22_im;              \
  mc##12_im =                                                     \
    ma##T10_re * mb##02_im + ma##T10_im * mb##02_re +             \
    ma##T11_re * mb##12_im + ma##T11_im * mb##12_re +             \
    ma##T12_re * mb##22_im + ma##T12_im * mb##22_re;              \
  mc##22_re =                                                     \
    ma##T20_re * mb##02_re - ma##T20_im * mb##02_im +             \
    ma##T21_re * mb##12_re - ma##T21_im * mb##12_im +             \
    ma##T22_re * mb##22_re - ma##T22_im * mb##22_im;              \
  mc##22_im =                                                     \
    ma##T20_re * mb##02_im + ma##T20_im * mb##02_re +             \
    ma##T21_re * mb##12_im + ma##T21_im * mb##12_re +             \
    ma##T22_re * mb##22_im + ma##T22_im * mb##22_re;

/* ------------------------------------------------------------------------
 * Element-wise helpers.  Like the products above these are bare statement
 * sequences that already end in ';' - use them inside braces.
 * ---------------------------------------------------------------------- */

/* Sets every real and imaginary component of matrix a to `value`
 * (`value` is evaluated 18 times). */
#define SET_SU3_MATRIX(a, value)                                  \
  a##00_re = value;                                               \
  a##00_im = value;                                               \
  a##01_re = value;                                               \
  a##01_im = value;                                               \
  a##02_re = value;                                               \
  a##02_im = value;                                               \
  a##10_re = value;                                               \
  a##10_im = value;                                               \
  a##11_re = value;                                               \
  a##11_im = value;                                               \
  a##12_re = value;                                               \
  a##12_im = value;                                               \
  a##20_re = value;                                               \
  a##20_im = value;                                               \
  a##21_re = value;                                               \
  a##21_im = value;                                               \
  a##22_re = value;                                               \
  a##22_im = value;

/* mc = ma + mb * s, component by component.  s is parenthesized so that an
 * expression such as `a + b` scales mb as a whole. */
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc)                 \
  mc##00_re = ma##00_re + mb##00_re * (s);                        \
  mc##00_im = ma##00_im + mb##00_im * (s);                        \
  mc##01_re = ma##01_re + mb##01_re * (s);                        \
  mc##01_im = ma##01_im + mb##01_im * (s);                        \
  mc##02_re = ma##02_re + mb##02_re * (s);                        \
  mc##02_im = ma##02_im + mb##02_im * (s);                        \
  mc##10_re = ma##10_re + mb##10_re * (s);                        \
  mc##10_im = ma##10_im + mb##10_im * (s);                        \
  mc##11_re = ma##11_re + mb##11_re * (s);                        \
  mc##11_im = ma##11_im + mb##11_im * (s);                        \
  mc##12_re = ma##12_re + mb##12_re * (s);                        \
  mc##12_im = ma##12_im + mb##12_im * (s);                        \
  mc##20_re = ma##20_re + mb##20_re * (s);                        \
  mc##20_im = ma##20_im + mb##20_im * (s);                        \
  mc##21_re = ma##21_re + mb##21_re * (s);                        \
  mc##21_im = ma##21_im + mb##21_im * (s);                        \
  mc##22_re = ma##22_re + mb##22_re * (s);                        \
  mc##22_im = ma##22_im + mb##22_im * (s);

/* mc = ma - mb * s, component by component. */
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc)                 \
  mc##00_re = ma##00_re - mb##00_re * (s);                        \
  mc##00_im = ma##00_im - mb##00_im * (s);                        \
  mc##01_re = ma##01_re - mb##01_re * (s);                        \
  mc##01_im = ma##01_im - mb##01_im * (s);                        \
  mc##02_re = ma##02_re - mb##02_re * (s);                        \
  mc##02_im = ma##02_im - mb##02_im * (s);                        \
  mc##10_re = ma##10_re - mb##10_re * (s);                        \
  mc##10_im = ma##10_im - mb##10_im * (s);                        \
  mc##11_re = ma##11_re - mb##11_re * (s);                        \
  mc##11_im = ma##11_im - mb##11_im * (s);                        \
  mc##12_re = ma##12_re - mb##12_re * (s);                        \
  mc##12_im = ma##12_im - mb##12_im * (s);                        \
  mc##20_re = ma##20_re - mb##20_re * (s);                        \
  mc##20_im = ma##20_im - mb##20_im * (s);                        \
  mc##21_re = ma##21_re - mb##21_re * (s);                        \
  mc##21_im = ma##21_im - mb##21_im * (s);                        \
  mc##22_re = ma##22_re - mb##22_re * (s);                        \
  mc##22_im = ma##22_im - mb##22_im * (s);

/* ------------------------------------------------------------------------
 * Compressed anti-Hermitian storage.
 *
 * Ten numbers spread over five two-component variables AH0..AH4:
 * the three upper-triangle complex entries (01, 02, 12), the three
 * imaginary diagonal entries, and one padding slot (ahspace).
 * ---------------------------------------------------------------------- */
#define ah01_re AH0.x
#define ah01_im AH0.y
#define ah02_re AH1.x
#define ah02_im AH1.y
#define ah12_re AH2.x
#define ah12_im AH2.y
#define ah00_im AH3.x
#define ah11_im AH3.y
#define ah22_im AH4.x
#define ahspace AH4.y

/* Expands compressed storage `ah` into the full matrix m: diagonal real
 * parts are zero, and each lower-triangle entry is minus the complex
 * conjugate of its upper-triangle partner. */
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m)                          \
  m##00_re = 0;                                                   \
  m##00_im = ah##00_im;                                           \
  m##11_re = 0;                                                   \
  m##11_im = ah##11_im;                                           \
  m##22_re = 0;                                                   \
  m##22_im = ah##22_im;                                           \
  m##01_re = ah##01_re;                                           \
  m##01_im = ah##01_im;                                           \
  m##10_re = -ah##01_re;                                          \
  m##10_im = ah##01_im;                                           \
  m##02_re = ah##02_re;                                           \
  m##02_im = ah##02_im;                                           \
  m##20_re = -ah##02_re;                                          \
  m##20_im = ah##02_im;                                           \
  m##12_re = ah##12_re;                                           \
  m##12_im = ah##12_im;                                           \
  m##21_re = -ah##12_re;                                          \
  m##21_im = ah##12_im;

/* Projects the full matrix m onto compressed anti-Hermitian storage `ah`:
 *   - diagonal: imaginary parts minus their mean (so they sum to zero);
 *   - off-diagonal: ah_ij = (m_ij - conj(m_ji)) / 2 for ij in {01, 02, 12};
 *   - the padding slot is cleared.
 * __typeof__ is used instead of typeof so the macro also compiles in strict
 * ISO modes of GNU-compatible compilers. */
#define MAKE_ANTI_HERMITIAN(m, ah) do {                                 \
    __typeof__(ah##space) temp;                                         \
    temp = (m##00_im + m##11_im + m##22_im)*0.33333333333333333;        \
    ah##00_im = (m##00_im - temp);                                      \
    ah##11_im = (m##11_im - temp);                                      \
    ah##22_im = (m##22_im - temp);                                      \
    ah##01_re = (m##01_re - m##10_re)*0.5;                              \
    ah##02_re = (m##02_re - m##20_re)*0.5;                              \
    ah##12_re = (m##12_re - m##21_re)*0.5;                              \
    ah##01_im = (m##01_im + m##10_im)*0.5;                              \
    ah##02_im = (m##02_im + m##20_im)*0.5;                              \
    ah##12_im = (m##12_im + m##21_im)*0.5;                              \
    ah##space = 0;                                                      \
  } while (0)

/* Loads the five packed elements var0..var4 for direction `dir` at site
 * `idx` from the array src; element k is at idx + dir*stride*5 + k*stride. */
#define LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride) do {     \
    int start_pos = (idx) + (dir)*(stride)*5;                           \
    var##0 = (src)[start_pos];                                          \
    var##1 = (src)[start_pos + (stride)];                               \
    var##2 = (src)[start_pos + (stride)*2];                             \
    var##3 = (src)[start_pos + (stride)*3];                             \
    var##4 = (src)[start_pos + (stride)*4];                             \
  } while (0)

/* Same layout as LOAD_ANTI_HERMITIAN_DIRECT but read through tex1Dfetch(),
 * with the stride taken from a variable named Vh in the enclosing scope. */
#define LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var) do {         \
    int start_pos = (idx) + (dir)*Vh*5;                                 \
    var##0 = tex1Dfetch(src, start_pos);                                \
    var##1 = tex1Dfetch(src, start_pos + Vh);                           \
    var##2 = tex1Dfetch(src, start_pos + Vh*2);                         \
    var##3 = tex1Dfetch(src, start_pos + Vh*3);                         \
    var##4 = tex1Dfetch(src, start_pos + Vh*4);                         \
  } while (0)

/* Stores var0..var4 into mem using the same layout as the loaders above. */
#define WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride) do {           \
    int start_ps = (idx) + (dir)*(stride)*5;                            \
    (mem)[start_ps] = var##0;                                           \
    (mem)[start_ps + (stride)] = var##1;                                \
    (mem)[start_ps + (stride)*2] = var##2;                              \
    (mem)[start_ps + (stride)*3] = var##3;                              \
    (mem)[start_ps + (stride)*4] = var##4;                              \
  } while (0)

/* b = a, component by component. */
#define COPY_SU3_MATRIX(a, b)                                     \
  b##00_re = a##00_re;                                            \
  b##00_im = a##00_im;                                            \
  b##01_re = a##01_re;                                            \
  b##01_im = a##01_im;                                            \
  b##02_re = a##02_re;                                            \
  b##02_im = a##02_im;                                            \
  b##10_re = a##10_re;                                            \
  b##10_im = a##10_im;                                            \
  b##11_re = a##11_re;                                            \
  b##11_im = a##11_im;                                            \
  b##12_re = a##12_re;                                            \
  b##12_im = a##12_im;                                            \
  b##20_re = a##20_re;                                            \
  b##20_im = a##20_im;                                            \
  b##21_re = a##21_re;                                            \
  b##21_im = a##21_im;                                            \
  b##22_re = a##22_re;                                            \
  b##22_im = a##22_im;

/* b = adjoint(a) (conjugate transpose).  b must not be the same matrix as
 * a: off-diagonal elements of b are written before their transposed
 * partners in a have been read. */
#define SU3_ADJOINT(a, b)                                         \
  b##00_re = a##00_re;                                            \
  b##00_im = - a##00_im;                                          \
  b##01_re = a##10_re;                                            \
  b##01_im = - a##10_im;                                          \
  b##02_re = a##20_re;                                            \
  b##02_im = - a##20_im;                                          \
  b##10_re = a##01_re;                                            \
  b##10_im = - a##01_im;                                          \
  b##11_re = a##11_re;                                            \
  b##11_im = - a##11_im;                                          \
  b##12_re = a##21_re;                                            \
  b##12_im = - a##21_im;                                          \
  b##20_re = a##02_re;                                            \
  b##20_im = - a##02_im;                                          \
  b##21_re = a##12_re;                                            \
  b##21_im = - a##12_im;                                          \
  b##22_re = a##22_re;                                            \
  b##22_im = - a##22_im;

/* a = identity matrix. */
#define SET_UNIT_SU3_MATRIX(a)                                    \
  a##00_re = 1.0;                                                 \
  a##00_im = 0;                                                   \
  a##01_re = 0;                                                   \
  a##01_im = 0;                                                   \
  a##02_re = 0;                                                   \
  a##02_im = 0;                                                   \
  a##10_re = 0;                                                   \
  a##10_im = 0;                                                   \
  a##11_re = 1.0;                                                 \
  a##11_im = 0;                                                   \
  a##12_re = 0;                                                   \
  a##12_im = 0;                                                   \
  a##20_re = 0;                                                   \
  a##20_im = 0;                                                   \
  a##21_re = 0;                                                   \
  a##21_im = 0;                                                   \
  a##22_re = 1.0;                                                 \
  a##22_im = 0;

/* Complex conjugated product with assignment: a = conj(b) * conj(c).
 * (Assigns rather than accumulates.)  The expansion has no trailing ';' -
 * the caller supplies it. */
#define ACC_CONJ_PROD_ASSIGN(a, b, c)                             \
  a##_re = b##_re * c##_re;                                       \
  a##_re -= b##_im * c##_im;                                      \
  a##_im = - b##_re * c##_im;                                     \
  a##_im -= b##_im * c##_re

/* Rebuilds the third row (elements 20, 21, 22) of matrix `var` from its
 * first two rows and then multiplies that row by `sign`.
 * Requires ACC_CONJ_PROD, which is not defined in this header and must be
 * supplied by the including code (presumably the accumulating counterpart
 * of ACC_CONJ_PROD_ASSIGN - confirm against its definition). */
#define RECONSTRUCT_LINK_12(sign, var)                            \
  ACC_CONJ_PROD_ASSIGN(var##20, +var##01, +var##12);              \
  ACC_CONJ_PROD(var##20, -var##02, +var##11);                     \
  ACC_CONJ_PROD_ASSIGN(var##21, +var##02, +var##10);              \
  ACC_CONJ_PROD(var##21, -var##00, +var##12);                     \
  ACC_CONJ_PROD_ASSIGN(var##22, +var##00, +var##11);              \
  ACC_CONJ_PROD(var##22, -var##01, +var##10);                     \
  var##20_re *= (sign); var##20_im *= (sign);                     \
  var##21_re *= (sign); var##21_im *= (sign);                     \
  var##22_re *= (sign); var##22_im *= (sign);

/* ------------------------------------------------------------------------
 * Neighbour-index computation with periodic wrap-around.
 *
 * All four macros write the result to a variable named new_mem_idx and read
 * the site coordinates x1..x4 plus the precomputed extents X1m1, X2m1, ...,
 * X1, X2X1, X3X2X1, X2X1mX1, X3X2X1mX2X1, X4X3X2X1mX3X2X1 from the
 * enclosing scope.  mydir selects the dimension (0..3); other values leave
 * new_mem_idx untouched.  idx is the linear index of the current site.
 *
 * The MINUS variants step from `idx` in both branches.  Earlier revisions
 * used an outer variable `X` in the non-wrapping branch, which was only
 * correct when the caller happened to pass X as idx.
 * ---------------------------------------------------------------------- */

/* Forward neighbour; the resulting full index is halved (>> 1). */
#define COMPUTE_NEW_IDX_PLUS(mydir, idx) do {                                     \
    switch (mydir) {                                                              \
    case 0:                                                                       \
      new_mem_idx = ((x1==X1m1) ? (idx)-X1m1 : (idx)+1) >> 1;                     \
      break;                                                                      \
    case 1:                                                                       \
      new_mem_idx = ((x2==X2m1) ? (idx)-X2X1mX1 : (idx)+X1) >> 1;                 \
      break;                                                                      \
    case 2:                                                                       \
      new_mem_idx = ((x3==X3m1) ? (idx)-X3X2X1mX2X1 : (idx)+X2X1) >> 1;           \
      break;                                                                      \
    case 3:                                                                       \
      new_mem_idx = ((x4==X4m1) ? (idx)-X4X3X2X1mX3X2X1 : (idx)+X3X2X1) >> 1;     \
      break;                                                                      \
    default:                                                                      \
      break;                                                                      \
    }                                                                             \
  } while (0)

/* Backward neighbour.
 * NOTE(review): unlike COMPUTE_NEW_IDX_PLUS this does not halve the result;
 * that asymmetry is preserved from the original - confirm it is intended. */
#define COMPUTE_NEW_IDX_MINUS(mydir, idx) do {                                    \
    switch (mydir) {                                                              \
    case 0:                                                                       \
      new_mem_idx = ((x1==0) ? (idx)+X1m1 : (idx)-1);                             \
      break;                                                                      \
    case 1:                                                                       \
      new_mem_idx = ((x2==0) ? (idx)+X2X1mX1 : (idx)-X1);                         \
      break;                                                                      \
    case 2:                                                                       \
      new_mem_idx = ((x3==0) ? (idx)+X3X2X1mX2X1 : (idx)-X2X1);                   \
      break;                                                                      \
    case 3:                                                                       \
      new_mem_idx = ((x4==0) ? (idx)+X4X3X2X1mX3X2X1 : (idx)-X3X2X1);             \
      break;                                                                      \
    default:                                                                      \
      break;                                                                      \
    }                                                                             \
  } while (0)

/* Forward neighbour, full (un-halved) index. */
#define COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx) do {                                \
    switch (mydir) {                                                              \
    case 0:                                                                       \
      new_mem_idx = ((x1==X1m1) ? (idx)-X1m1 : (idx)+1);                          \
      break;                                                                      \
    case 1:                                                                       \
      new_mem_idx = ((x2==X2m1) ? (idx)-X2X1mX1 : (idx)+X1);                      \
      break;                                                                      \
    case 2:                                                                       \
      new_mem_idx = ((x3==X3m1) ? (idx)-X3X2X1mX2X1 : (idx)+X2X1);                \
      break;                                                                      \
    case 3:                                                                       \
      new_mem_idx = ((x4==X4m1) ? (idx)-X4X3X2X1mX3X2X1 : (idx)+X3X2X1);          \
      break;                                                                      \
    default:                                                                      \
      break;                                                                      \
    }                                                                             \
  } while (0)

/* Backward neighbour, full (un-halved) index. */
#define COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx) do {                               \
    switch (mydir) {                                                              \
    case 0:                                                                       \
      new_mem_idx = ((x1==0) ? (idx)+X1m1 : (idx)-1);                             \
      break;                                                                      \
    case 1:                                                                       \
      new_mem_idx = ((x2==0) ? (idx)+X2X1mX1 : (idx)-X1);                         \
      break;                                                                      \
    case 2:                                                                       \
      new_mem_idx = ((x3==0) ? (idx)+X3X2X1mX2X1 : (idx)-X2X1);                   \
      break;                                                                      \
    case 3:                                                                       \
      new_mem_idx = ((x4==0) ? (idx)+X4X3X2X1mX3X2X1 : (idx)-X3X2X1);             \
      break;                                                                      \
    default:                                                                      \
      break;                                                                      \
    }                                                                             \
  } while (0)

#endif /* __KERNEL_COMMOM_MACRO_H__ */