/*
 * QUDA v0.3.2
 * A library for QCD on GPUs
 */
#ifndef __KERNEL_COMMOM_MACRO_H__
#define __KERNEL_COMMOM_MACRO_H__

/*
 * Preprocessor helpers for element-wise work on 3x3 complex matrices.
 *
 * A matrix is addressed through a name prefix (linka, linkb, linkc, staple,
 * ...): element (r,c) is spelled <prefix><r><c>_re / <prefix><r><c>_im, and
 * the element of the conjugate transpose is <prefix>T<r><c>_re / _im.  The
 * element names defined here resolve to .x/.y/.z/.w members of variables
 * (LINKA0.., STAPLE0.., AH0..) that the including code must declare.
 *
 * The macros also rely on names supplied by the including code: Vh, Vhx2,
 * Vhx3, Vhx4, Vhx5, x1..x4, X1, X2X1, X3X2X1, X1m1..X4m1, X2X1mX1,
 * X3X2X1mX2X1, X4X3X2X1mX3X2X1, new_mem_idx, tex1Dfetch, float2 and
 * ACC_CONJ_PROD.
 *
 * The include-guard spelling ("COMMOM") is kept unchanged so that any
 * existing test of the guard name keeps working.
 */

/* Direction codes; a direction and its opposite always sum to 7. */
#define XUP 0
#define YUP 1
#define ZUP 2
#define TUP 3
#define TDOWN 4
#define ZDOWN 5
#define YDOWN 6
#define XDOWN 7
#define OPP_DIR(dir) (7-(dir))
/* The argument is parenthesized so expression arguments keep their meaning. */
#define GOES_FORWARDS(dir) ((dir) <= 3)
#define GOES_BACKWARDS(dir) ((dir) > 3)

/* Matrix "linka": 9 complex elements packed into LINKA0..LINKA4. */
#define linka00_re LINKA0.x
#define linka00_im LINKA0.y
#define linka01_re LINKA0.z
#define linka01_im LINKA0.w
#define linka02_re LINKA1.x
#define linka02_im LINKA1.y
#define linka10_re LINKA1.z
#define linka10_im LINKA1.w
#define linka11_re LINKA2.x
#define linka11_im LINKA2.y
#define linka12_re LINKA2.z
#define linka12_im LINKA2.w
#define linka20_re LINKA3.x
#define linka20_im LINKA3.y
#define linka21_re LINKA3.z
#define linka21_im LINKA3.w
#define linka22_re LINKA4.x
#define linka22_im LINKA4.y

/* Conjugate transpose of "linka": indices swapped, imaginary part negated. */
#define linkaT00_re (+linka00_re)
#define linkaT00_im (-linka00_im)
#define linkaT01_re (+linka10_re)
#define linkaT01_im (-linka10_im)
#define linkaT02_re (+linka20_re)
#define linkaT02_im (-linka20_im)
#define linkaT10_re (+linka01_re)
#define linkaT10_im (-linka01_im)
#define linkaT11_re (+linka11_re)
#define linkaT11_im (-linka11_im)
#define linkaT12_re (+linka21_re)
#define linkaT12_im (-linka21_im)
#define linkaT20_re (+linka02_re)
#define linkaT20_im (-linka02_im)
#define linkaT21_re (+linka12_re)
#define linkaT21_im (-linka12_im)
#define linkaT22_re (+linka22_re)
#define linkaT22_im (-linka22_im)

/* Matrix "linkb": same packing as "linka", stored in LINKB0..LINKB4. */
#define linkb00_re LINKB0.x
#define linkb00_im LINKB0.y
#define linkb01_re LINKB0.z
#define linkb01_im LINKB0.w
#define linkb02_re LINKB1.x
#define linkb02_im LINKB1.y
#define linkb10_re LINKB1.z
#define linkb10_im LINKB1.w
#define linkb11_re LINKB2.x
#define linkb11_im LINKB2.y
#define linkb12_re LINKB2.z
#define linkb12_im LINKB2.w
#define linkb20_re LINKB3.x
#define linkb20_im LINKB3.y
#define linkb21_re LINKB3.z
#define linkb21_im LINKB3.w
#define linkb22_re LINKB4.x
#define linkb22_im LINKB4.y

/* Conjugate transpose of "linkb". */
#define linkbT00_re (+linkb00_re)
#define linkbT00_im (-linkb00_im)
#define linkbT01_re (+linkb10_re)
#define linkbT01_im (-linkb10_im)
#define linkbT02_re (+linkb20_re)
#define linkbT02_im (-linkb20_im)
#define linkbT10_re (+linkb01_re)
#define linkbT10_im (-linkb01_im)
#define linkbT11_re (+linkb11_re)
#define linkbT11_im (-linkb11_im)
#define linkbT12_re (+linkb21_re)
#define linkbT12_im (-linkb21_im)
#define linkbT20_re (+linkb02_re)
#define linkbT20_im (-linkb02_im)
#define linkbT21_re (+linkb12_re)
#define linkbT21_im (-linkb12_im)
#define linkbT22_re (+linkb22_re)
#define linkbT22_im (-linkb22_im)

/* Matrix "linkc": same packing as "linka", stored in LINKC0..LINKC4. */
#define linkc00_re LINKC0.x
#define linkc00_im LINKC0.y
#define linkc01_re LINKC0.z
#define linkc01_im LINKC0.w
#define linkc02_re LINKC1.x
#define linkc02_im LINKC1.y
#define linkc10_re LINKC1.z
#define linkc10_im LINKC1.w
#define linkc11_re LINKC2.x
#define linkc11_im LINKC2.y
#define linkc12_re LINKC2.z
#define linkc12_im LINKC2.w
#define linkc20_re LINKC3.x
#define linkc20_im LINKC3.y
#define linkc21_re LINKC3.z
#define linkc21_im LINKC3.w
#define linkc22_re LINKC4.x
#define linkc22_im LINKC4.y

/* Conjugate transpose of "linkc". */
#define linkcT00_re (+linkc00_re)
#define linkcT00_im (-linkc00_im)
#define linkcT01_re (+linkc10_re)
#define linkcT01_im (-linkc10_im)
#define linkcT02_re (+linkc20_re)
#define linkcT02_im (-linkc20_im)
#define linkcT10_re (+linkc01_re)
#define linkcT10_im (-linkc01_im)
#define linkcT11_re (+linkc11_re)
#define linkcT11_im (-linkc11_im)
#define linkcT12_re (+linkc21_re)
#define linkcT12_im (-linkc21_im)
#define linkcT20_re (+linkc02_re)
#define linkcT20_im (-linkc02_im)
#define linkcT21_re (+linkc12_re)
#define linkcT21_im (-linkc12_im)
#define linkcT22_re (+linkc22_re)
#define linkcT22_im (-linkc22_im)

/* Matrix "staple": one complex element per variable, STAPLE0..STAPLE8. */
#define staple00_re STAPLE0.x
#define staple00_im STAPLE0.y
#define staple01_re STAPLE1.x
#define staple01_im STAPLE1.y
#define staple02_re STAPLE2.x
#define staple02_im STAPLE2.y
#define staple10_re STAPLE3.x
#define staple10_im STAPLE3.y
#define staple11_re STAPLE4.x
#define staple11_im STAPLE4.y
#define staple12_re STAPLE5.x
#define staple12_im STAPLE5.y
#define staple20_re STAPLE6.x
#define staple20_im STAPLE6.y
#define staple21_re STAPLE7.x
#define staple21_im STAPLE7.y
#define staple22_re STAPLE8.x
#define staple22_im STAPLE8.y

/* Conjugate transpose of "staple". */
#define stapleT00_re (+staple00_re)
#define stapleT00_im (-staple00_im)
#define stapleT01_re (+staple10_re)
#define stapleT01_im (-staple10_im)
#define stapleT02_re (+staple20_re)
#define stapleT02_im (-staple20_im)
#define stapleT10_re (+staple01_re)
#define stapleT10_im (-staple01_im)
#define stapleT11_re (+staple11_re)
#define stapleT11_im (-staple11_im)
#define stapleT12_re (+staple21_re)
#define stapleT12_im (-staple21_im)
#define stapleT20_re (+staple02_re)
#define stapleT20_im (-staple02_im)
#define stapleT21_re (+staple12_re)
#define stapleT21_im (-staple12_im)
#define stapleT22_re (+staple22_re)
#define stapleT22_im (-staple22_im)

/*
 * Load var##0..var##2 from gauge[] for direction `dir` at site `idx`.
 * The three reads sit at offsets 0, Vh and Vhx2 from idx + dir*Vhx3
 * (presumably Vhx2 == 2*Vh and Vhx3 == 3*Vh -- confirm in the including code).
 */
#define LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var) do { \
    var##0 = gauge[(idx) + (dir)*Vhx3]; \
    var##1 = gauge[(idx) + (dir)*Vhx3 + Vh]; \
    var##2 = gauge[(idx) + (dir)*Vhx3 + Vhx2]; \
  } while(0)

/* Same as LOAD_MATRIX_12_SINGLE, but reads through tex1Dfetch(). */
#define LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var) do { \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*Vhx3); \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*Vhx3 + Vh); \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*Vhx3 + Vhx2); \
  } while(0)

/*
 * Real / imaginary part of the complex product (row r of ma) . (column c of
 * mb), i.e. sum over k of ma[r][k] * mb[k][c].  r and c must be the literal
 * digits 0, 1 or 2.  The terms are summed in k = 0, 1, 2 order.
 * Because ma and mb are forwarded as ordinary macro arguments, a matrix
 * prefix must not itself be the name of an object-like macro.
 */
#define SU3_ROW_COL_RE(ma, r, mb, c) \
  (ma##r##0_re * mb##0##c##_re - ma##r##0_im * mb##0##c##_im + \
   ma##r##1_re * mb##1##c##_re - ma##r##1_im * mb##1##c##_im + \
   ma##r##2_re * mb##2##c##_re - ma##r##2_im * mb##2##c##_im)

#define SU3_ROW_COL_IM(ma, r, mb, c) \
  (ma##r##0_re * mb##0##c##_im + ma##r##0_im * mb##0##c##_re + \
   ma##r##1_re * mb##1##c##_im + ma##r##1_im * mb##1##c##_re + \
   ma##r##2_re * mb##2##c##_im + ma##r##2_im * mb##2##c##_re)

/*
 * mc = ma * mb.
 * Expands to 18 plain assignment statements (not wrapped in do/while), so it
 * must not be used as the unbraced body of an if/else/loop.  mc must be a
 * different matrix from ma and mb: elements of mc are overwritten while ma
 * and mb are still being read.
 */
#define MULT_SU3_NN(ma, mb, mc) \
  mc##00_re = SU3_ROW_COL_RE(ma, 0, mb, 0); \
  mc##00_im = SU3_ROW_COL_IM(ma, 0, mb, 0); \
  mc##10_re = SU3_ROW_COL_RE(ma, 1, mb, 0); \
  mc##10_im = SU3_ROW_COL_IM(ma, 1, mb, 0); \
  mc##20_re = SU3_ROW_COL_RE(ma, 2, mb, 0); \
  mc##20_im = SU3_ROW_COL_IM(ma, 2, mb, 0); \
  mc##01_re = SU3_ROW_COL_RE(ma, 0, mb, 1); \
  mc##01_im = SU3_ROW_COL_IM(ma, 0, mb, 1); \
  mc##11_re = SU3_ROW_COL_RE(ma, 1, mb, 1); \
  mc##11_im = SU3_ROW_COL_IM(ma, 1, mb, 1); \
  mc##21_re = SU3_ROW_COL_RE(ma, 2, mb, 1); \
  mc##21_im = SU3_ROW_COL_IM(ma, 2, mb, 1); \
  mc##02_re = SU3_ROW_COL_RE(ma, 0, mb, 2); \
  mc##02_im = SU3_ROW_COL_IM(ma, 0, mb, 2); \
  mc##12_re = SU3_ROW_COL_RE(ma, 1, mb, 2); \
  mc##12_im = SU3_ROW_COL_IM(ma, 1, mb, 2); \
  mc##22_re = SU3_ROW_COL_RE(ma, 2, mb, 2); \
  mc##22_im = SU3_ROW_COL_IM(ma, 2, mb, 2);

/*
 * mc = ma * adjoint(mb), using the <mb>T.. element names.
 * Same usage restrictions as MULT_SU3_NN.
 */
#define MULT_SU3_NA(ma, mb, mc) \
  mc##00_re = SU3_ROW_COL_RE(ma, 0, mb##T, 0); \
  mc##00_im = SU3_ROW_COL_IM(ma, 0, mb##T, 0); \
  mc##10_re = SU3_ROW_COL_RE(ma, 1, mb##T, 0); \
  mc##10_im = SU3_ROW_COL_IM(ma, 1, mb##T, 0); \
  mc##20_re = SU3_ROW_COL_RE(ma, 2, mb##T, 0); \
  mc##20_im = SU3_ROW_COL_IM(ma, 2, mb##T, 0); \
  mc##01_re = SU3_ROW_COL_RE(ma, 0, mb##T, 1); \
  mc##01_im = SU3_ROW_COL_IM(ma, 0, mb##T, 1); \
  mc##11_re = SU3_ROW_COL_RE(ma, 1, mb##T, 1); \
  mc##11_im = SU3_ROW_COL_IM(ma, 1, mb##T, 1); \
  mc##21_re = SU3_ROW_COL_RE(ma, 2, mb##T, 1); \
  mc##21_im = SU3_ROW_COL_IM(ma, 2, mb##T, 1); \
  mc##02_re = SU3_ROW_COL_RE(ma, 0, mb##T, 2); \
  mc##02_im = SU3_ROW_COL_IM(ma, 0, mb##T, 2); \
  mc##12_re = SU3_ROW_COL_RE(ma, 1, mb##T, 2); \
  mc##12_im = SU3_ROW_COL_IM(ma, 1, mb##T, 2); \
  mc##22_re = SU3_ROW_COL_RE(ma, 2, mb##T, 2); \
  mc##22_im = SU3_ROW_COL_IM(ma, 2, mb##T, 2);

/*
 * mc = adjoint(ma) * mb, using the <ma>T.. element names.
 * Same usage restrictions as MULT_SU3_NN.
 */
#define MULT_SU3_AN(ma, mb, mc) \
  mc##00_re = SU3_ROW_COL_RE(ma##T, 0, mb, 0); \
  mc##00_im = SU3_ROW_COL_IM(ma##T, 0, mb, 0); \
  mc##10_re = SU3_ROW_COL_RE(ma##T, 1, mb, 0); \
  mc##10_im = SU3_ROW_COL_IM(ma##T, 1, mb, 0); \
  mc##20_re = SU3_ROW_COL_RE(ma##T, 2, mb, 0); \
  mc##20_im = SU3_ROW_COL_IM(ma##T, 2, mb, 0); \
  mc##01_re = SU3_ROW_COL_RE(ma##T, 0, mb, 1); \
  mc##01_im = SU3_ROW_COL_IM(ma##T, 0, mb, 1); \
  mc##11_re = SU3_ROW_COL_RE(ma##T, 1, mb, 1); \
  mc##11_im = SU3_ROW_COL_IM(ma##T, 1, mb, 1); \
  mc##21_re = SU3_ROW_COL_RE(ma##T, 2, mb, 1); \
  mc##21_im = SU3_ROW_COL_IM(ma##T, 2, mb, 1); \
  mc##02_re = SU3_ROW_COL_RE(ma##T, 0, mb, 2); \
  mc##02_im = SU3_ROW_COL_IM(ma##T, 0, mb, 2); \
  mc##12_re = SU3_ROW_COL_RE(ma##T, 1, mb, 2); \
  mc##12_im = SU3_ROW_COL_IM(ma##T, 1, mb, 2); \
  mc##22_re = SU3_ROW_COL_RE(ma##T, 2, mb, 2); \
  mc##22_im = SU3_ROW_COL_IM(ma##T, 2, mb, 2);

/* Assign `value` to every real and imaginary component of matrix a. */
#define SET_SU3_MATRIX(a, value) \
  a##00_re = (value); a##00_im = (value); \
  a##01_re = (value); a##01_im = (value); \
  a##02_re = (value); a##02_im = (value); \
  a##10_re = (value); a##10_im = (value); \
  a##11_re = (value); a##11_im = (value); \
  a##12_re = (value); a##12_im = (value); \
  a##20_re = (value); a##20_im = (value); \
  a##21_re = (value); a##21_im = (value); \
  a##22_re = (value); a##22_im = (value);

/* mc = ma + mb * s, component by component (s is a real scalar expression). */
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc) \
  mc##00_re = ma##00_re + mb##00_re * (s); \
  mc##00_im = ma##00_im + mb##00_im * (s); \
  mc##01_re = ma##01_re + mb##01_re * (s); \
  mc##01_im = ma##01_im + mb##01_im * (s); \
  mc##02_re = ma##02_re + mb##02_re * (s); \
  mc##02_im = ma##02_im + mb##02_im * (s); \
  mc##10_re = ma##10_re + mb##10_re * (s); \
  mc##10_im = ma##10_im + mb##10_im * (s); \
  mc##11_re = ma##11_re + mb##11_re * (s); \
  mc##11_im = ma##11_im + mb##11_im * (s); \
  mc##12_re = ma##12_re + mb##12_re * (s); \
  mc##12_im = ma##12_im + mb##12_im * (s); \
  mc##20_re = ma##20_re + mb##20_re * (s); \
  mc##20_im = ma##20_im + mb##20_im * (s); \
  mc##21_re = ma##21_re + mb##21_re * (s); \
  mc##21_im = ma##21_im + mb##21_im * (s); \
  mc##22_re = ma##22_re + mb##22_re * (s); \
  mc##22_im = ma##22_im + mb##22_im * (s);

/* mc = ma - mb * s, component by component (s is a real scalar expression). */
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc) \
  mc##00_re = ma##00_re - mb##00_re * (s); \
  mc##00_im = ma##00_im - mb##00_im * (s); \
  mc##01_re = ma##01_re - mb##01_re * (s); \
  mc##01_im = ma##01_im - mb##01_im * (s); \
  mc##02_re = ma##02_re - mb##02_re * (s); \
  mc##02_im = ma##02_im - mb##02_im * (s); \
  mc##10_re = ma##10_re - mb##10_re * (s); \
  mc##10_im = ma##10_im - mb##10_im * (s); \
  mc##11_re = ma##11_re - mb##11_re * (s); \
  mc##11_im = ma##11_im - mb##11_im * (s); \
  mc##12_re = ma##12_re - mb##12_re * (s); \
  mc##12_im = ma##12_im - mb##12_im * (s); \
  mc##20_re = ma##20_re - mb##20_re * (s); \
  mc##20_im = ma##20_im - mb##20_im * (s); \
  mc##21_re = ma##21_re - mb##21_re * (s); \
  mc##21_im = ma##21_im - mb##21_im * (s); \
  mc##22_re = ma##22_re - mb##22_re * (s); \
  mc##22_im = ma##22_im - mb##22_im * (s);

/*
 * Compressed anti-Hermitian matrix "ah": the three upper off-diagonal
 * complex elements, the three imaginary diagonal entries and one unused
 * padding slot (ahspace), packed into AH0..AH4.
 */
#define ah01_re AH0.x
#define ah01_im AH0.y
#define ah02_re AH1.x
#define ah02_im AH1.y
#define ah12_re AH2.x
#define ah12_im AH2.y
#define ah00_im AH3.x
#define ah11_im AH3.y
#define ah22_im AH4.x
#define ahspace AH4.y

/*
 * Expand compressed matrix `ah` into the full 3x3 matrix `m`: the diagonal
 * is purely imaginary and each lower element is minus the conjugate of its
 * upper counterpart (real part negated, imaginary part copied).
 */
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m) \
  m##00_re = 0; \
  m##00_im = ah##00_im; \
  m##11_re = 0; \
  m##11_im = ah##11_im; \
  m##22_re = 0; \
  m##22_im = ah##22_im; \
  m##01_re = ah##01_re; \
  m##01_im = ah##01_im; \
  m##10_re = -ah##01_re; \
  m##10_im = ah##01_im; \
  m##02_re = ah##02_re; \
  m##02_im = ah##02_im; \
  m##20_re = -ah##02_re; \
  m##20_im = ah##02_im; \
  m##12_re = ah##12_re; \
  m##12_im = ah##12_im; \
  m##21_re = -ah##12_re; \
  m##21_im = ah##12_im;

/*
 * Store into compressed form `ah` the result of subtracting one third of the
 * sum of m's imaginary diagonal from each imaginary diagonal entry and
 * taking (m01 - conj(m10))/2 etc. for the off-diagonal elements.  The
 * padding slot is zeroed.
 * __typeof__ is used instead of typeof because the bare keyword is rejected
 * in strict ISO C modes.
 */
#define MAKE_ANTI_HERMITIAN(m, ah) do { \
    __typeof__(ah##space) diag_shift_; \
    diag_shift_ = (m##00_im + m##11_im + m##22_im)*0.33333333333333333; \
    ah##00_im = (m##00_im - diag_shift_); \
    ah##11_im = (m##11_im - diag_shift_); \
    ah##22_im = (m##22_im - diag_shift_); \
    ah##01_re = (m##01_re - m##10_re)*0.5; \
    ah##02_re = (m##02_re - m##20_re)*0.5; \
    ah##12_re = (m##12_re - m##21_re)*0.5; \
    ah##01_im = (m##01_im + m##10_im)*0.5; \
    ah##02_im = (m##02_im + m##20_im)*0.5; \
    ah##12_im = (m##12_im + m##21_im)*0.5; \
    ah##space = 0; \
  } while(0)

/*
 * Load var##0..var##4 from src[] for direction `dir` at site `idx`; the five
 * reads sit at offsets 0, Vh, Vhx2, Vhx3, Vhx4 from idx + dir*Vhx5.
 */
#define LOAD_ANTI_HERMITIAN_SINGLE(src, dir, idx, var) do { \
    int start_pos = (idx) + (dir)*Vhx5; \
    var##0 = src[start_pos]; \
    var##1 = src[start_pos + Vh]; \
    var##2 = src[start_pos + Vhx2]; \
    var##3 = src[start_pos + Vhx3]; \
    var##4 = src[start_pos + Vhx4]; \
  } while(0)

/* Same as LOAD_ANTI_HERMITIAN_SINGLE, but reads through tex1Dfetch(). */
#define LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var) do { \
    int start_pos = (idx) + (dir)*Vhx5; \
    var##0 = tex1Dfetch(src, start_pos); \
    var##1 = tex1Dfetch(src, start_pos + Vh); \
    var##2 = tex1Dfetch(src, start_pos + Vhx2); \
    var##3 = tex1Dfetch(src, start_pos + Vhx3); \
    var##4 = tex1Dfetch(src, start_pos + Vhx4); \
  } while(0)

/* Store var##0..var##4 to mem[] with the layout used by the loaders above. */
#define WRITE_ANTI_HERMITIAN_SINGLE(mem, dir, idx, var) do { \
    int start_ps = (idx) + (dir)*Vhx5; \
    mem[start_ps] = var##0; \
    mem[start_ps + Vh] = var##1; \
    mem[start_ps + Vhx2] = var##2; \
    mem[start_ps + Vhx3] = var##3; \
    mem[start_ps + Vhx4] = var##4; \
  } while(0)

/*
 * Writes the fixed pattern {1,1}..{5,5} to the five slots and ignores `var`.
 * NOTE(review): looks like a debugging aid -- confirm before relying on it.
 */
#define WRITE_ANTI_HERMITIAN_SINGLE_A(mem, dir, idx, var) do { \
    int start_ps = (idx) + (dir)*Vhx5; \
    mem[start_ps] = (float2){1,1}; \
    mem[start_ps + Vh] = (float2){2,2}; \
    mem[start_ps + Vhx2] = (float2){3,3}; \
    mem[start_ps + Vhx3] = (float2){4,4}; \
    mem[start_ps + Vhx4] = (float2){5,5}; \
  } while(0)

/* b = a, component by component. */
#define COPY_SU3_MATRIX(a, b) \
  b##00_re = a##00_re; \
  b##00_im = a##00_im; \
  b##01_re = a##01_re; \
  b##01_im = a##01_im; \
  b##02_re = a##02_re; \
  b##02_im = a##02_im; \
  b##10_re = a##10_re; \
  b##10_im = a##10_im; \
  b##11_re = a##11_re; \
  b##11_im = a##11_im; \
  b##12_re = a##12_re; \
  b##12_im = a##12_im; \
  b##20_re = a##20_re; \
  b##20_im = a##20_im; \
  b##21_re = a##21_re; \
  b##21_im = a##21_im; \
  b##22_re = a##22_re; \
  b##22_im = a##22_im;

/* b = adjoint(a).  a and b must be different matrices. */
#define SU3_ADJOINT(a, b) \
  b##00_re = a##00_re; \
  b##00_im = - a##00_im; \
  b##01_re = a##10_re; \
  b##01_im = - a##10_im; \
  b##02_re = a##20_re; \
  b##02_im = - a##20_im; \
  b##10_re = a##01_re; \
  b##10_im = - a##01_im; \
  b##11_re = a##11_re; \
  b##11_im = - a##11_im; \
  b##12_re = a##21_re; \
  b##12_im = - a##21_im; \
  b##20_re = a##02_re; \
  b##20_im = - a##02_im; \
  b##21_re = a##12_re; \
  b##21_im = - a##12_im; \
  b##22_re = a##22_re; \
  b##22_im = - a##22_im;

/* a = identity matrix. */
#define SET_UNIT_SU3_MATRIX(a) \
  a##00_re = 1.0; \
  a##00_im = 0; \
  a##01_re = 0; \
  a##01_im = 0; \
  a##02_re = 0; \
  a##02_im = 0; \
  a##10_re = 0; \
  a##10_im = 0; \
  a##11_re = 1.0; \
  a##11_im = 0; \
  a##12_re = 0; \
  a##12_im = 0; \
  a##20_re = 0; \
  a##20_im = 0; \
  a##21_re = 0; \
  a##21_im = 0; \
  a##22_re = 1.0; \
  a##22_im = 0;

/*
 * Assigns the conjugated complex product: a = conj(b) * conj(c).
 * The expansion deliberately has no trailing semicolon.
 */
#define ACC_CONJ_PROD_ASSIGN(a, b, c) \
  a##_re = b##_re * c##_re; \
  a##_re -= b##_im * c##_im; \
  a##_im = - b##_re * c##_im; \
  a##_im -= b##_im * c##_re

/*
 * Rebuild the third row (elements 20, 21, 22) of `var` from its first two
 * rows and scale it by `sign`.  `dir` and `idx` are accepted but not used.
 * ACC_CONJ_PROD is not defined in this header and must be supplied by the
 * including code.
 */
#define RECONSTRUCT_LINK_12(dir, idx, sign, var) \
  ACC_CONJ_PROD_ASSIGN(var##20, +var##01, +var##12); \
  ACC_CONJ_PROD(var##20, -var##02, +var##11); \
  ACC_CONJ_PROD_ASSIGN(var##21, +var##02, +var##10); \
  ACC_CONJ_PROD(var##21, -var##00, +var##12); \
  ACC_CONJ_PROD_ASSIGN(var##22, +var##00, +var##11); \
  ACC_CONJ_PROD(var##22, -var##01, +var##10); \
  var##20_re *=sign;var##20_im *=sign; var##21_re *=sign; var##21_im *=sign; \
  var##22_re *=sign;var##22_im *=sign;

/*
 * Neighbour-index helpers.  Each one sets the caller's `new_mem_idx` to the
 * index one step from `idx` along dimension `mydir` (0..3), wrapping around
 * at the lattice boundary.  The wrap test uses the caller's coordinates
 * x1..x4, so `idx` must have the same coordinate as those variables in the
 * dimension being stepped.  Any other `mydir` leaves new_mem_idx untouched.
 */

/* Forward step; the result is additionally halved (>> 1). */
#define COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = ( (x1==X1m1)?(idx)-X1m1:(idx)+1) >> 1; \
      break; \
    case 1: \
      new_mem_idx = ( (x2==X2m1)?(idx)-X2X1mX1:(idx)+X1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = ( (x3==X3m1)?(idx)-X3X2X1mX2X1:(idx)+X2X1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = ( (x4==X4m1)?(idx)-X4X3X2X1mX3X2X1:(idx)+X3X2X1) >> 1; \
      break; \
    default: \
      break; \
    } \
  } while(0)

/*
 * Backward step.  The non-wrapping branch uses `idx` (it previously read the
 * caller's variable X and ignored the argument).
 * NOTE(review): unlike COMPUTE_NEW_IDX_PLUS the result is not shifted right
 * by one -- confirm whether that asymmetry is intended.
 */
#define COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = ( (x1==0)?(idx)+X1m1:(idx)-1); \
      break; \
    case 1: \
      new_mem_idx = ( (x2==0)?(idx)+X2X1mX1:(idx)-X1); \
      break; \
    case 2: \
      new_mem_idx = ( (x3==0)?(idx)+X3X2X1mX2X1:(idx)-X2X1); \
      break; \
    case 3: \
      new_mem_idx = ( (x4==0)?(idx)+X4X3X2X1mX3X2X1:(idx)-X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while(0)

/* Forward step; the result is not halved. */
#define COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = ( (x1==X1m1)?(idx)-X1m1:(idx)+1); \
      break; \
    case 1: \
      new_mem_idx = ( (x2==X2m1)?(idx)-X2X1mX1:(idx)+X1); \
      break; \
    case 2: \
      new_mem_idx = ( (x3==X3m1)?(idx)-X3X2X1mX2X1:(idx)+X2X1); \
      break; \
    case 3: \
      new_mem_idx = ( (x4==X4m1)?(idx)-X4X3X2X1mX3X2X1:(idx)+X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while(0)

/*
 * Backward step; the result is not halved.  The non-wrapping branch uses
 * `idx` (it previously read the caller's variable X and ignored the
 * argument).
 */
#define COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = ( (x1==0)?(idx)+X1m1:(idx)-1); \
      break; \
    case 1: \
      new_mem_idx = ( (x2==0)?(idx)+X2X1mX1:(idx)-X1); \
      break; \
    case 2: \
      new_mem_idx = ( (x3==0)?(idx)+X3X2X1mX2X1:(idx)-X2X1); \
      break; \
    case 3: \
      new_mem_idx = ( (x4==0)?(idx)+X4X3X2X1mX3X2X1:(idx)-X3X2X1); \
      break; \
    default: \
      break; \
    } \
  } while(0)

#endif
/* 1.7.3 -- presumably the version footer of the documentation generator; not part of the header */