QUDA v0.4.0
A library for QCD on GPUs
quda/lib/force_common.h
Go to the documentation of this file.
00001 
00002 #ifndef __KERNEL_COMMOM_MACRO_H__
00003 #define __KERNEL_COMMOM_MACRO_H__
00004 
/*
 * Direction codes: four "up" codes (X, Y, Z, T) followed by the four "down"
 * codes in reverse order, so that every code and its opposite add up to 7.
 */
#define XUP   0
#define YUP   1
#define ZUP   2
#define TUP   3
#define TDOWN 4
#define ZDOWN 5
#define YDOWN 6
#define XDOWN 7

/* Opposite of `dir` (XUP <-> XDOWN, YUP <-> YDOWN, ...). */
#define OPP_DIR(dir)        (7 - (dir))
/*
 * Non-zero when `dir` is an "up" code (0..3) / a "down" code (4..7).
 * The argument is parenthesised so that expressions such as `d & 7`
 * are compared as a whole instead of being split by operator precedence.
 */
#define GOES_FORWARDS(dir)  ((dir) <= 3)
#define GOES_BACKWARDS(dir) ((dir) > 3)
00016 
00017 
/*
 * Conjugate-transpose view of the 3x3 complex matrix "linka".
 * linkaT<r><c> aliases the complex conjugate of linka<c><r>: the real part
 * is taken as is and the imaginary part is negated.  The linka<r><c>_re/_im
 * names are not defined in this block; they must already be visible at the
 * point of use.
 */
/* row 0 */
#define linkaT00_re  (+linka00_re)
#define linkaT00_im  (-linka00_im)
#define linkaT01_re  (+linka10_re)
#define linkaT01_im  (-linka10_im)
#define linkaT02_re  (+linka20_re)
#define linkaT02_im  (-linka20_im)
/* row 1 */
#define linkaT10_re  (+linka01_re)
#define linkaT10_im  (-linka01_im)
#define linkaT11_re  (+linka11_re)
#define linkaT11_im  (-linka11_im)
#define linkaT12_re  (+linka21_re)
#define linkaT12_im  (-linka21_im)
/* row 2 */
#define linkaT20_re  (+linka02_re)
#define linkaT20_im  (-linka02_im)
#define linkaT21_re  (+linka12_re)
#define linkaT21_im  (-linka12_im)
#define linkaT22_re  (+linka22_re)
#define linkaT22_im  (-linka22_im)
00036 
00037 
/*
 * Conjugate-transpose view of the 3x3 complex matrix "linkb".
 * linkbT<r><c> aliases the complex conjugate of linkb<c><r>: the real part
 * is taken as is and the imaginary part is negated.  The linkb<r><c>_re/_im
 * names are not defined in this block; they must already be visible at the
 * point of use.
 */
/* row 0 */
#define linkbT00_re  (+linkb00_re)
#define linkbT00_im  (-linkb00_im)
#define linkbT01_re  (+linkb10_re)
#define linkbT01_im  (-linkb10_im)
#define linkbT02_re  (+linkb20_re)
#define linkbT02_im  (-linkb20_im)
/* row 1 */
#define linkbT10_re  (+linkb01_re)
#define linkbT10_im  (-linkb01_im)
#define linkbT11_re  (+linkb11_re)
#define linkbT11_im  (-linkb11_im)
#define linkbT12_re  (+linkb21_re)
#define linkbT12_im  (-linkb21_im)
/* row 2 */
#define linkbT20_re  (+linkb02_re)
#define linkbT20_im  (-linkb02_im)
#define linkbT21_re  (+linkb12_re)
#define linkbT21_im  (-linkb12_im)
#define linkbT22_re  (+linkb22_re)
#define linkbT22_im  (-linkb22_im)
00056 
00057 
00058 
00059 
/*
 * Component map for the 3x3 complex matrix "linkc".
 * The 18 real numbers are laid out row by row across the members
 * x, y, z, w of LINKC0 .. LINKC3 and the members x, y of LINKC4.
 * LINKC0 .. LINKC4 are not defined here and must be provided by the
 * code that uses these names.
 */
/* row 0 */
#define linkc00_re  LINKC0.x
#define linkc00_im  LINKC0.y
#define linkc01_re  LINKC0.z
#define linkc01_im  LINKC0.w
#define linkc02_re  LINKC1.x
#define linkc02_im  LINKC1.y
/* row 1 */
#define linkc10_re  LINKC1.z
#define linkc10_im  LINKC1.w
#define linkc11_re  LINKC2.x
#define linkc11_im  LINKC2.y
#define linkc12_re  LINKC2.z
#define linkc12_im  LINKC2.w
/* row 2 */
#define linkc20_re  LINKC3.x
#define linkc20_im  LINKC3.y
#define linkc21_re  LINKC3.z
#define linkc21_im  LINKC3.w
#define linkc22_re  LINKC4.x
#define linkc22_im  LINKC4.y
00078 
/*
 * Conjugate-transpose view of the 3x3 complex matrix "linkc".
 * linkcT<r><c> aliases the complex conjugate of linkc<c><r>: the real part
 * is taken as is and the imaginary part is negated.
 */
/* row 0 */
#define linkcT00_re  (+linkc00_re)
#define linkcT00_im  (-linkc00_im)
#define linkcT01_re  (+linkc10_re)
#define linkcT01_im  (-linkc10_im)
#define linkcT02_re  (+linkc20_re)
#define linkcT02_im  (-linkc20_im)
/* row 1 */
#define linkcT10_re  (+linkc01_re)
#define linkcT10_im  (-linkc01_im)
#define linkcT11_re  (+linkc11_re)
#define linkcT11_im  (-linkc11_im)
#define linkcT12_re  (+linkc21_re)
#define linkcT12_im  (-linkc21_im)
/* row 2 */
#define linkcT20_re  (+linkc02_re)
#define linkcT20_im  (-linkc02_im)
#define linkcT21_re  (+linkc12_re)
#define linkcT21_im  (-linkc12_im)
#define linkcT22_re  (+linkc22_re)
#define linkcT22_im  (-linkc22_im)
00097 
00098 
/*
 * Component map for the 3x3 complex matrix "staple".
 * Entry (r,c) lives in STAPLE<3*r + c>: member x holds the real part and
 * member y the imaginary part.  STAPLE0 .. STAPLE8 are not defined here and
 * must be provided by the code that uses these names.
 */
/* row 0 */
#define staple00_re  STAPLE0.x
#define staple00_im  STAPLE0.y
#define staple01_re  STAPLE1.x
#define staple01_im  STAPLE1.y
#define staple02_re  STAPLE2.x
#define staple02_im  STAPLE2.y
/* row 1 */
#define staple10_re  STAPLE3.x
#define staple10_im  STAPLE3.y
#define staple11_re  STAPLE4.x
#define staple11_im  STAPLE4.y
#define staple12_re  STAPLE5.x
#define staple12_im  STAPLE5.y
/* row 2 */
#define staple20_re  STAPLE6.x
#define staple20_im  STAPLE6.y
#define staple21_re  STAPLE7.x
#define staple21_im  STAPLE7.y
#define staple22_re  STAPLE8.x
#define staple22_im  STAPLE8.y
00117 
/*
 * Conjugate-transpose view of the 3x3 complex matrix "staple".
 * stapleT<r><c> aliases the complex conjugate of staple<c><r>: the real
 * part is taken as is and the imaginary part is negated.
 */
/* row 0 */
#define stapleT00_re  (+staple00_re)
#define stapleT00_im  (-staple00_im)
#define stapleT01_re  (+staple10_re)
#define stapleT01_im  (-staple10_im)
#define stapleT02_re  (+staple20_re)
#define stapleT02_im  (-staple20_im)
/* row 1 */
#define stapleT10_re  (+staple01_re)
#define stapleT10_im  (-staple01_im)
#define stapleT11_re  (+staple11_re)
#define stapleT11_im  (-staple11_im)
#define stapleT12_re  (+staple21_re)
#define stapleT12_im  (-staple21_im)
/* row 2 */
#define stapleT20_re  (+staple02_re)
#define stapleT20_im  (-staple02_im)
#define stapleT21_re  (+staple12_re)
#define stapleT21_im  (-staple12_im)
#define stapleT22_re  (+staple22_re)
#define stapleT22_im  (-staple22_im)
00136 
/*
 * READ_DOUBLE2_TEXTURE(x_tex, x, i): read element `i` of a data array.
 *   x_tex - texture handle; only used by the fetch_double2() path
 *   x     - array or pointer; only used by the direct-indexing path
 *   i     - element index
 * With FERMI_DBLE_NO_TEX defined the element is read directly as (x)[i];
 * otherwise it is obtained from fetch_double2(x_tex, i), which must be
 * declared by the including code.  `x` is parenthesised so that pointer
 * expressions such as `base + offset` are indexed as a whole.
 */
#ifdef FERMI_DBLE_NO_TEX
#define READ_DOUBLE2_TEXTURE(x_tex, x, i)  ((x)[i])
#else
#define READ_DOUBLE2_TEXTURE(x_tex, x, i)  fetch_double2(x_tex, i)
#endif
00144 
00145 
/*
 * LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride)
 * Loads three consecutive-stride elements of `gauge` into var0, var1, var2.
 * Element k is read from gauge[idx + dir*stride*3 + k*stride].
 *   gauge  - array or pointer to read from
 *   dir    - direction index selecting a group of 3 stride-separated elements
 *   idx    - site index within the group
 *   var    - identifier prefix of the three destination variables
 *   stride - distance (in elements) between successive elements of a matrix
 * All expression arguments are parenthesised, so compound expressions such
 * as `d + 1` or `n / 2` may be passed safely.
 */
#define LOAD_MATRIX_12_SINGLE(gauge, dir, idx, var, stride) do {        \
    var##0 = (gauge)[(idx) + (dir)*(stride)*3];                         \
    var##1 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)];              \
    var##2 = (gauge)[(idx) + (dir)*(stride)*3 + (stride)*2];            \
  } while (0)
00151 
/*
 * LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride)
 * Same access pattern as a direct 3-element load, but every element is
 * obtained through tex1Dfetch(gauge, index) with
 * index = idx + dir*stride*3 + k*stride for k = 0..2; the results go to
 * var0, var1, var2.  tex1Dfetch must be provided by the including code.
 * Index arguments are parenthesised so compound expressions are safe.
 */
#define LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride) do {        \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3);                   \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride));        \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*3 + (stride)*2);      \
  } while (0)
00157 
/*
 * LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride)
 * Loads six stride-separated elements of `gauge` into var0 .. var5.
 * Element k is read from gauge[idx + dir*stride*6 + k*stride].
 *   gauge  - array or pointer to read from
 *   dir    - direction index selecting a group of 6 elements
 *   idx    - site index within the group
 *   var    - identifier prefix of the six destination variables
 *   stride - distance (in elements) between successive elements
 * All expression arguments are parenthesised, so compound expressions may
 * be passed safely.
 */
#define LOAD_MATRIX_12_DOUBLE(gauge, dir, idx, var, stride) do {        \
    var##0 = (gauge)[(idx) + (dir)*(stride)*6];                         \
    var##1 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)];              \
    var##2 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*2];            \
    var##3 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*3];            \
    var##4 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*4];            \
    var##5 = (gauge)[(idx) + (dir)*(stride)*6 + (stride)*5];            \
  } while (0)
00166 
/*
 * LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
 * Loads six stride-separated elements into var0 .. var5 through
 * READ_DOUBLE2_TEXTURE(gauge_tex, gauge, index), where
 * index = idx + dir*stride*6 + k*stride for k = 0..5.
 * Index arguments are parenthesised so compound expressions are safe.
 */
#define LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do {                 \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6);                  \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride));       \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*2);     \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*3);     \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*4);     \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*6 + (stride)*5);     \
  } while (0)
00175 
/*
 * LOAD_MATRIX_18(gauge, dir, idx, var, stride)
 * Loads nine stride-separated elements of `gauge` into var0 .. var8.
 * Element k is read from gauge[idx + dir*stride*9 + k*stride].
 *   gauge  - array or pointer to read from
 *   dir    - direction index selecting a group of 9 elements
 *   idx    - site index within the group
 *   var    - identifier prefix of the nine destination variables
 *   stride - distance (in elements) between successive elements
 * All expression arguments are parenthesised, so compound expressions may
 * be passed safely.
 */
#define LOAD_MATRIX_18(gauge, dir, idx, var, stride) do {               \
    var##0 = (gauge)[(idx) + (dir)*(stride)*9];                         \
    var##1 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)];              \
    var##2 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*2];            \
    var##3 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*3];            \
    var##4 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*4];            \
    var##5 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*5];            \
    var##6 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*6];            \
    var##7 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*7];            \
    var##8 = (gauge)[(idx) + (dir)*(stride)*9 + (stride)*8];            \
  } while (0)
00187 
/*
 * LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride)
 * Loads nine stride-separated elements into var0 .. var8 through
 * tex1Dfetch(gauge, index), where
 * index = idx + dir*stride*9 + k*stride for k = 0..8.
 * tex1Dfetch must be provided by the including code.
 * Index arguments are parenthesised so compound expressions are safe.
 */
#define LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride) do {        \
    var##0 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9);                   \
    var##1 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride));        \
    var##2 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*2);      \
    var##3 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*3);      \
    var##4 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*4);      \
    var##5 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*5);      \
    var##6 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*6);      \
    var##7 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*7);      \
    var##8 = tex1Dfetch(gauge, (idx) + (dir)*(stride)*9 + (stride)*8);      \
  } while (0)
00199 
/*
 * LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
 * Loads nine stride-separated elements into var0 .. var8 through
 * READ_DOUBLE2_TEXTURE(gauge_tex, gauge, index), where
 * index = idx + dir*stride*9 + k*stride for k = 0..8.
 * Index arguments are parenthesised so compound expressions are safe.
 */
#define LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride) do {                 \
    var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9);                  \
    var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride));       \
    var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*2);     \
    var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*3);     \
    var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*4);     \
    var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*5);     \
    var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*6);     \
    var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*7);     \
    var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, (idx) + (dir)*(stride)*9 + (stride)*8);     \
  } while (0)
00211 
/*
 * MULT_SU3_NN(ma, mb, mc): 3x3 complex matrix product, mc = ma * mb.
 *
 * ma, mb and mc are identifier prefixes, not expressions: entry (r,c) of a
 * matrix is accessed through the pasted names <prefix><r><c>_re and
 * <prefix><r><c>_im.  Each output entry is the sum over k of
 * ma(r,k) * mb(k,c), written out in real and imaginary parts.
 *
 * Usage notes:
 *  - The expansion is 18 consecutive assignment statements with no enclosing
 *    do { } while (0), so it must not be used as the unbraced body of an
 *    if/else or a loop.
 *  - Outputs are stored while later statements still read ma and mb, so mc
 *    must not name the same matrix as ma or mb.
 */
00212 #define MULT_SU3_NN(ma, mb, mc)                                 \
00213     mc##00_re =                                                 \
00214         ma##00_re * mb##00_re - ma##00_im * mb##00_im +         \
00215         ma##01_re * mb##10_re - ma##01_im * mb##10_im +         \
00216         ma##02_re * mb##20_re - ma##02_im * mb##20_im;          \
00217     mc##00_im =                                                 \
00218         ma##00_re * mb##00_im + ma##00_im * mb##00_re +         \
00219         ma##01_re * mb##10_im + ma##01_im * mb##10_re +         \
00220         ma##02_re * mb##20_im + ma##02_im * mb##20_re;          \
00221     mc##10_re =                                                 \
00222         ma##10_re * mb##00_re - ma##10_im * mb##00_im +         \
00223         ma##11_re * mb##10_re - ma##11_im * mb##10_im +         \
00224         ma##12_re * mb##20_re - ma##12_im * mb##20_im;          \
00225     mc##10_im =                                                 \
00226         ma##10_re * mb##00_im + ma##10_im * mb##00_re +         \
00227         ma##11_re * mb##10_im + ma##11_im * mb##10_re +         \
00228         ma##12_re * mb##20_im + ma##12_im * mb##20_re;          \
00229     mc##20_re =                                                 \
00230         ma##20_re * mb##00_re - ma##20_im * mb##00_im +         \
00231         ma##21_re * mb##10_re - ma##21_im * mb##10_im +         \
00232         ma##22_re * mb##20_re - ma##22_im * mb##20_im;          \
00233     mc##20_im =                                                 \
00234         ma##20_re * mb##00_im + ma##20_im * mb##00_re +         \
00235         ma##21_re * mb##10_im + ma##21_im * mb##10_re +         \
00236         ma##22_re * mb##20_im + ma##22_im * mb##20_re;          \
00237     mc##01_re =                                                 \
00238         ma##00_re * mb##01_re - ma##00_im * mb##01_im +         \
00239         ma##01_re * mb##11_re - ma##01_im * mb##11_im +         \
00240         ma##02_re * mb##21_re - ma##02_im * mb##21_im;          \
00241     mc##01_im =                                                 \
00242         ma##00_re * mb##01_im + ma##00_im * mb##01_re +         \
00243         ma##01_re * mb##11_im + ma##01_im * mb##11_re +         \
00244         ma##02_re * mb##21_im + ma##02_im * mb##21_re;          \
00245     mc##11_re =                                                 \
00246         ma##10_re * mb##01_re - ma##10_im * mb##01_im +         \
00247         ma##11_re * mb##11_re - ma##11_im * mb##11_im +         \
00248         ma##12_re * mb##21_re - ma##12_im * mb##21_im;          \
00249     mc##11_im =                                                 \
00250         ma##10_re * mb##01_im + ma##10_im * mb##01_re +         \
00251         ma##11_re * mb##11_im + ma##11_im * mb##11_re +         \
00252         ma##12_re * mb##21_im + ma##12_im * mb##21_re;          \
00253     mc##21_re =                                                 \
00254         ma##20_re * mb##01_re - ma##20_im * mb##01_im +         \
00255         ma##21_re * mb##11_re - ma##21_im * mb##11_im +         \
00256         ma##22_re * mb##21_re - ma##22_im * mb##21_im;          \
00257     mc##21_im =                                                 \
00258         ma##20_re * mb##01_im + ma##20_im * mb##01_re +         \
00259         ma##21_re * mb##11_im + ma##21_im * mb##11_re +         \
00260         ma##22_re * mb##21_im + ma##22_im * mb##21_re;          \
00261     mc##02_re =                                                 \
00262         ma##00_re * mb##02_re - ma##00_im * mb##02_im +         \
00263         ma##01_re * mb##12_re - ma##01_im * mb##12_im +         \
00264         ma##02_re * mb##22_re - ma##02_im * mb##22_im;          \
00265     mc##02_im =                                                 \
00266         ma##00_re * mb##02_im + ma##00_im * mb##02_re +         \
00267         ma##01_re * mb##12_im + ma##01_im * mb##12_re +         \
00268         ma##02_re * mb##22_im + ma##02_im * mb##22_re;          \
00269     mc##12_re =                                                 \
00270         ma##10_re * mb##02_re - ma##10_im * mb##02_im +         \
00271         ma##11_re * mb##12_re - ma##11_im * mb##12_im +         \
00272         ma##12_re * mb##22_re - ma##12_im * mb##22_im;          \
00273     mc##12_im =                                                 \
00274         ma##10_re * mb##02_im + ma##10_im * mb##02_re +         \
00275         ma##11_re * mb##12_im + ma##11_im * mb##12_re +         \
00276         ma##12_re * mb##22_im + ma##12_im * mb##22_re;          \
00277     mc##22_re =                                                 \
00278         ma##20_re * mb##02_re - ma##20_im * mb##02_im +         \
00279         ma##21_re * mb##12_re - ma##21_im * mb##12_im +         \
00280         ma##22_re * mb##22_re - ma##22_im * mb##22_im;          \
00281     mc##22_im =                                                 \
00282         ma##20_re * mb##02_im + ma##20_im * mb##02_re +         \
00283         ma##21_re * mb##12_im + ma##21_im * mb##12_re +         \
00284         ma##22_re * mb##22_im + ma##22_im * mb##22_re;
00285 
00286 
00287 
/*
 * MULT_SU3_NA(ma, mb, mc): 3x3 complex product with the second factor
 * taken through its "T" names, mc = ma * mbT.
 *
 * ma, mb and mc are identifier prefixes: entry (r,c) is accessed through
 * <prefix><r><c>_re / <prefix><r><c>_im, and the second factor is read
 * through <mb>T<r><c>_re / <mb>T<r><c>_im.  For the prefixes linka, linkb,
 * linkc and staple this file defines those T names as the conjugate
 * transpose of the matrix; for any other prefix the T names must be
 * supplied by the caller.
 *
 * Usage notes:
 *  - The expansion is 18 consecutive assignment statements with no enclosing
 *    do { } while (0), so it must not be used as the unbraced body of an
 *    if/else or a loop.
 *  - Outputs are stored while later statements still read ma and mb, so mc
 *    must not name the same matrix as ma or mb.
 */
00288 #define MULT_SU3_NA(ma, mb, mc)                                         \
00289     mc##00_re =                                                         \
00290         ma##00_re * mb##T00_re - ma##00_im * mb##T00_im +               \
00291         ma##01_re * mb##T10_re - ma##01_im * mb##T10_im +               \
00292         ma##02_re * mb##T20_re - ma##02_im * mb##T20_im;                \
00293     mc##00_im =                                                         \
00294         ma##00_re * mb##T00_im + ma##00_im * mb##T00_re +               \
00295         ma##01_re * mb##T10_im + ma##01_im * mb##T10_re +               \
00296         ma##02_re * mb##T20_im + ma##02_im * mb##T20_re;                \
00297     mc##10_re =                                                         \
00298         ma##10_re * mb##T00_re - ma##10_im * mb##T00_im +               \
00299         ma##11_re * mb##T10_re - ma##11_im * mb##T10_im +               \
00300         ma##12_re * mb##T20_re - ma##12_im * mb##T20_im;                \
00301     mc##10_im =                                                         \
00302         ma##10_re * mb##T00_im + ma##10_im * mb##T00_re +               \
00303         ma##11_re * mb##T10_im + ma##11_im * mb##T10_re +               \
00304         ma##12_re * mb##T20_im + ma##12_im * mb##T20_re;                \
00305     mc##20_re =                                                         \
00306         ma##20_re * mb##T00_re - ma##20_im * mb##T00_im +               \
00307         ma##21_re * mb##T10_re - ma##21_im * mb##T10_im +               \
00308         ma##22_re * mb##T20_re - ma##22_im * mb##T20_im;                \
00309     mc##20_im =                                                         \
00310         ma##20_re * mb##T00_im + ma##20_im * mb##T00_re +               \
00311         ma##21_re * mb##T10_im + ma##21_im * mb##T10_re +               \
00312         ma##22_re * mb##T20_im + ma##22_im * mb##T20_re;                \
00313     mc##01_re =                                                         \
00314         ma##00_re * mb##T01_re - ma##00_im * mb##T01_im +               \
00315         ma##01_re * mb##T11_re - ma##01_im * mb##T11_im +               \
00316         ma##02_re * mb##T21_re - ma##02_im * mb##T21_im;                \
00317     mc##01_im =                                                         \
00318         ma##00_re * mb##T01_im + ma##00_im * mb##T01_re +               \
00319         ma##01_re * mb##T11_im + ma##01_im * mb##T11_re +               \
00320         ma##02_re * mb##T21_im + ma##02_im * mb##T21_re;                \
00321     mc##11_re =                                                         \
00322         ma##10_re * mb##T01_re - ma##10_im * mb##T01_im +               \
00323         ma##11_re * mb##T11_re - ma##11_im * mb##T11_im +               \
00324         ma##12_re * mb##T21_re - ma##12_im * mb##T21_im;                \
00325     mc##11_im =                                                         \
00326         ma##10_re * mb##T01_im + ma##10_im * mb##T01_re +               \
00327         ma##11_re * mb##T11_im + ma##11_im * mb##T11_re +               \
00328         ma##12_re * mb##T21_im + ma##12_im * mb##T21_re;                \
00329     mc##21_re =                                                         \
00330         ma##20_re * mb##T01_re - ma##20_im * mb##T01_im +               \
00331         ma##21_re * mb##T11_re - ma##21_im * mb##T11_im +               \
00332         ma##22_re * mb##T21_re - ma##22_im * mb##T21_im;                \
00333     mc##21_im =                                                         \
00334         ma##20_re * mb##T01_im + ma##20_im * mb##T01_re +               \
00335         ma##21_re * mb##T11_im + ma##21_im * mb##T11_re +               \
00336         ma##22_re * mb##T21_im + ma##22_im * mb##T21_re;                \
00337     mc##02_re =                                                         \
00338         ma##00_re * mb##T02_re - ma##00_im * mb##T02_im +               \
00339         ma##01_re * mb##T12_re - ma##01_im * mb##T12_im +               \
00340         ma##02_re * mb##T22_re - ma##02_im * mb##T22_im;                \
00341     mc##02_im =                                                         \
00342         ma##00_re * mb##T02_im + ma##00_im * mb##T02_re +               \
00343         ma##01_re * mb##T12_im + ma##01_im * mb##T12_re +               \
00344         ma##02_re * mb##T22_im + ma##02_im * mb##T22_re;                \
00345     mc##12_re =                                                         \
00346         ma##10_re * mb##T02_re - ma##10_im * mb##T02_im +               \
00347         ma##11_re * mb##T12_re - ma##11_im * mb##T12_im +               \
00348         ma##12_re * mb##T22_re - ma##12_im * mb##T22_im;                \
00349     mc##12_im =                                                         \
00350         ma##10_re * mb##T02_im + ma##10_im * mb##T02_re +               \
00351         ma##11_re * mb##T12_im + ma##11_im * mb##T12_re +               \
00352         ma##12_re * mb##T22_im + ma##12_im * mb##T22_re;                \
00353     mc##22_re =                                                         \
00354         ma##20_re * mb##T02_re - ma##20_im * mb##T02_im +               \
00355         ma##21_re * mb##T12_re - ma##21_im * mb##T12_im +               \
00356         ma##22_re * mb##T22_re - ma##22_im * mb##T22_im;                \
00357     mc##22_im =                                                         \
00358         ma##20_re * mb##T02_im + ma##20_im * mb##T02_re +               \
00359         ma##21_re * mb##T12_im + ma##21_im * mb##T12_re +               \
00360         ma##22_re * mb##T22_im + ma##22_im * mb##T22_re;
00361 
00362 
00363 
/*
 * MULT_SU3_AN(ma, mb, mc): 3x3 complex product with the first factor
 * taken through its "T" names, mc = maT * mb.
 *
 * ma, mb and mc are identifier prefixes: entry (r,c) is accessed through
 * <prefix><r><c>_re / <prefix><r><c>_im, and the first factor is read
 * through <ma>T<r><c>_re / <ma>T<r><c>_im.  For the prefixes linka, linkb,
 * linkc and staple this file defines those T names as the conjugate
 * transpose of the matrix; for any other prefix the T names must be
 * supplied by the caller.
 *
 * Usage notes:
 *  - The expansion is 18 consecutive assignment statements with no enclosing
 *    do { } while (0), so it must not be used as the unbraced body of an
 *    if/else or a loop.
 *  - Outputs are stored while later statements still read ma and mb, so mc
 *    must not name the same matrix as ma or mb.
 */
00364 #define MULT_SU3_AN(ma, mb, mc)                                         \
00365     mc##00_re =                                                         \
00366         ma##T00_re * mb##00_re - ma##T00_im * mb##00_im +               \
00367         ma##T01_re * mb##10_re - ma##T01_im * mb##10_im +               \
00368         ma##T02_re * mb##20_re - ma##T02_im * mb##20_im;                \
00369     mc##00_im =                                                         \
00370         ma##T00_re * mb##00_im + ma##T00_im * mb##00_re +               \
00371         ma##T01_re * mb##10_im + ma##T01_im * mb##10_re +               \
00372         ma##T02_re * mb##20_im + ma##T02_im * mb##20_re;                \
00373     mc##10_re =                                                         \
00374         ma##T10_re * mb##00_re - ma##T10_im * mb##00_im +               \
00375         ma##T11_re * mb##10_re - ma##T11_im * mb##10_im +               \
00376         ma##T12_re * mb##20_re - ma##T12_im * mb##20_im;                \
00377     mc##10_im =                                                         \
00378         ma##T10_re * mb##00_im + ma##T10_im * mb##00_re +               \
00379         ma##T11_re * mb##10_im + ma##T11_im * mb##10_re +               \
00380         ma##T12_re * mb##20_im + ma##T12_im * mb##20_re;                \
00381     mc##20_re =                                                         \
00382         ma##T20_re * mb##00_re - ma##T20_im * mb##00_im +               \
00383         ma##T21_re * mb##10_re - ma##T21_im * mb##10_im +               \
00384         ma##T22_re * mb##20_re - ma##T22_im * mb##20_im;                \
00385     mc##20_im =                                                         \
00386         ma##T20_re * mb##00_im + ma##T20_im * mb##00_re +               \
00387         ma##T21_re * mb##10_im + ma##T21_im * mb##10_re +               \
00388         ma##T22_re * mb##20_im + ma##T22_im * mb##20_re;                \
00389     mc##01_re =                                                         \
00390         ma##T00_re * mb##01_re - ma##T00_im * mb##01_im +               \
00391         ma##T01_re * mb##11_re - ma##T01_im * mb##11_im +               \
00392         ma##T02_re * mb##21_re - ma##T02_im * mb##21_im;                \
00393     mc##01_im =                                                         \
00394         ma##T00_re * mb##01_im + ma##T00_im * mb##01_re +               \
00395         ma##T01_re * mb##11_im + ma##T01_im * mb##11_re +               \
00396         ma##T02_re * mb##21_im + ma##T02_im * mb##21_re;                \
00397     mc##11_re =                                                         \
00398         ma##T10_re * mb##01_re - ma##T10_im * mb##01_im +               \
00399         ma##T11_re * mb##11_re - ma##T11_im * mb##11_im +               \
00400         ma##T12_re * mb##21_re - ma##T12_im * mb##21_im;                \
00401     mc##11_im =                                                         \
00402         ma##T10_re * mb##01_im + ma##T10_im * mb##01_re +               \
00403         ma##T11_re * mb##11_im + ma##T11_im * mb##11_re +               \
00404         ma##T12_re * mb##21_im + ma##T12_im * mb##21_re;                \
00405     mc##21_re =                                                         \
00406         ma##T20_re * mb##01_re - ma##T20_im * mb##01_im +               \
00407         ma##T21_re * mb##11_re - ma##T21_im * mb##11_im +               \
00408         ma##T22_re * mb##21_re - ma##T22_im * mb##21_im;                \
00409     mc##21_im =                                                         \
00410         ma##T20_re * mb##01_im + ma##T20_im * mb##01_re +               \
00411         ma##T21_re * mb##11_im + ma##T21_im * mb##11_re +               \
00412         ma##T22_re * mb##21_im + ma##T22_im * mb##21_re;                \
00413     mc##02_re =                                                         \
00414         ma##T00_re * mb##02_re - ma##T00_im * mb##02_im +               \
00415         ma##T01_re * mb##12_re - ma##T01_im * mb##12_im +               \
00416         ma##T02_re * mb##22_re - ma##T02_im * mb##22_im;                \
00417     mc##02_im =                                                         \
00418         ma##T00_re * mb##02_im + ma##T00_im * mb##02_re +               \
00419         ma##T01_re * mb##12_im + ma##T01_im * mb##12_re +               \
00420         ma##T02_re * mb##22_im + ma##T02_im * mb##22_re;                \
00421     mc##12_re =                                                         \
00422         ma##T10_re * mb##02_re - ma##T10_im * mb##02_im +               \
00423         ma##T11_re * mb##12_re - ma##T11_im * mb##12_im +               \
00424         ma##T12_re * mb##22_re - ma##T12_im * mb##22_im;                \
00425     mc##12_im =                                                         \
00426         ma##T10_re * mb##02_im + ma##T10_im * mb##02_re +               \
00427         ma##T11_re * mb##12_im + ma##T11_im * mb##12_re +               \
00428         ma##T12_re * mb##22_im + ma##T12_im * mb##22_re;                \
00429     mc##22_re =                                                         \
00430         ma##T20_re * mb##02_re - ma##T20_im * mb##02_im +               \
00431         ma##T21_re * mb##12_re - ma##T21_im * mb##12_im +               \
00432         ma##T22_re * mb##22_re - ma##T22_im * mb##22_im;                \
00433     mc##22_im =                                                         \
00434         ma##T20_re * mb##02_im + ma##T20_im * mb##02_re +               \
00435         ma##T21_re * mb##12_im + ma##T21_im * mb##12_re +               \
00436         ma##T22_re * mb##22_im + ma##T22_im * mb##22_re;
00437 
/*
 * SET_SU3_MATRIX(a, value): assign `value` to all 18 real components
 * (<a><r><c>_re and <a><r><c>_im for r, c in 0..2) of the matrix with
 * identifier prefix `a`.
 *
 * Notes:
 *  - `value` is evaluated once per component (18 times); avoid arguments
 *    with side effects.
 *  - The expansion is a plain sequence of statements with no enclosing
 *    do { } while (0), so do not use it as the unbraced body of an if/else
 *    or a loop.
 *  - The final line deliberately has no line continuation, so the
 *    definition cannot absorb whatever line follows it.
 */
#define SET_SU3_MATRIX(a, value)                \
    a##00_re = (value);                         \
    a##00_im = (value);                         \
    a##01_re = (value);                         \
    a##01_im = (value);                         \
    a##02_re = (value);                         \
    a##02_im = (value);                         \
    a##10_re = (value);                         \
    a##10_im = (value);                         \
    a##11_re = (value);                         \
    a##11_im = (value);                         \
    a##12_re = (value);                         \
    a##12_im = (value);                         \
    a##20_re = (value);                         \
    a##20_im = (value);                         \
    a##21_re = (value);                         \
    a##21_im = (value);                         \
    a##22_re = (value);                         \
    a##22_im = (value);
00457 
/*
 * SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc): component-wise mc = ma + mb * s
 * over all 18 real components (<prefix><r><c>_re / _im) of 3x3 complex
 * matrices named by the identifier prefixes ma, mb and mc.
 *
 * Notes:
 *  - `s` is parenthesised, so a compound scalar such as `a + b` scales mb
 *    as a whole; it is evaluated once per component (18 times).
 *  - Each component only depends on the same component of ma and mb, so mc
 *    may name the same matrix as ma or mb.
 *  - The expansion is a plain sequence of statements with no enclosing
 *    do { } while (0), so do not use it as the unbraced body of an if/else
 *    or a loop.
 */
#define SCALAR_MULT_ADD_SU3_MATRIX(ma, mb, s, mc)       \
    mc##00_re = ma##00_re + mb##00_re * (s);            \
    mc##00_im = ma##00_im + mb##00_im * (s);            \
    mc##01_re = ma##01_re + mb##01_re * (s);            \
    mc##01_im = ma##01_im + mb##01_im * (s);            \
    mc##02_re = ma##02_re + mb##02_re * (s);            \
    mc##02_im = ma##02_im + mb##02_im * (s);            \
    mc##10_re = ma##10_re + mb##10_re * (s);            \
    mc##10_im = ma##10_im + mb##10_im * (s);            \
    mc##11_re = ma##11_re + mb##11_re * (s);            \
    mc##11_im = ma##11_im + mb##11_im * (s);            \
    mc##12_re = ma##12_re + mb##12_re * (s);            \
    mc##12_im = ma##12_im + mb##12_im * (s);            \
    mc##20_re = ma##20_re + mb##20_re * (s);            \
    mc##20_im = ma##20_im + mb##20_im * (s);            \
    mc##21_re = ma##21_re + mb##21_re * (s);            \
    mc##21_im = ma##21_im + mb##21_im * (s);            \
    mc##22_re = ma##22_re + mb##22_re * (s);            \
    mc##22_im = ma##22_im + mb##22_im * (s);
00477 
/*
 * SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc): component-wise mc = ma - mb * s
 * over all 18 real components (<prefix><r><c>_re / _im) of 3x3 complex
 * matrices named by the identifier prefixes ma, mb and mc.
 *
 * Notes:
 *  - `s` is parenthesised, so a compound scalar such as `a + b` scales mb
 *    as a whole; it is evaluated once per component (18 times).
 *  - Each component only depends on the same component of ma and mb, so mc
 *    may name the same matrix as ma or mb.
 *  - The expansion is a plain sequence of statements with no enclosing
 *    do { } while (0), so do not use it as the unbraced body of an if/else
 *    or a loop.
 */
#define SCALAR_MULT_SUB_SU3_MATRIX(ma, mb, s, mc)       \
    mc##00_re = ma##00_re - mb##00_re * (s);            \
    mc##00_im = ma##00_im - mb##00_im * (s);            \
    mc##01_re = ma##01_re - mb##01_re * (s);            \
    mc##01_im = ma##01_im - mb##01_im * (s);            \
    mc##02_re = ma##02_re - mb##02_re * (s);            \
    mc##02_im = ma##02_im - mb##02_im * (s);            \
    mc##10_re = ma##10_re - mb##10_re * (s);            \
    mc##10_im = ma##10_im - mb##10_im * (s);            \
    mc##11_re = ma##11_re - mb##11_re * (s);            \
    mc##11_im = ma##11_im - mb##11_im * (s);            \
    mc##12_re = ma##12_re - mb##12_re * (s);            \
    mc##12_im = ma##12_im - mb##12_im * (s);            \
    mc##20_re = ma##20_re - mb##20_re * (s);            \
    mc##20_im = ma##20_im - mb##20_im * (s);            \
    mc##21_re = ma##21_re - mb##21_re * (s);            \
    mc##21_im = ma##21_im - mb##21_im * (s);            \
    mc##22_re = ma##22_re - mb##22_re * (s);            \
    mc##22_im = ma##22_im - mb##22_im * (s);
00497 
00498 
/*
 * Component map for the compressed matrix "ah": ten scalar slots spread
 * over the members x and y of AH0 .. AH4.  AH0 .. AH4 are not defined here
 * and must be provided by the code that uses these names.
 */
/* off-diagonal entries (0,1), (0,2), (1,2): real and imaginary parts */
#define ah01_re  AH0.x
#define ah01_im  AH0.y
#define ah02_re  AH1.x
#define ah02_im  AH1.y
#define ah12_re  AH2.x
#define ah12_im  AH2.y
/* imaginary parts of the three diagonal entries */
#define ah00_im  AH3.x
#define ah11_im  AH3.y
#define ah22_im  AH4.x
/* remaining slot of AH4; carries no matrix entry */
#define ahspace  AH4.y
00509 
/*
 * UNCOMPRESS_ANTI_HERMITIAN(ah, m): expand the compressed matrix with
 * identifier prefix `ah` into the full 3x3 complex matrix with prefix `m`.
 * Diagonal entries get a zero real part and the stored imaginary part; each
 * stored entry (r,c) with r < c is copied as is, and its mirror (c,r) gets
 * the negated real part and the same imaginary part.
 *
 * The expansion is a plain sequence of statements with no enclosing
 * do { } while (0), so do not use it as the unbraced body of an if/else or
 * a loop.
 */
#define UNCOMPRESS_ANTI_HERMITIAN(ah, m)                                \
    /* diagonal */                                                      \
    m##00_re = 0;               m##00_im = ah##00_im;                   \
    m##11_re = 0;               m##11_im = ah##11_im;                   \
    m##22_re = 0;               m##22_im = ah##22_im;                   \
    /* (0,1) and its mirror (1,0) */                                    \
    m##01_re = ah##01_re;       m##01_im = ah##01_im;                   \
    m##10_re = -ah##01_re;      m##10_im = ah##01_im;                   \
    /* (0,2) and its mirror (2,0) */                                    \
    m##02_re = ah##02_re;       m##02_im = ah##02_im;                   \
    m##20_re = -ah##02_re;      m##20_im = ah##02_im;                   \
    /* (1,2) and its mirror (2,1) */                                    \
    m##12_re = ah##12_re;       m##12_im = ah##12_im;                   \
    m##21_re = -ah##12_re;      m##21_im = ah##12_im;
00529 
00530 
/*
 * MAKE_ANTI_HERMITIAN(m, ah): project the 3x3 complex matrix with
 * identifier prefix `m` onto compressed traceless anti-Hermitian form and
 * store it under the prefix `ah`.
 *   - diagonal: imaginary part of m minus one third of the sum of the three
 *     diagonal imaginary parts (real diagonal parts of m are not used);
 *   - entries (0,1), (0,2), (1,2): half of (m(r,c) - conj(m(c,r)));
 *   - the spare slot <ah>space is cleared to 0.
 *
 * The temporary takes its type from <ah>space via __typeof__, the spelling
 * of the GNU typeof extension that is also accepted in strict ISO modes
 * (plain `typeof` is not a keyword there before C23).  Its name is chosen
 * to be unlikely to collide with anything the matrix names expand to.
 * The 0.333... and 0.5 factors are double literals, so the arithmetic is
 * carried out in double before being converted on assignment.
 */
#define MAKE_ANTI_HERMITIAN(m, ah) do {                                 \
        __typeof__(ah##space) make_ah_tr3_ =                            \
            (m##00_im + m##11_im + m##22_im)*0.33333333333333333;       \
        ah##00_im = (m##00_im - make_ah_tr3_);                          \
        ah##11_im = (m##11_im - make_ah_tr3_);                          \
        ah##22_im = (m##22_im - make_ah_tr3_);                          \
        ah##01_re = (m##01_re - m##10_re)*0.5;                          \
        ah##02_re = (m##02_re - m##20_re)*0.5;                          \
        ah##12_re = (m##12_re - m##21_re)*0.5;                          \
        ah##01_im = (m##01_im + m##10_im)*0.5;                          \
        ah##02_im = (m##02_im + m##20_im)*0.5;                          \
        ah##12_im = (m##12_im + m##21_im)*0.5;                          \
        ah##space = 0;                                                  \
    } while (0)
00545 
00546 
// LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride)
//
// Reads five elements of src into var##0 .. var##4.
//   src    - indexable source (array or pointer).
//   dir    - direction index; each direction occupies 5*stride elements.
//   idx    - offset within the direction's slab.
//   var    - identifier prefix of the five destination variables.
//   stride - distance, in elements, between consecutive loaded elements.
//
// Element k is read from src[idx + dir*stride*5 + k*stride].
//
// All expression arguments are parenthesized so that arguments such as
// `d + 1` or `n >> 1` are treated as a unit. The local has a macro-specific
// name so an argument called start_pos cannot refer to it by accident.
#define LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, stride) do{      \
    int lahd_base_ = (idx) + (dir)*(stride)*5;                          \
    var##0 = (src)[lahd_base_];                                         \
    var##1 = (src)[lahd_base_ + (stride)];                              \
    var##2 = (src)[lahd_base_ + (stride)*2];                            \
    var##3 = (src)[lahd_base_ + (stride)*3];                            \
    var##4 = (src)[lahd_base_ + (stride)*4];                            \
  }while(0)
00555 
// LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var)
//
// Texture-fetch counterpart of LOAD_ANTI_HERMITIAN_DIRECT: fills
// var##0 .. var##4 with tex1Dfetch(src, idx + dir*Vh*5 + k*Vh) for k = 0..4.
// Vh is taken from the enclosing scope and plays the role of the stride.
//
// dir and idx are parenthesized so that expression arguments are treated as
// a unit. The local has a macro-specific name to avoid capturing a caller
// identifier.
#define LOAD_ANTI_HERMITIAN_SINGLE_TEX(src, dir, idx, var) do{          \
    int lahst_base_ = (idx) + (dir)*Vh*5;                               \
    var##0 = tex1Dfetch(src, lahst_base_);                              \
    var##1 = tex1Dfetch(src, lahst_base_ + Vh);                         \
    var##2 = tex1Dfetch(src, lahst_base_ + Vh*2);                       \
    var##3 = tex1Dfetch(src, lahst_base_ + Vh*3);                       \
    var##4 = tex1Dfetch(src, lahst_base_ + Vh*4);                       \
  }while(0)
00564 
// WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride)
//
// Stores var##0 .. var##4 into mem, using the same addressing as
// LOAD_ANTI_HERMITIAN_DIRECT: element k goes to
// mem[idx + dir*stride*5 + k*stride].
//   mem    - indexable destination (array or pointer).
//   dir    - direction index; each direction occupies 5*stride elements.
//   idx    - offset within the direction's slab.
//   var    - identifier prefix of the five source variables.
//   stride - distance, in elements, between consecutive stored elements.
//
// All expression arguments are parenthesized so that arguments such as
// `d + 1` are treated as a unit.
#define WRITE_ANTI_HERMITIAN(mem, dir, idx, var, stride) do{            \
    int wah_base_ = (idx) + (dir)*(stride)*5;                           \
    (mem)[wah_base_] = var##0;                                          \
    (mem)[wah_base_ + (stride)] = var##1;                               \
    (mem)[wah_base_ + (stride)*2] = var##2;                             \
    (mem)[wah_base_ + (stride)*3] = var##3;                             \
    (mem)[wah_base_ + (stride)*4] = var##4;                             \
  }while(0)
00573 
// COPY_SU3_MATRIX(a, b)
//
// Copies all 18 real components of the 3x3 complex matrix with identifier
// prefix a into the matrix with prefix b (note the order: source first,
// destination second).
//
// The body is one do/while(0) statement so that every assignment stays under
// the control of an unbraced if/for. The trailing semicolon is deliberately
// kept so that call sites written without their own semicolon still compile.
#define COPY_SU3_MATRIX(a, b) do {      \
        b##00_re = a##00_re;            \
        b##00_im = a##00_im;            \
        b##01_re = a##01_re;            \
        b##01_im = a##01_im;            \
        b##02_re = a##02_re;            \
        b##02_im = a##02_im;            \
        b##10_re = a##10_re;            \
        b##10_im = a##10_im;            \
        b##11_re = a##11_re;            \
        b##11_im = a##11_im;            \
        b##12_re = a##12_re;            \
        b##12_im = a##12_im;            \
        b##20_re = a##20_re;            \
        b##20_im = a##20_im;            \
        b##21_re = a##21_re;            \
        b##21_im = a##21_im;            \
        b##22_re = a##22_re;            \
        b##22_im = a##22_im;            \
    } while (0);
00593 
// SU3_ADJOINT(a, b)
//
// Writes the conjugate transpose of the 3x3 complex matrix with identifier
// prefix a into the matrix with prefix b: b[r][c] = conj(a[c][r]).
//
// a and b must name different storage. Entries of b are written before the
// mirrored entries of a are read (b##01 is stored before a##01 is consumed
// for b##10), so an in-place call would use already-overwritten values.
//
// The body is one do/while(0) statement so that every assignment stays under
// the control of an unbraced if/for. The trailing semicolon is deliberately
// kept so that call sites written without their own semicolon still compile.
#define SU3_ADJOINT(a, b) do {          \
        b##00_re = a##00_re;            \
        b##00_im = - a##00_im;          \
        b##01_re = a##10_re;            \
        b##01_im = - a##10_im;          \
        b##02_re = a##20_re;            \
        b##02_im = - a##20_im;          \
        b##10_re = a##01_re;            \
        b##10_im = - a##01_im;          \
        b##11_re = a##11_re;            \
        b##11_im = - a##11_im;          \
        b##12_re = a##21_re;            \
        b##12_im = - a##21_im;          \
        b##20_re = a##02_re;            \
        b##20_im = - a##02_im;          \
        b##21_re = a##12_re;            \
        b##21_im = - a##12_im;          \
        b##22_re = a##22_re;            \
        b##22_im = - a##22_im;          \
    } while (0);
00613 
// SET_UNIT_SU3_MATRIX(a)
//
// Sets the 3x3 complex matrix with identifier prefix a to the identity:
// the real parts of the diagonal become 1.0, every other component 0.
//
// The body is one do/while(0) statement so that every assignment stays under
// the control of an unbraced if/for. The trailing semicolon is deliberately
// kept so that call sites written without their own semicolon still compile.
#define SET_UNIT_SU3_MATRIX(a) do {             \
        a##00_re = 1.0;                         \
        a##00_im = 0;                           \
        a##01_re = 0;                           \
        a##01_im = 0;                           \
        a##02_re = 0;                           \
        a##02_im = 0;                           \
        a##10_re = 0;                           \
        a##10_im = 0;                           \
        a##11_re = 1.0;                         \
        a##11_im = 0;                           \
        a##12_re = 0;                           \
        a##12_im = 0;                           \
        a##20_re = 0;                           \
        a##20_im = 0;                           \
        a##21_re = 0;                           \
        a##21_im = 0;                           \
        a##22_re = 1.0;                         \
        a##22_im = 0;                           \
    } while (0);
00633 
// ACC_CONJ_PROD_ASSIGN(a, b, c): a = conj(b) * conj(c)
//
// Overwrites a with the product of the complex conjugates of b and c; the
// previous value of a is not used (this is an assignment, not an
// accumulation).
//   a, b, c - identifier prefixes; the macro appends _re / _im. b and c may
//             carry a leading unary sign (for example -x), which then
//             applies to both of that operand's components.
//
// Wrapped in do/while(0) so the four statements behave as one statement
// under an unbraced if/for; the caller supplies the terminating semicolon.
#define ACC_CONJ_PROD_ASSIGN(a, b, c) do {      \
        a##_re = b##_re * c##_re;               \
        a##_re -= b##_im * c##_im;              \
        a##_im = - b##_re * c##_im;             \
        a##_im -= b##_im * c##_re;              \
    } while (0)
00640 
00641 
// RECONSTRUCT_LINK_12(sign, var)
//
// Fills the third row (var##20, var##21, var##22) of a 3x3 complex matrix
// from its first two rows, then multiplies all six components of that row
// by sign.
//
// Each entry is built by ACC_CONJ_PROD_ASSIGN followed by ACC_CONJ_PROD.
// ACC_CONJ_PROD is defined outside this section; presumably it is the
// accumulating counterpart (a += conj(b)*conj(c)), which would make the row
// the complex conjugate of the cross product of rows 0 and 1 - confirm
// against its definition.
//
// sign is evaluated six times, so it must be free of side effects.
//
// The body is one do/while(0) statement so that every statement stays under
// the control of an unbraced if/for. The trailing semicolon is deliberately
// kept so that call sites written without their own semicolon still compile.
#define RECONSTRUCT_LINK_12(sign, var) do {                             \
        ACC_CONJ_PROD_ASSIGN(var##20, +var##01, +var##12);              \
        ACC_CONJ_PROD(var##20, -var##02, +var##11);                     \
        ACC_CONJ_PROD_ASSIGN(var##21, +var##02, +var##10);              \
        ACC_CONJ_PROD(var##21, -var##00, +var##12);                     \
        ACC_CONJ_PROD_ASSIGN(var##22, +var##00, +var##11);              \
        ACC_CONJ_PROD(var##22, -var##01, +var##10);                     \
        var##20_re *= (sign);                                           \
        var##20_im *= (sign);                                           \
        var##21_re *= (sign);                                           \
        var##21_im *= (sign);                                           \
        var##22_re *= (sign);                                           \
        var##22_im *= (sign);                                           \
    } while (0);
00651 
// COMPUTE_NEW_IDX_PLUS(mydir, idx)
//
// Stores into new_mem_idx (a variable in the caller's scope) the index
// reached by one forward step from idx along direction mydir (0..3),
// shifted right by one bit.
//
// For direction k the macro compares the caller-scope variable x1..x4 with
// X1m1..X4m1: on equality it steps back by the wrap offset (X1m1, X2X1mX1,
// X3X2X1mX2X1, X4X3X2X1mX3X2X1), otherwise it advances by 1, X1, X2X1 or
// X3X2X1. All of these names are read from the enclosing scope.
//
// Any other value of mydir leaves new_mem_idx unchanged.
//
// idx is parenthesized so that an expression argument such as `n << 1` is
// treated as a unit.
#define COMPUTE_NEW_IDX_PLUS(mydir, idx) do {                           \
        switch(mydir){                                                  \
        case 0:                                                         \
            new_mem_idx = ( (x1==X1m1)?(idx)-X1m1:(idx)+1)>> 1;         \
            break;                                                      \
        case 1:                                                         \
            new_mem_idx = ( (x2==X2m1)?(idx)-X2X1mX1:(idx)+X1) >> 1;    \
            break;                                                      \
        case 2:                                                         \
            new_mem_idx = ( (x3==X3m1)?(idx)-X3X2X1mX2X1:(idx)+X2X1) >> 1; \
            break;                                                      \
        case 3:                                                         \
            new_mem_idx = ( (x4==X4m1)?(idx)-X4X3X2X1mX3X2X1:(idx)+X3X2X1) >> 1; \
            break;                                                      \
        default:                                                        \
            break;                                                      \
        }                                                               \
    }while(0)
00668 
// COMPUTE_NEW_IDX_MINUS(mydir, idx)
//
// Stores into new_mem_idx (a variable in the caller's scope) the index
// reached by one backward step from idx along direction mydir (0..3).
//
// For direction k the macro tests whether the caller-scope variable x1..x4
// is 0: if so it adds the wrap offset (X1m1, X2X1mX1, X3X2X1mX2X1,
// X4X3X2X1mX3X2X1), otherwise it subtracts 1, X1, X2X1 or X3X2X1. All of
// these names are read from the enclosing scope.
//
// Both branches are now relative to the idx argument. Previously the
// non-wrapping branch read a caller-scope variable named X instead, which
// ignored idx whenever the two differed.
//
// NOTE(review): unlike COMPUTE_NEW_IDX_PLUS, the result is not shifted right
// by one bit, which makes this macro identical to COMPUTE_NEW_FULL_IDX_MINUS.
// That looks unintended but is left as it was - confirm against callers.
//
// Any other value of mydir leaves new_mem_idx unchanged.
#define COMPUTE_NEW_IDX_MINUS(mydir, idx) do {                          \
        switch(mydir){                                                  \
        case 0:                                                         \
            new_mem_idx = ( (x1==0)?(idx)+X1m1:(idx)-1);                \
            break;                                                      \
        case 1:                                                         \
            new_mem_idx = ( (x2==0)?(idx)+X2X1mX1:(idx)-X1);            \
            break;                                                      \
        case 2:                                                         \
            new_mem_idx = ( (x3==0)?(idx)+X3X2X1mX2X1:(idx)-X2X1);      \
            break;                                                      \
        case 3:                                                         \
            new_mem_idx = ( (x4==0)?(idx)+X4X3X2X1mX3X2X1:(idx)-X3X2X1); \
            break;                                                      \
        default:                                                        \
            break;                                                      \
        }                                                               \
    }while(0)
00685 
00686 
// COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx)
//
// Stores into new_mem_idx (a variable in the caller's scope) the index
// reached by one forward step from idx along direction mydir (0..3). Same
// arithmetic as COMPUTE_NEW_IDX_PLUS but without the final right shift.
//
// For direction k the macro compares the caller-scope variable x1..x4 with
// X1m1..X4m1: on equality it steps back by the wrap offset (X1m1, X2X1mX1,
// X3X2X1mX2X1, X4X3X2X1mX3X2X1), otherwise it advances by 1, X1, X2X1 or
// X3X2X1. All of these names are read from the enclosing scope.
//
// Any other value of mydir leaves new_mem_idx unchanged.
//
// idx is parenthesized so that an expression argument such as `n << 1` is
// treated as a unit.
#define COMPUTE_NEW_FULL_IDX_PLUS(mydir, idx) do {                      \
        switch(mydir){                                                  \
        case 0:                                                         \
            new_mem_idx = ( (x1==X1m1)?(idx)-X1m1:(idx)+1);             \
            break;                                                      \
        case 1:                                                         \
            new_mem_idx = ( (x2==X2m1)?(idx)-X2X1mX1:(idx)+X1);         \
            break;                                                      \
        case 2:                                                         \
            new_mem_idx = ( (x3==X3m1)?(idx)-X3X2X1mX2X1:(idx)+X2X1);   \
            break;                                                      \
        case 3:                                                         \
            new_mem_idx = ( (x4==X4m1)?(idx)-X4X3X2X1mX3X2X1:(idx)+X3X2X1); \
            break;                                                      \
        default:                                                        \
            break;                                                      \
        }                                                               \
    }while(0)
00703     
// COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx)
//
// Stores into new_mem_idx (a variable in the caller's scope) the index
// reached by one backward step from idx along direction mydir (0..3).
//
// For direction k the macro tests whether the caller-scope variable x1..x4
// is 0: if so it adds the wrap offset (X1m1, X2X1mX1, X3X2X1mX2X1,
// X4X3X2X1mX3X2X1), otherwise it subtracts 1, X1, X2X1 or X3X2X1. All of
// these names are read from the enclosing scope.
//
// Both branches are now relative to the idx argument. Previously the
// non-wrapping branch read a caller-scope variable named X instead, which
// ignored idx whenever the two differed.
//
// Any other value of mydir leaves new_mem_idx unchanged.
#define COMPUTE_NEW_FULL_IDX_MINUS(mydir, idx) do {                     \
        switch(mydir){                                                  \
        case 0:                                                         \
            new_mem_idx = ( (x1==0)?(idx)+X1m1:(idx)-1);                \
            break;                                                      \
        case 1:                                                         \
            new_mem_idx = ( (x2==0)?(idx)+X2X1mX1:(idx)-X1);            \
            break;                                                      \
        case 2:                                                         \
            new_mem_idx = ( (x3==0)?(idx)+X3X2X1mX2X1:(idx)-X2X1);      \
            break;                                                      \
        case 3:                                                         \
            new_mem_idx = ( (x4==0)?(idx)+X4X3X2X1mX3X2X1:(idx)-X3X2X1); \
            break;                                                      \
        default:                                                        \
            break;                                                      \
        }                                                               \
    }while(0)
00720 
00721 
00722 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines