QUDA v0.4.0
A library for QCD on GPUs
quda/lib/gauge_force_core.h
Go to the documentation of this file.
00001 
00002 #define xcomm kparam.ghostDim[0]
00003 #define ycomm kparam.ghostDim[1]
00004 #define zcomm kparam.ghostDim[2]
00005 #define tcomm kparam.ghostDim[3]
00006 
00007 #if (N_IN_FLOATN == 4)
00008 #define linka00_re LINKA0.x
00009 #define linka00_im LINKA0.y
00010 #define linka01_re LINKA0.z
00011 #define linka01_im LINKA0.w
00012 #define linka02_re LINKA1.x
00013 #define linka02_im LINKA1.y
00014 #define linka10_re LINKA1.z
00015 #define linka10_im LINKA1.w
00016 #define linka11_re LINKA2.x
00017 #define linka11_im LINKA2.y
00018 #define linka12_re LINKA2.z
00019 #define linka12_im LINKA2.w
00020 #define linka20_re LINKA3.x
00021 #define linka20_im LINKA3.y
00022 #define linka21_re LINKA3.z
00023 #define linka21_im LINKA3.w
00024 #define linka22_re LINKA4.x
00025 #define linka22_im LINKA4.y
00026 
00027 
00028 #define linkb00_re LINKB0.x
00029 #define linkb00_im LINKB0.y
00030 #define linkb01_re LINKB0.z
00031 #define linkb01_im LINKB0.w
00032 #define linkb02_re LINKB1.x
00033 #define linkb02_im LINKB1.y
00034 #define linkb10_re LINKB1.z
00035 #define linkb10_im LINKB1.w
00036 #define linkb11_re LINKB2.x
00037 #define linkb11_im LINKB2.y
00038 #define linkb12_re LINKB2.z
00039 #define linkb12_im LINKB2.w
00040 #define linkb20_re LINKB3.x
00041 #define linkb20_im LINKB3.y
00042 #define linkb21_re LINKB3.z
00043 #define linkb21_im LINKB3.w
00044 #define linkb22_re LINKB4.x
00045 #define linkb22_im LINKB4.y
00046 
00047 #else
00048 #define linka00_re LINKA0.x
00049 #define linka00_im LINKA0.y
00050 #define linka01_re LINKA1.x
00051 #define linka01_im LINKA1.y
00052 #define linka02_re LINKA2.x
00053 #define linka02_im LINKA2.y
00054 #define linka10_re LINKA3.x
00055 #define linka10_im LINKA3.y
00056 #define linka11_re LINKA4.x
00057 #define linka11_im LINKA4.y
00058 #define linka12_re LINKA5.x
00059 #define linka12_im LINKA5.y
00060 #define linka20_re LINKA6.x
00061 #define linka20_im LINKA6.y
00062 #define linka21_re LINKA7.x
00063 #define linka21_im LINKA7.y
00064 #define linka22_re LINKA8.x
00065 #define linka22_im LINKA8.y
00066 
00067 #define linkb00_re LINKB0.x
00068 #define linkb00_im LINKB0.y
00069 #define linkb01_re LINKB1.x
00070 #define linkb01_im LINKB1.y
00071 #define linkb02_re LINKB2.x
00072 #define linkb02_im LINKB2.y
00073 #define linkb10_re LINKB3.x
00074 #define linkb10_im LINKB3.y
00075 #define linkb11_re LINKB4.x
00076 #define linkb11_im LINKB4.y
00077 #define linkb12_re LINKB5.x
00078 #define linkb12_im LINKB5.y
00079 #define linkb20_re LINKB6.x
00080 #define linkb20_im LINKB6.y
00081 #define linkb21_re LINKB7.x
00082 #define linkb21_im LINKB7.y
00083 #define linkb22_re LINKB8.x
00084 #define linkb22_im LINKB8.y
00085 
00086 #endif
00087 
00088 
00089 #ifdef MULTI_GPU
00090 
00091 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do {               \
00092     switch(mydir){                                                      \
00093     case 0:                                                             \
00094       new_mem_idx = ((!xcomm) && (new_x1 == (X1+1)))?(idx - X1m1): idx+1; \
00095       new_x1 = ((!xcomm)&& (new_x1 == (X1+1)))? (new_x1 - X1m1):(new_x1+1); \
00096       break;                                                            \
00097     case 1:                                                             \
00098       new_mem_idx = ((!ycomm) && (new_x2 == (X2+1)))?(idx - X2m1*E1): idx+E1; \
00099       new_x2 = ((!ycomm)&& (new_x2 == (X2+1)))? (new_x2 - X2m1):(new_x2+1); \
00100       break;                                                            \
00101     case 2:                                                             \
00102       new_mem_idx = ((!zcomm) && (new_x3 == (X3+1)))?(idx - X3m1*E2E1): idx+E2E1; \
00103       new_x3 = ((!zcomm)&& (new_x3 == (X3+1)))? (new_x3 - X3m1):(new_x3+1); \
00104       break;                                                            \
00105     case 3:                                                             \
00106       new_mem_idx = ((!tcomm) && (new_x4 == (X4+1)))?(idx - X4m1*E3E2E1): idx+E3E2E1; \
00107       new_x4 = ((!tcomm)&& (new_x4 == (X4+1)))? (new_x4 - X4m1):(new_x4+1); \
00108       break;                                                            \
00109     }                                                                   \
00110   }while(0)
00111 
00112 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do {              \
00113     switch(mydir){                                                      \
00114     case 0:                                                             \
00115       new_mem_idx = ((!xcomm) && new_x1 == 2)?(idx+X1m1):(idx-1);       \
00116       new_x1 = ((!xcomm) && new_x1 == 2)? (new_x1+X1m1): (new_x1-1);    \
00117       break;                                                            \
00118     case 1:                                                             \
00119       new_mem_idx = ((!ycomm) && new_x2 == 2)?(idx+X2m1*E1):(idx-E1);   \
00120       new_x2 = ((!ycomm) && new_x2 == 2)? (new_x2+X2m1): (new_x2-1);    \
00121       break;                                                            \
00122     case 2:                                                             \
00123       new_mem_idx = ((!zcomm) && new_x3 == 2)?(idx+X3m1*E2E1):(idx-E2E1); \
00124       new_x3 = ((!zcomm) && new_x3 == 2)? (new_x3+X3m1): (new_x3-1);    \
00125       break;                                                            \
00126     case 3:                                                             \
00127       new_mem_idx = ((!tcomm) && new_x4 == 2)?(idx+X4m1*E3E2E1):(idx-E3E2E1); \
00128       new_x4 = ((!tcomm) && new_x4 == 2)? (new_x4+X4m1): (new_x4-1);    \
00129       break;                                                            \
00130     }                                                                   \
00131   }while(0)
00132 
00133 #else
00134 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do {               \
00135         switch(mydir){                                                  \
00136         case 0:                                                         \
00137             new_mem_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1);             \
00138             new_x1 = (new_x1==X1m1)?0:new_x1+1;                         \
00139             break;                                                      \
00140         case 1:                                                         \
00141             new_mem_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1);         \
00142             new_x2 = (new_x2==X2m1)?0:new_x2+1;                         \
00143             break;                                                      \
00144         case 2:                                                         \
00145             new_mem_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1);   \
00146             new_x3 = (new_x3==X3m1)?0:new_x3+1;                         \
00147             break;                                                      \
00148         case 3:                                                         \
00149             new_mem_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
00150             new_x4 = (new_x4==X4m1)?0:new_x4+1;                         \
00151             break;                                                      \
00152         }                                                               \
00153     }while(0)
00154 
00155 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do {              \
00156         switch(mydir){                                                  \
00157         case 0:                                                         \
00158             new_mem_idx = ( (new_x1==0)?idx+X1m1:idx-1);                \
00159             new_x1 = (new_x1==0)?X1m1:new_x1 - 1;                       \
00160             break;                                                      \
00161         case 1:                                                         \
00162             new_mem_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1);            \
00163             new_x2 = (new_x2==0)?X2m1:new_x2 - 1;                       \
00164             break;                                                      \
00165         case 2:                                                         \
00166             new_mem_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1);      \
00167             new_x3 = (new_x3==0)?X3m1:new_x3 - 1;                       \
00168             break;                                                      \
00169         case 3:                                                         \
00170             new_mem_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
00171             new_x4 = (new_x4==0)?X4m1:new_x4 - 1;                       \
00172             break;                                                      \
00173         }                                                               \
00174     }while(0)
00175 
00176 #endif
00177 
00178 
00179 #define MULT_SU3_NN_TEST(ma, mb) do{                            \
00180     Float fa_re,fa_im, fb_re, fb_im, fc_re, fc_im;              \
00181     fa_re =                                                     \
00182       ma##00_re * mb##00_re - ma##00_im * mb##00_im +           \
00183             ma##01_re * mb##10_re - ma##01_im * mb##10_im +     \
00184             ma##02_re * mb##20_re - ma##02_im * mb##20_im;      \
00185         fa_im =                                                 \
00186             ma##00_re * mb##00_im + ma##00_im * mb##00_re +     \
00187             ma##01_re * mb##10_im + ma##01_im * mb##10_re +     \
00188             ma##02_re * mb##20_im + ma##02_im * mb##20_re;      \
00189         fb_re =                                                 \
00190             ma##00_re * mb##01_re - ma##00_im * mb##01_im +     \
00191             ma##01_re * mb##11_re - ma##01_im * mb##11_im +     \
00192             ma##02_re * mb##21_re - ma##02_im * mb##21_im;      \
00193         fb_im =                                                 \
00194             ma##00_re * mb##01_im + ma##00_im * mb##01_re +     \
00195             ma##01_re * mb##11_im + ma##01_im * mb##11_re +     \
00196             ma##02_re * mb##21_im + ma##02_im * mb##21_re;      \
00197         fc_re =                                                 \
00198             ma##00_re * mb##02_re - ma##00_im * mb##02_im +     \
00199             ma##01_re * mb##12_re - ma##01_im * mb##12_im +     \
00200             ma##02_re * mb##22_re - ma##02_im * mb##22_im;      \
00201         fc_im =                                                 \
00202             ma##00_re * mb##02_im + ma##00_im * mb##02_re +     \
00203             ma##01_re * mb##12_im + ma##01_im * mb##12_re +     \
00204             ma##02_re * mb##22_im + ma##02_im * mb##22_re;      \
00205         ma##00_re = fa_re;                                      \
00206         ma##00_im = fa_im;                                      \
00207         ma##01_re = fb_re;                                      \
00208         ma##01_im = fb_im;                                      \
00209         ma##02_re = fc_re;                                      \
00210         ma##02_im = fc_im;                                      \
00211         fa_re =                                                 \
00212             ma##10_re * mb##00_re - ma##10_im * mb##00_im +     \
00213             ma##11_re * mb##10_re - ma##11_im * mb##10_im +     \
00214             ma##12_re * mb##20_re - ma##12_im * mb##20_im;      \
00215         fa_im =                                                 \
00216             ma##10_re * mb##00_im + ma##10_im * mb##00_re +     \
00217             ma##11_re * mb##10_im + ma##11_im * mb##10_re +     \
00218             ma##12_re * mb##20_im + ma##12_im * mb##20_re;      \
00219         fb_re =                                                 \
00220             ma##10_re * mb##01_re - ma##10_im * mb##01_im +     \
00221             ma##11_re * mb##11_re - ma##11_im * mb##11_im +     \
00222             ma##12_re * mb##21_re - ma##12_im * mb##21_im;      \
00223         fb_im =                                                 \
00224             ma##10_re * mb##01_im + ma##10_im * mb##01_re +     \
00225             ma##11_re * mb##11_im + ma##11_im * mb##11_re +     \
00226             ma##12_re * mb##21_im + ma##12_im * mb##21_re;      \
00227         fc_re =                                                 \
00228             ma##10_re * mb##02_re - ma##10_im * mb##02_im +     \
00229             ma##11_re * mb##12_re - ma##11_im * mb##12_im +     \
00230             ma##12_re * mb##22_re - ma##12_im * mb##22_im;      \
00231         fc_im =                                                 \
00232             ma##10_re * mb##02_im + ma##10_im * mb##02_re +     \
00233             ma##11_re * mb##12_im + ma##11_im * mb##12_re +     \
00234             ma##12_re * mb##22_im + ma##12_im * mb##22_re;      \
00235         ma##10_re = fa_re;                                      \
00236         ma##10_im = fa_im;                                      \
00237         ma##11_re = fb_re;                                      \
00238         ma##11_im = fb_im;                                      \
00239         ma##12_re = fc_re;                                      \
00240         ma##12_im = fc_im;                                      \
00241         fa_re =                                                 \
00242             ma##20_re * mb##00_re - ma##20_im * mb##00_im +     \
00243             ma##21_re * mb##10_re - ma##21_im * mb##10_im +     \
00244             ma##22_re * mb##20_re - ma##22_im * mb##20_im;      \
00245         fa_im =                                                 \
00246             ma##20_re * mb##00_im + ma##20_im * mb##00_re +     \
00247             ma##21_re * mb##10_im + ma##21_im * mb##10_re +     \
00248             ma##22_re * mb##20_im + ma##22_im * mb##20_re;      \
00249         fb_re =                                                 \
00250             ma##20_re * mb##01_re - ma##20_im * mb##01_im +     \
00251             ma##21_re * mb##11_re - ma##21_im * mb##11_im +     \
00252             ma##22_re * mb##21_re - ma##22_im * mb##21_im;      \
00253         fb_im =                                                 \
00254             ma##20_re * mb##01_im + ma##20_im * mb##01_re +     \
00255             ma##21_re * mb##11_im + ma##21_im * mb##11_re +     \
00256             ma##22_re * mb##21_im + ma##22_im * mb##21_re;      \
00257         fc_re =                                                 \
00258             ma##20_re * mb##02_re - ma##20_im * mb##02_im +     \
00259             ma##21_re * mb##12_re - ma##21_im * mb##12_im +     \
00260             ma##22_re * mb##22_re - ma##22_im * mb##22_im;      \
00261         fc_im =                                                 \
00262             ma##20_re * mb##02_im + ma##20_im * mb##02_re +     \
00263             ma##21_re * mb##12_im + ma##21_im * mb##12_re +     \
00264             ma##22_re * mb##22_im + ma##22_im * mb##22_re;      \
00265         ma##20_re = fa_re;                                      \
00266         ma##20_im = fa_im;                                      \
00267         ma##21_re = fb_re;                                      \
00268         ma##21_im = fb_im;                                      \
00269         ma##22_re = fc_re;                                      \
00270         ma##22_im = fc_im;                                      \
00271     }while(0)
00272 
00273 
00274 #define MULT_SU3_NA_TEST(ma, mb)        do{                             \
00275         Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im;                 \
00276         fa_re =                                                         \
00277             ma##00_re * mb##T00_re - ma##00_im * mb##T00_im +           \
00278             ma##01_re * mb##T10_re - ma##01_im * mb##T10_im +           \
00279             ma##02_re * mb##T20_re - ma##02_im * mb##T20_im;            \
00280         fa_im =                                                         \
00281             ma##00_re * mb##T00_im + ma##00_im * mb##T00_re +           \
00282             ma##01_re * mb##T10_im + ma##01_im * mb##T10_re +           \
00283             ma##02_re * mb##T20_im + ma##02_im * mb##T20_re;            \
00284         fb_re =                                                         \
00285             ma##00_re * mb##T01_re - ma##00_im * mb##T01_im +           \
00286             ma##01_re * mb##T11_re - ma##01_im * mb##T11_im +           \
00287             ma##02_re * mb##T21_re - ma##02_im * mb##T21_im;            \
00288         fb_im =                                                         \
00289             ma##00_re * mb##T01_im + ma##00_im * mb##T01_re +           \
00290             ma##01_re * mb##T11_im + ma##01_im * mb##T11_re +           \
00291             ma##02_re * mb##T21_im + ma##02_im * mb##T21_re;            \
00292         fc_re =                                                         \
00293             ma##00_re * mb##T02_re - ma##00_im * mb##T02_im +           \
00294             ma##01_re * mb##T12_re - ma##01_im * mb##T12_im +           \
00295             ma##02_re * mb##T22_re - ma##02_im * mb##T22_im;            \
00296         fc_im =                                                         \
00297             ma##00_re * mb##T02_im + ma##00_im * mb##T02_re +           \
00298             ma##01_re * mb##T12_im + ma##01_im * mb##T12_re +           \
00299             ma##02_re * mb##T22_im + ma##02_im * mb##T22_re;            \
00300         ma##00_re = fa_re;                                              \
00301         ma##00_im = fa_im;                                              \
00302         ma##01_re = fb_re;                                              \
00303         ma##01_im = fb_im;                                              \
00304         ma##02_re = fc_re;                                              \
00305         ma##02_im = fc_im;                                              \
00306         fa_re =                                                         \
00307             ma##10_re * mb##T00_re - ma##10_im * mb##T00_im +           \
00308             ma##11_re * mb##T10_re - ma##11_im * mb##T10_im +           \
00309             ma##12_re * mb##T20_re - ma##12_im * mb##T20_im;            \
00310         fa_im =                                                         \
00311             ma##10_re * mb##T00_im + ma##10_im * mb##T00_re +           \
00312             ma##11_re * mb##T10_im + ma##11_im * mb##T10_re +           \
00313             ma##12_re * mb##T20_im + ma##12_im * mb##T20_re;            \
00314         fb_re =                                                         \
00315             ma##10_re * mb##T01_re - ma##10_im * mb##T01_im +           \
00316             ma##11_re * mb##T11_re - ma##11_im * mb##T11_im +           \
00317             ma##12_re * mb##T21_re - ma##12_im * mb##T21_im;            \
00318         fb_im =                                                         \
00319             ma##10_re * mb##T01_im + ma##10_im * mb##T01_re +           \
00320             ma##11_re * mb##T11_im + ma##11_im * mb##T11_re +           \
00321             ma##12_re * mb##T21_im + ma##12_im * mb##T21_re;            \
00322         fc_re =                                                         \
00323             ma##10_re * mb##T02_re - ma##10_im * mb##T02_im +           \
00324             ma##11_re * mb##T12_re - ma##11_im * mb##T12_im +           \
00325             ma##12_re * mb##T22_re - ma##12_im * mb##T22_im;            \
00326         fc_im =                                                         \
00327             ma##10_re * mb##T02_im + ma##10_im * mb##T02_re +           \
00328             ma##11_re * mb##T12_im + ma##11_im * mb##T12_re +           \
00329             ma##12_re * mb##T22_im + ma##12_im * mb##T22_re;            \
00330         ma##10_re = fa_re;                                              \
00331         ma##10_im = fa_im;                                              \
00332         ma##11_re = fb_re;                                              \
00333         ma##11_im = fb_im;                                              \
00334         ma##12_re = fc_re;                                              \
00335         ma##12_im = fc_im;                                              \
00336         fa_re =                                                         \
00337             ma##20_re * mb##T00_re - ma##20_im * mb##T00_im +           \
00338             ma##21_re * mb##T10_re - ma##21_im * mb##T10_im +           \
00339             ma##22_re * mb##T20_re - ma##22_im * mb##T20_im;            \
00340         fa_im =                                                         \
00341             ma##20_re * mb##T00_im + ma##20_im * mb##T00_re +           \
00342             ma##21_re * mb##T10_im + ma##21_im * mb##T10_re +           \
00343             ma##22_re * mb##T20_im + ma##22_im * mb##T20_re;            \
00344         fb_re =                                                         \
00345             ma##20_re * mb##T01_re - ma##20_im * mb##T01_im +           \
00346             ma##21_re * mb##T11_re - ma##21_im * mb##T11_im +           \
00347             ma##22_re * mb##T21_re - ma##22_im * mb##T21_im;            \
00348         fb_im =                                                         \
00349             ma##20_re * mb##T01_im + ma##20_im * mb##T01_re +           \
00350             ma##21_re * mb##T11_im + ma##21_im * mb##T11_re +           \
00351             ma##22_re * mb##T21_im + ma##22_im * mb##T21_re;            \
00352         fc_re =                                                         \
00353             ma##20_re * mb##T02_re - ma##20_im * mb##T02_im +           \
00354             ma##21_re * mb##T12_re - ma##21_im * mb##T12_im +           \
00355             ma##22_re * mb##T22_re - ma##22_im * mb##T22_im;            \
00356         fc_im =                                                         \
00357             ma##20_re * mb##T02_im + ma##20_im * mb##T02_re +           \
00358             ma##21_re * mb##T12_im + ma##21_im * mb##T12_re +           \
00359             ma##22_re * mb##T22_im + ma##22_im * mb##T22_re;            \
00360         ma##20_re = fa_re;                                              \
00361         ma##20_im = fa_im;                                              \
00362         ma##21_re = fb_re;                                              \
00363         ma##21_im = fb_im;                                              \
00364         ma##22_re = fc_re;                                              \
00365         ma##22_im = fc_im;                                              \
00366     }while(0)
00367 
00368 
00369 
00370 #define MULT_SU3_AN_TEST(ma, mb)        do{                             \
00371         Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im;                 \
00372         fa_re =                                                         \
00373             ma##T00_re * mb##00_re - ma##T00_im * mb##00_im +           \
00374             ma##T01_re * mb##10_re - ma##T01_im * mb##10_im +           \
00375             ma##T02_re * mb##20_re - ma##T02_im * mb##20_im;            \
00376         fa_im =                                                         \
00377             ma##T00_re * mb##00_im + ma##T00_im * mb##00_re +           \
00378             ma##T01_re * mb##10_im + ma##T01_im * mb##10_re +           \
00379             ma##T02_re * mb##20_im + ma##T02_im * mb##20_re;            \
00380         fb_re =                                                         \
00381             ma##T10_re * mb##00_re - ma##T10_im * mb##00_im +           \
00382             ma##T11_re * mb##10_re - ma##T11_im * mb##10_im +           \
00383             ma##T12_re * mb##20_re - ma##T12_im * mb##20_im;            \
00384         fb_im =                                                         \
00385             ma##T10_re * mb##00_im + ma##T10_im * mb##00_re +           \
00386             ma##T11_re * mb##10_im + ma##T11_im * mb##10_re +           \
00387             ma##T12_re * mb##20_im + ma##T12_im * mb##20_re;            \
00388         fc_re =                                                         \
00389             ma##T20_re * mb##00_re - ma##T20_im * mb##00_im +           \
00390             ma##T21_re * mb##10_re - ma##T21_im * mb##10_im +           \
00391             ma##T22_re * mb##20_re - ma##T22_im * mb##20_im;            \
00392         fc_im =                                                         \
00393             ma##T20_re * mb##00_im + ma##T20_im * mb##00_re +           \
00394             ma##T21_re * mb##10_im + ma##T21_im * mb##10_re +           \
00395             ma##T22_re * mb##20_im + ma##T22_im * mb##20_re;            \
00396         mb##00_re = fa_re;                                              \
00397         mb##00_im = fa_im;                                              \
00398         mb##10_re = fb_re;                                              \
00399         mb##10_im = fb_im;                                              \
00400         mb##20_re = fc_re;                                              \
00401         mb##20_im = fc_im;                                              \
00402         fa_re =                                                         \
00403             ma##T00_re * mb##01_re - ma##T00_im * mb##01_im +           \
00404             ma##T01_re * mb##11_re - ma##T01_im * mb##11_im +           \
00405             ma##T02_re * mb##21_re - ma##T02_im * mb##21_im;            \
00406         fa_im =                                                         \
00407             ma##T00_re * mb##01_im + ma##T00_im * mb##01_re +           \
00408             ma##T01_re * mb##11_im + ma##T01_im * mb##11_re +           \
00409             ma##T02_re * mb##21_im + ma##T02_im * mb##21_re;            \
00410         fb_re =                                                         \
00411             ma##T10_re * mb##01_re - ma##T10_im * mb##01_im +           \
00412             ma##T11_re * mb##11_re - ma##T11_im * mb##11_im +           \
00413             ma##T12_re * mb##21_re - ma##T12_im * mb##21_im;            \
00414         fb_im =                                                         \
00415             ma##T10_re * mb##01_im + ma##T10_im * mb##01_re +           \
00416             ma##T11_re * mb##11_im + ma##T11_im * mb##11_re +           \
00417             ma##T12_re * mb##21_im + ma##T12_im * mb##21_re;            \
00418         fc_re =                                                         \
00419             ma##T20_re * mb##01_re - ma##T20_im * mb##01_im +           \
00420             ma##T21_re * mb##11_re - ma##T21_im * mb##11_im +           \
00421             ma##T22_re * mb##21_re - ma##T22_im * mb##21_im;            \
00422         fc_im =                                                         \
00423             ma##T20_re * mb##01_im + ma##T20_im * mb##01_re +           \
00424             ma##T21_re * mb##11_im + ma##T21_im * mb##11_re +           \
00425             ma##T22_re * mb##21_im + ma##T22_im * mb##21_re;            \
00426         mb##01_re = fa_re;                                              \
00427         mb##01_im = fa_im;                                              \
00428         mb##11_re = fb_re;                                              \
00429         mb##11_im = fb_im;                                              \
00430         mb##21_re = fc_re;                                              \
00431         mb##21_im = fc_im;                                              \
00432         fa_re =                                                         \
00433             ma##T00_re * mb##02_re - ma##T00_im * mb##02_im +           \
00434             ma##T01_re * mb##12_re - ma##T01_im * mb##12_im +           \
00435             ma##T02_re * mb##22_re - ma##T02_im * mb##22_im;            \
00436         fa_im =                                                         \
00437             ma##T00_re * mb##02_im + ma##T00_im * mb##02_re +           \
00438             ma##T01_re * mb##12_im + ma##T01_im * mb##12_re +           \
00439             ma##T02_re * mb##22_im + ma##T02_im * mb##22_re;            \
00440         fb_re =                                                         \
00441             ma##T10_re * mb##02_re - ma##T10_im * mb##02_im +           \
00442             ma##T11_re * mb##12_re - ma##T11_im * mb##12_im +           \
00443             ma##T12_re * mb##22_re - ma##T12_im * mb##22_im;            \
00444         fb_im =                                                         \
00445             ma##T10_re * mb##02_im + ma##T10_im * mb##02_re +           \
00446             ma##T11_re * mb##12_im + ma##T11_im * mb##12_re +           \
00447             ma##T12_re * mb##22_im + ma##T12_im * mb##22_re;            \
00448         fc_re =                                                         \
00449             ma##T20_re * mb##02_re - ma##T20_im * mb##02_im +           \
00450             ma##T21_re * mb##12_re - ma##T21_im * mb##12_im +           \
00451             ma##T22_re * mb##22_re - ma##T22_im * mb##22_im;            \
00452         fc_im =                                                         \
00453             ma##T20_re * mb##02_im + ma##T20_im * mb##02_re +           \
00454             ma##T21_re * mb##12_im + ma##T21_im * mb##12_re +           \
00455             ma##T22_re * mb##22_im + ma##T22_im * mb##22_re;            \
00456         mb##02_re = fa_re;                                              \
00457         mb##02_im = fa_im;                                              \
00458         mb##12_re = fb_re;                                              \
00459         mb##12_im = fb_im;                                              \
00460         mb##22_re = fc_re;                                              \
00461         mb##22_im = fc_im;                                              \
00462     }while(0)
00463 
00464 
00465 
00466 #define print_matrix(mul)                                               \
00467   printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
00468   printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
00469   printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
00470 
00471 
00472 //FloatN can be float2/float4/double2
00473 //Float2 can be float2/double2
00474 template<int oddBit, typename Float2, typename FloatN, typename Float>
00475   __global__ void
00476   GAUGE_FORCE_KERN_NAME(Float2* momEven, Float2* momOdd,
00477                         int dir, double eb3,
00478                         FloatN* linkEven, FloatN* linkOdd,
00479                         int* input_path, 
00480                         int* length, Float* path_coeff, int num_paths, kernel_param_t kparam)
00481 {
00482   int i,j=0;
00483   int sid = blockIdx.x * blockDim.x + threadIdx.x;
00484   
00485   int z1 = sid / X1h;
00486   int x1h = sid - z1*X1h;
00487   int z2 = z1 / X2;
00488   int x2 = z1 - z2*X2;
00489   int x4 = z2 / X3;
00490   int x3 = z2 - x4*X3;
00491   int x1odd = (x2 + x3 + x4 + oddBit) & 1;
00492   int x1 = 2*x1h + x1odd;  
00493 
00494 #ifdef MULTI_GPU
00495   x4 += 2; x3 += 2; x2 += 2; x1 += 2;
00496   int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
00497 #else
00498   int X = 2*sid + x1odd;  
00499 #endif
00500     
00501   Float2* mymom=momEven;
00502   if (oddBit){
00503     mymom = momOdd;
00504   }
00505 
00506   DECLARE_LINK_VARS(LINKA);
00507   DECLARE_LINK_VARS(LINKB);
00508   Float2 STAPLE0, STAPLE1, STAPLE2, STAPLE3,STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
00509   Float2 AH0, AH1, AH2, AH3, AH4;
00510 
00511   int new_mem_idx;
00512     
00513     
00514   SET_SU3_MATRIX(staple, 0);
00515   for(i=0;i < num_paths; i++){
00516     int nbr_oddbit = (oddBit^1 );
00517         
00518     int new_x1 =x1;
00519     int new_x2 =x2;
00520     int new_x3 =x3;
00521     int new_x4 =x4;
00522     COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(dir, X);
00523         
00524     //linka: current matrix
00525     //linkb: the loaded matrix in this round    
00526     SET_UNIT_SU3_MATRIX(linka); 
00527     int* path = input_path + i*path_max_length;
00528         
00529     int lnkdir;
00530     int path0 = path[0];
00531     if (GOES_FORWARDS(path0)){
00532       lnkdir=path0;
00533     }else{
00534       lnkdir=OPP_DIR(path0);
00535       COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(path0), new_mem_idx);
00536       nbr_oddbit = nbr_oddbit^1;
00537             
00538     }
00539         
00540     int nbr_idx = new_mem_idx >>1;
00541     if (nbr_oddbit){
00542       LOAD_ODD_MATRIX( lnkdir, nbr_idx, LINKB);
00543     }else{
00544       LOAD_EVEN_MATRIX( lnkdir, nbr_idx, LINKB);
00545     }
00546     RECONSTRUCT_MATRIX(1, linkb);
00547     
00548     if (GOES_FORWARDS(path0)){
00549       COPY_SU3_MATRIX(linkb, linka);
00550       COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(path0, new_mem_idx);
00551       nbr_oddbit = nbr_oddbit^1;
00552     }else{
00553       SU3_ADJOINT(linkb, linka);
00554     }   
00555         
00556     for(j=1; j < length[i]; j++){
00557             
00558       int lnkdir;
00559       int pathj = path[j];
00560       if (GOES_FORWARDS(pathj)){
00561         lnkdir=pathj;
00562       }else{
00563         lnkdir=OPP_DIR(pathj);
00564         COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(pathj), new_mem_idx);
00565         nbr_oddbit = nbr_oddbit^1;
00566 
00567       }
00568             
00569       int nbr_idx = new_mem_idx >>1;
00570       if (nbr_oddbit){
00571         LOAD_ODD_MATRIX(lnkdir, nbr_idx, LINKB);
00572       }else{
00573         LOAD_EVEN_MATRIX(lnkdir, nbr_idx, LINKB);
00574       }
00575       RECONSTRUCT_MATRIX(1, linkb);
00576       if (GOES_FORWARDS(pathj)){
00577         MULT_SU3_NN_TEST(linka, linkb);
00578                 
00579         COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(pathj, new_mem_idx);
00580         nbr_oddbit = nbr_oddbit^1;
00581                 
00582                 
00583       }else{
00584         MULT_SU3_NA_TEST(linka, linkb);         
00585       }
00586             
00587     }//j
00588     SCALAR_MULT_ADD_SU3_MATRIX(staple, linka, path_coeff[i], staple);
00589   }//i
00590     
00591 
00592   //update mom 
00593   if (oddBit){
00594     LOAD_ODD_MATRIX(dir, (X>>1), LINKA);
00595   }else{
00596     LOAD_EVEN_MATRIX(dir, (X>>1), LINKA);
00597   }
00598   RECONSTRUCT_MATRIX(1, linka);
00599   MULT_SU3_NN_TEST(linka, staple);
00600   LOAD_ANTI_HERMITIAN(mymom, dir, sid, AH);
00601   UNCOMPRESS_ANTI_HERMITIAN(ah, linkb);
00602   SCALAR_MULT_SUB_SU3_MATRIX(linkb, linka, eb3, linka);
00603   MAKE_ANTI_HERMITIAN(linka, ah);
00604     
00605   WRITE_ANTI_HERMITIAN(mymom, dir, sid, AH, mom_ga_stride);
00606 
00607   return;
00608 }
00609 
00610 
00611 
00612 #undef COMPUTE_NEW_FULL_IDX_PLUS_UPDATE
00613 #undef COMPUTE_NEW_FULL_IDX_MINUS_UPDATE
00614 #undef MULT_SU3_NN_TEST
00615 #undef MULT_SU3_NA_TEST
00616 #undef MULT_SU3_AN_TEST
00617 
00618 #undef linka00_re 
00619 #undef linka00_im 
00620 #undef linka01_re 
00621 #undef linka01_im 
00622 #undef linka02_re 
00623 #undef linka02_im 
00624 #undef linka10_re 
00625 #undef linka10_im 
00626 #undef linka11_re 
00627 #undef linka11_im 
00628 #undef linka12_re 
00629 #undef linka12_im 
00630 #undef linka20_re 
00631 #undef linka20_im 
00632 #undef linka21_re 
00633 #undef linka21_im 
00634 #undef linka22_re 
00635 #undef linka22_im 
00636 
00637 #undef linkb00_re 
00638 #undef linkb00_im 
00639 #undef linkb01_re 
00640 #undef linkb01_im 
00641 #undef linkb02_re 
00642 #undef linkb02_im 
00643 #undef linkb10_re 
00644 #undef linkb10_im 
00645 #undef linkb11_re 
00646 #undef linkb11_im 
00647 #undef linkb12_re 
00648 #undef linkb12_im 
00649 #undef linkb20_re 
00650 #undef linkb20_im 
00651 #undef linkb21_re 
00652 #undef linkb21_im 
00653 #undef linkb22_re 
00654 #undef linkb22_im 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines