QUDA v0.4.0
A library for QCD on GPUs
quda/lib/llfat_core.h
Go to the documentation of this file.
00001 #define Vsh_x ghostFace[0]
00002 #define Vsh_y ghostFace[1]
00003 #define Vsh_z ghostFace[2]
00004 #define Vsh_t ghostFace[3]
00005 
00006 #define xcomm kparam.ghostDim[0]
00007 #define ycomm kparam.ghostDim[1]
00008 #define zcomm kparam.ghostDim[2]
00009 #define tcomm kparam.ghostDim[3]
00010 #define dimcomm kparam.ghostDim
00011 
00012 
00013 #define D1 kparam.D1
00014 #define D2 kparam.D2
00015 #define D3 kparam.D3
00016 #define D4 kparam.D4
00017 #define D1h kparam.D1h
00018 
00019 #if (RECONSTRUCT == 18)
00020 #define DECLARE_VAR_SIGN 
00021 #define DECLARE_NEW_X 
00022 #ifdef MULTI_GPU
00023 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
00024 #else
00025 #define DECLARE_X_ARRAY 
00026 #endif
00027 #else //RECONSTRUCT == 12
00028 #define DECLARE_VAR_SIGN short sign=1
00029 #define DECLARE_NEW_X short new_x1=x1; short new_x2=x2; \
00030   /*short new_x3=x3; */short new_x4=x4; 
00031 #define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
00032 
00033 #endif
00034 
00035 #if (PRECISION == 1 && RECONSTRUCT == 12)
00036 
00037 #define a00_re A0.x
00038 #define a00_im A0.y
00039 #define a01_re A0.z
00040 #define a01_im A0.w
00041 #define a02_re A1.x
00042 #define a02_im A1.y
00043 #define a10_re A1.z
00044 #define a10_im A1.w
00045 #define a11_re A2.x
00046 #define a11_im A2.y
00047 #define a12_re A2.z
00048 #define a12_im A2.w
00049 #define a20_re A3.x
00050 #define a20_im A3.y
00051 #define a21_re A3.z
00052 #define a21_im A3.w
00053 #define a22_re A4.x
00054 #define a22_im A4.y
00055 
00056 #define b00_re B0.x
00057 #define b00_im B0.y
00058 #define b01_re B0.z
00059 #define b01_im B0.w
00060 #define b02_re B1.x
00061 #define b02_im B1.y
00062 #define b10_re B1.z
00063 #define b10_im B1.w
00064 #define b11_re B2.x
00065 #define b11_im B2.y
00066 #define b12_re B2.z
00067 #define b12_im B2.w
00068 #define b20_re B3.x
00069 #define b20_im B3.y
00070 #define b21_re B3.z
00071 #define b21_im B3.w
00072 #define b22_re B4.x
00073 #define b22_im B4.y
00074 
00075 #define c00_re C0.x
00076 #define c00_im C0.y
00077 #define c01_re C0.z
00078 #define c01_im C0.w
00079 #define c02_re C1.x
00080 #define c02_im C1.y
00081 #define c10_re C1.z
00082 #define c10_im C1.w
00083 #define c11_re C2.x
00084 #define c11_im C2.y
00085 #define c12_re C2.z
00086 #define c12_im C2.w
00087 #define c20_re C3.x
00088 #define c20_im C3.y
00089 #define c21_re C3.z
00090 #define c21_im C3.w
00091 #define c22_re C4.x
00092 #define c22_im C4.y
00093 
00094 #else
00095 #define a00_re A0.x
00096 #define a00_im A0.y
00097 #define a01_re A1.x
00098 #define a01_im A1.y
00099 #define a02_re A2.x
00100 #define a02_im A2.y
00101 #define a10_re A3.x
00102 #define a10_im A3.y
00103 #define a11_re A4.x
00104 #define a11_im A4.y
00105 #define a12_re A5.x
00106 #define a12_im A5.y
00107 
00108 #define a20_re A6.x
00109 #define a20_im A6.y
00110 #define a21_re A7.x
00111 #define a21_im A7.y
00112 #define a22_re A8.x
00113 #define a22_im A8.y
00114 
00115 #define b00_re B0.x
00116 #define b00_im B0.y
00117 #define b01_re B1.x
00118 #define b01_im B1.y
00119 #define b02_re B2.x
00120 #define b02_im B2.y
00121 #define b10_re B3.x
00122 #define b10_im B3.y
00123 #define b11_re B4.x
00124 #define b11_im B4.y
00125 #define b12_re B5.x
00126 #define b12_im B5.y
00127 #define b20_re B6.x
00128 #define b20_im B6.y
00129 #define b21_re B7.x
00130 #define b21_im B7.y
00131 #define b22_re B8.x
00132 #define b22_im B8.y
00133 
00134 #define c00_re C0.x
00135 #define c00_im C0.y
00136 #define c01_re C1.x
00137 #define c01_im C1.y
00138 #define c02_re C2.x
00139 #define c02_im C2.y
00140 #define c10_re C3.x
00141 #define c10_im C3.y
00142 #define c11_re C4.x
00143 #define c11_im C4.y
00144 #define c12_re C5.x
00145 #define c12_im C5.y
00146 #define c20_re C6.x
00147 #define c20_im C6.y
00148 #define c21_re C7.x
00149 #define c21_im C7.y
00150 #define c22_re C8.x
00151 #define c22_im C8.y
00152 
00153 #endif
00154 
00155 #define bb00_re BB0.x
00156 #define bb00_im BB0.y
00157 #define bb01_re BB1.x
00158 #define bb01_im BB1.y
00159 #define bb02_re BB2.x
00160 #define bb02_im BB2.y
00161 #define bb10_re BB3.x
00162 #define bb10_im BB3.y
00163 #define bb11_re BB4.x
00164 #define bb11_im BB4.y
00165 #define bb12_re BB5.x
00166 #define bb12_im BB5.y
00167 #define bb20_re BB6.x
00168 #define bb20_im BB6.y
00169 #define bb21_re BB7.x
00170 #define bb21_im BB7.y
00171 #define bb22_re BB8.x
00172 #define bb22_im BB8.y
00173 
00174 
00175 
00176 #define aT00_re (+a00_re)
00177 #define aT00_im (-a00_im)
00178 #define aT01_re (+a10_re)
00179 #define aT01_im (-a10_im)
00180 #define aT02_re (+a20_re)
00181 #define aT02_im (-a20_im)
00182 #define aT10_re (+a01_re)
00183 #define aT10_im (-a01_im)
00184 #define aT11_re (+a11_re)
00185 #define aT11_im (-a11_im)
00186 #define aT12_re (+a21_re)
00187 #define aT12_im (-a21_im)
00188 #define aT20_re (+a02_re)
00189 #define aT20_im (-a02_im)
00190 #define aT21_re (+a12_re)
00191 #define aT21_im (-a12_im)
00192 #define aT22_re (+a22_re)
00193 #define aT22_im (-a22_im)
00194 
00195 #define bT00_re (+b00_re)
00196 #define bT00_im (-b00_im)
00197 #define bT01_re (+b10_re)
00198 #define bT01_im (-b10_im)
00199 #define bT02_re (+b20_re)
00200 #define bT02_im (-b20_im)
00201 #define bT10_re (+b01_re)
00202 #define bT10_im (-b01_im)
00203 #define bT11_re (+b11_re)
00204 #define bT11_im (-b11_im)
00205 #define bT12_re (+b21_re)
00206 #define bT12_im (-b21_im)
00207 #define bT20_re (+b02_re)
00208 #define bT20_im (-b02_im)
00209 #define bT21_re (+b12_re)
00210 #define bT21_im (-b12_im)
00211 #define bT22_re (+b22_re)
00212 #define bT22_im (-b22_im)
00213 
00214 #define cT00_re (+c00_re)
00215 #define cT00_im (-c00_im)
00216 #define cT01_re (+c10_re)
00217 #define cT01_im (-c10_im)
00218 #define cT02_re (+c20_re)
00219 #define cT02_im (-c20_im)
00220 #define cT10_re (+c01_re)
00221 #define cT10_im (-c01_im)
00222 #define cT11_re (+c11_re)
00223 #define cT11_im (-c11_im)
00224 #define cT12_re (+c21_re)
00225 #define cT12_im (-c21_im)
00226 #define cT20_re (+c02_re)
00227 #define cT20_im (-c02_im)
00228 #define cT21_re (+c12_re)
00229 #define cT21_im (-c12_im)
00230 #define cT22_re (+c22_re)
00231 #define cT22_im (-c22_im)
00232 
00233 
00234 #define tempa00_re TEMPA0.x
00235 #define tempa00_im TEMPA0.y
00236 #define tempa01_re TEMPA1.x
00237 #define tempa01_im TEMPA1.y
00238 #define tempa02_re TEMPA2.x
00239 #define tempa02_im TEMPA2.y
00240 #define tempa10_re TEMPA3.x
00241 #define tempa10_im TEMPA3.y
00242 #define tempa11_re TEMPA4.x
00243 #define tempa11_im TEMPA4.y
00244 #define tempa12_re TEMPA5.x
00245 #define tempa12_im TEMPA5.y
00246 #define tempa20_re TEMPA6.x
00247 #define tempa20_im TEMPA6.y
00248 #define tempa21_re TEMPA7.x
00249 #define tempa21_im TEMPA7.y
00250 #define tempa22_re TEMPA8.x
00251 #define tempa22_im TEMPA8.y
00252 
00253 #define tempb00_re TEMPB0.x
00254 #define tempb00_im TEMPB0.y
00255 #define tempb01_re TEMPB1.x
00256 #define tempb01_im TEMPB1.y
00257 #define tempb02_re TEMPB2.x
00258 #define tempb02_im TEMPB2.y
00259 #define tempb10_re TEMPB3.x
00260 #define tempb10_im TEMPB3.y
00261 #define tempb11_re TEMPB4.x
00262 #define tempb11_im TEMPB4.y
00263 #define tempb12_re TEMPB5.x
00264 #define tempb12_im TEMPB5.y
00265 #define tempb20_re TEMPB6.x
00266 #define tempb20_im TEMPB6.y
00267 #define tempb21_re TEMPB7.x
00268 #define tempb21_im TEMPB7.y
00269 #define tempb22_re TEMPB8.x
00270 #define tempb22_im TEMPB8.y
00271 
00272 #define fat00_re FAT0.x
00273 #define fat00_im FAT0.y
00274 #define fat01_re FAT1.x
00275 #define fat01_im FAT1.y
00276 #define fat02_re FAT2.x
00277 #define fat02_im FAT2.y
00278 #define fat10_re FAT3.x
00279 #define fat10_im FAT3.y
00280 #define fat11_re FAT4.x
00281 #define fat11_im FAT4.y
00282 #define fat12_re FAT5.x
00283 #define fat12_im FAT5.y
00284 #define fat20_re FAT6.x
00285 #define fat20_im FAT6.y
00286 #define fat21_re FAT7.x
00287 #define fat21_im FAT7.y
00288 #define fat22_re FAT8.x
00289 #define fat22_im FAT8.y
00290 
00291 #define NUM_FLOATS 5
00292 #define TEMPA0 sd_data[threadIdx.x + 0*blockDim.x]
00293 #define TEMPA1 sd_data[threadIdx.x + 1*blockDim.x ]
00294 #define TEMPA2 sd_data[threadIdx.x + 2*blockDim.x ]
00295 #define TEMPA3 sd_data[threadIdx.x + 3*blockDim.x ]
00296 #define TEMPA4 sd_data[threadIdx.x + 4*blockDim.x ]
00297     
00298 
00299 #undef UPDATE_COOR_PLUS
00300 #undef UPDATE_COOR_MINUS
00301 #undef UPDATE_COOR_LOWER_STAPLE
00302 #undef UPDATE_COOR_LOWER_STAPLE_DIAG
00303 #undef UPDATE_COOR_LOWER_STAPLE_EX
00304 #undef COMPUTE_RECONSTRUCT_SIGN
00305 #if (RECONSTRUCT  != 18)
00306 #define UPDATE_COOR_PLUS(mydir, idx) do {                               \
00307     new_x1 = x1; new_x2 = x2; new_x4 = x4;                              \
00308     switch(mydir){                                                      \
00309     case 0:                                                             \
00310       new_x1 = x1+1;                                                    \
00311       break;                                                            \
00312     case 1:                                                             \
00313       new_x2 = x2+1;                                                    \
00314       break;                                                            \
00315     case 2:                                                             \
00316       break;                                                            \
00317     case 3:                                                             \
00318       new_x4 = x4+1;                                                    \
00319       break;                                                            \
00320     }                                                                   \
00321   }while(0)
00322 
00323 #define UPDATE_COOR_MINUS(mydir, idx) do {                              \
00324     new_x1 = x1; new_x2 = x2; new_x4 = x4;                              \
00325     switch(mydir){                                                      \
00326     case 0:                                                             \
00327       new_x1 = x1-1;                                                    \
00328       break;                                                            \
00329     case 1:                                                             \
00330       new_x2 = x2-1;                                                    \
00331       break;                                                            \
00332     case 2:                                                             \
00333       break;                                                            \
00334     case 3:                                                             \
00335       new_x4 = x4-1;                                                    \
00336       break;                                                            \
00337     }                                                                   \
00338   }while(0)
00339 
00340 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2) do {   \
00341     new_x1 = x1; new_x2 = x2;  new_x4 = x4;                             \
00342     if(dimcomm[mydir1] == 0 || x[mydir1] > 0){                          \
00343       switch(mydir1){                                                   \
00344       case 0:                                                           \
00345         new_x1 = x1 - 1;                                                \
00346         break;                                                          \
00347       case 1:                                                           \
00348         new_x2 = x2 - 1;                                                \
00349         break;                                                          \
00350       case 2:                                                           \
00351         break;                                                          \
00352       case 3:                                                           \
00353         new_x4 = x4 - 1;                                                \
00354         break;                                                          \
00355       }                                                                 \
00356       switch(mydir2){                                                   \
00357       case 0:                                                           \
00358         new_x1 = x1+1;                                                  \
00359         break;                                                          \
00360       case 1:                                                           \
00361         new_x2 = x2+1;                                                  \
00362         break;                                                          \
00363       case 2:                                                           \
00364         break;                                                          \
00365       case 3:                                                           \
00366         new_x4 = x4+1;                                                  \
00367         break;                                                          \
00368       }                                                                 \
00369     }else{                                                              \
00370       /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
00371       switch(mydir2){                                                   \
00372       case 0:                                                           \
00373         new_x1 = x1+1;                                                  \
00374         break;                                                          \
00375       case 1:                                                           \
00376         new_x2 = x2+1;                                                  \
00377         break;                                                          \
00378       case 2:                                                           \
00379         break;                                                          \
00380       case 3:                                                           \
00381         new_x4 = x4+1;                                                  \
00382         break;                                                          \
00383       }                                                                 \
00384       switch(mydir1){/*mydir1 is 0 here */                              \
00385       case 0:                                                           \
00386         new_x1 = x1-1;                                                  \
00387         break;                                                          \
00388       case 1:                                                           \
00389         new_x2 = x2-1;                                                  \
00390         break;                                                          \
00391       case 2:                                                           \
00392         break;                                                          \
00393       case 3:                                                           \
00394         new_x4 = x4-1;                                                  \
00395         break;                                                          \
00396       }                                                                 \
00397     }                                                                   \
00398   }while(0)
00399 
00400 
00401 
00402 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do {          \
00403     int new_x[4];                                                       \
00404     new_x[3] = x4; new_x[1] = x2; new_x[0] = x1;                        \
00405     new_x[nu] =  -1;                                                    \
00406     new_x[mu] = 0;                                                      \
00407     new_x1 = new_x[0];                                                  \
00408     new_x2 = new_x[1];                                                  \
00409     new_x4 = new_x[3];                                                  \
00410   }while(0)
00411 
00412 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2) do {                \
00413     new_x1 = x1; new_x2 = x2;  new_x4 = x4;                             \
00414     switch(mydir1){                                                     \
00415     case 0:                                                             \
00416       new_x1 = x1 - 1;                                                  \
00417       break;                                                            \
00418     case 1:                                                             \
00419       new_x2 = x2 - 1;                                                  \
00420       break;                                                            \
00421     case 2:                                                             \
00422       break;                                                            \
00423     case 3:                                                             \
00424       new_x4 = x4 - 1;                                                  \
00425       break;                                                            \
00426     }                                                                   \
00427     switch(mydir2){                                                     \
00428     case 0:                                                             \
00429       new_x1 = x1+1;                                                    \
00430       break;                                                            \
00431     case 1:                                                             \
00432       new_x2 = x2+1;                                                    \
00433       break;                                                            \
00434     case 2:                                                             \
00435       break;                                                            \
00436     case 3:                                                             \
00437       new_x4 = x4+1;                                                    \
00438       break;                                                            \
00439     }                                                                   \
00440   }while(0)
00441 
00442 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do {   \
00443     sign =1;                                                    \
00444     switch(dir){                                                \
00445     case XUP:                                                   \
00446       if ( (i4 & 1) != 0){                                      \
00447         sign = -1;                                              \
00448       }                                                         \
00449       break;                                                    \
00450     case YUP:                                                   \
00451       if ( ((i4+i1) & 1) != 0){                                 \
00452         sign = -1;                                              \
00453       }                                                         \
00454       break;                                                    \
00455     case ZUP:                                                   \
00456       if ( ((i4+i1+i2) & 1) != 0){                              \
00457         sign = -1;                                              \
00458       }                                                         \
00459       break;                                                    \
00460     case TUP:                                                   \
00461       if (i4 == X4m1 && last_proc_in_tdim){                     \
00462         sign = -1;                                              \
00463       }else if(i4 == -1 && first_proc_in_tdim){                 \
00464         sign = -1;                                              \
00465       }                                                         \
00466       break;                                                    \
00467     }                                                           \
00468   }while (0)
00469 
00470 
00471 #else
00472 
00473 #define UPDATE_COOR_PLUS(mydir, idx)
00474 #define UPDATE_COOR_MINUS(mydir, idx)
00475 #define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2)
00476 #define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2)
00477 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) 
00478 #define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2)
00479 #endif
00480 
00481 #ifdef MULTI_GPU
00482 
00483 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do {                     \
00484     switch(mydir){                                                      \
00485     case 0:                                                             \
00486       new_mem_idx = (x1==X1m1)? ((Vh+Vsh_x+ spacecon_x)*xcomm+(idx - X1m1)/2*(1-xcomm)):((idx+1)>>1); \
00487       break;                                                            \
00488     case 1:                                                             \
00489       new_mem_idx = (x2==X2m1)? ((Vh+2*(Vsh_x)+Vsh_y+ spacecon_y)*ycomm+(idx-X2X1mX1)/2*(1-ycomm)):((idx+X1)>>1); \
00490       break;                                                            \
00491     case 2:                                                             \
00492       new_mem_idx = (x3==X3m1)? ((Vh+2*(Vsh_x+Vsh_y)+Vsh_z+ spacecon_z))*zcomm+(idx-X3X2X1mX2X1)/2*(1-zcomm):((idx+X2X1)>>1); \
00493       break;                                                            \
00494     case 3:                                                             \
00495       new_mem_idx = ( (x4==X4m1)? ((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t+spacecon_t))*tcomm+(idx-X4X3X2X1mX3X2X1)/2*(1-tcomm): (idx+X3X2X1)>>1); \
00496       break;                                                            \
00497     }                                                                   \
00498     UPDATE_COOR_PLUS(mydir, idx);                                       \
00499   }while(0)
00500 
00501 
00502 
00503 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do {                    \
00504     switch(mydir){                                                      \
00505     case 0:                                                             \
00506       new_mem_idx = (x1==0)?( (Vh+spacecon_x)*xcomm+(idx+X1m1)/2*(1-xcomm)):((idx-1) >> 1); \
00507       break;                                                            \
00508     case 1:                                                             \
00509       new_mem_idx = (x2==0)?( (Vh+2*Vsh_x+spacecon_y)*ycomm+(idx+X2X1mX1)/2*(1-ycomm)):((idx-X1) >> 1); \
00510       break;                                                            \
00511     case 2:                                                             \
00512       new_mem_idx = (x3==0)?((Vh+2*(Vsh_x+Vsh_y)+spacecon_z)*zcomm+(idx+X3X2X1mX2X1)/2*(1-zcomm)):((idx-X2X1) >> 1); \
00513       break;                                                            \
00514     case 3:                                                             \
00515       new_mem_idx = (x4==0)?((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+ spacecon_t)*tcomm + (idx+X4X3X2X1mX3X2X1)/2*(1-tcomm)):((idx-X3X2X1) >> 1); \
00516       break;                                                            \
00517     }                                                                   \
00518     UPDATE_COOR_MINUS(mydir, idx);                                      \
00519   }while(0)
00520 
00521 
00522 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do {         \
00523     int local_new_x1=x1;                                                \
00524     int local_new_x2=x2;                                                \
00525     int local_new_x3=x3;                                                \
00526     int local_new_x4=x4;                                                \
00527     new_mem_idx=X;                                                      \
00528     if(dimcomm[mydir1] == 0 || x[mydir1] > 0){                          \
00529       switch(mydir1){/*mydir1 is not partitioned or x[mydir1]!=  0*/    \
00530       case 0:                                                           \
00531         new_mem_idx = (x1==0)?(new_mem_idx+X1m1):(new_mem_idx-1);       \
00532         local_new_x1 = (x1==0)?X1m1:(x1 - 1);                           \
00533         break;                                                          \
00534       case 1:                                                           \
00535         new_mem_idx = (x2==0)?(new_mem_idx+X2X1mX1):(new_mem_idx-X1);   \
00536         local_new_x2 = (x2==0)?X2m1:(x2 - 1);                           \
00537         break;                                                          \
00538       case 2:                                                           \
00539         new_mem_idx = (x3==0)?(new_mem_idx+X3X2X1mX2X1):(new_mem_idx-X2X1); \
00540         local_new_x3 = (x3==0)?X3m1:(x3 -1);                            \
00541         break;                                                          \
00542       case 3:                                                           \
00543         new_mem_idx = (x4==0)?(new_mem_idx+X4X3X2X1mX3X2X1):(new_mem_idx-X3X2X1); \
00544         local_new_x4 = (x4==0)?X4m1:(x4 - 1);                           \
00545         break;                                                          \
00546       }                                                                 \
00547       switch(mydir2){                                                   \
00548       case 0:                                                           \
00549         new_mem_idx = (x1==X1m1)?(2*(Vh+Vsh_x)+((local_new_x4*X3X2+local_new_x3*X2+local_new_x2)))*xcomm+(new_mem_idx-X1m1)*(1-xcomm):(new_mem_idx+1); \
00550         break;                                                          \
00551       case 1:                                                           \
00552         new_mem_idx = (x2==X2m1)?(2*(Vh+2*(Vsh_x)+Vsh_y)+((local_new_x4*X3X1+local_new_x3*X1+local_new_x1)))*ycomm+(new_mem_idx-X2X1mX1)*(1-ycomm):(new_mem_idx+X1); \
00553         break;                                                          \
00554       case 2:                                                           \
00555         new_mem_idx = (x3==X3m1)?(2*(Vh+2*(Vsh_x+Vsh_y)+Vsh_z)+((local_new_x4*X2X1+local_new_x2*X1+local_new_x1)))*zcomm+(new_mem_idx-X3X2X1mX2X1)*(1-zcomm):(new_mem_idx+X2X1); \
00556         break;                                                          \
00557       case 3:                                                           \
00558         new_mem_idx = (x4==X4m1)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t)+((local_new_x3*X2X1+local_new_x2*X1+local_new_x1)))*tcomm+(new_mem_idx-X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx+X3X2X1); \
00559         break;                                                          \
00560       }                                                                 \
00561     }else{                                                              \
00562       /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
00563       switch(mydir2){   /*mydir2 is not partitioned or x[mydir2]!=  0*/ \
00564       case 0:                                                           \
00565         new_mem_idx = (x1==X1m1)?(new_mem_idx-X1m1):(new_mem_idx+1);    \
00566         local_new_x1 = (x1==X1m1)?0:(x1+1);                             \
00567         break;                                                          \
00568       case 1:                                                           \
00569         new_mem_idx = (x2==X2m1)?(new_mem_idx-X2X1mX1):(new_mem_idx+X1); \
00570         local_new_x2 = (x2==X2m1)?0:(x2+1);                             \
00571         break;                                                          \
00572       case 2:                                                           \
00573         new_mem_idx = (x3==X3m1)?(new_mem_idx-X3X2X1mX2X1):(new_mem_idx+X2X1); \
00574         local_new_x3 = (x3==X3m1)?0:(x3+1);                             \
00575         break;                                                          \
00576       case 3:                                                           \
00577         new_mem_idx = (x4==X4m1)?(new_mem_idx-X4X3X2X1mX3X2X1):(new_mem_idx+X3X2X1); \
00578         local_new_x4 = (x4==X4m1)?0:(x4+1);                             \
00579         break;                                                          \
00580       }                                                                 \
00581       switch(mydir1){/*mydir1 is 0 here */                              \
00582       case 0:                                                           \
00583         new_mem_idx = (x1==0)?(2*(Vh)+(local_new_x4*X3X2+local_new_x3*X2+local_new_x2))*xcomm+(new_mem_idx+X1m1)*(1-xcomm):(new_mem_idx -1); \
00584         break;                                                          \
00585       case 1:                                                           \
00586         new_mem_idx = (x2==0)?(2*(Vh+2*Vsh_x)+(local_new_x4*X3X1+local_new_x3*X1+local_new_x1))*ycomm+(new_mem_idx+X2X1mX1)*(1-ycomm):(new_mem_idx-X1); \
00587         break;                                                          \
00588       case 2:                                                           \
00589         new_mem_idx = (x3==0)?(2*(Vh+2*(Vsh_x+Vsh_y))+(local_new_x4*X2X1+local_new_x2*X1+local_new_x1))*zcomm+(new_mem_idx+X3X2X1mX2X1)*(1-zcomm):(new_mem_idx-X2X1); \
00590         break;                                                          \
00591       case 3:                                                           \
00592         new_mem_idx = (x4==0)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z))+(local_new_x3*X2X1+local_new_x2*X1+local_new_x1))*tcomm+(new_mem_idx+X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx-X3X2X1); \
00593         break;                                                          \
00594       }                                                                 \
00595     }                                                                   \
00596     new_mem_idx = new_mem_idx >> 1;                                     \
00597     UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2);                           \
00598   }while(0)
00599 
00600 
00601 
00602 
00603 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
00604     new_mem_idx = Vh+2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + mu*Vh_2d_max + ((x[dir2]*Z[dir1] + x[dir1])>>1); \
00605     UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2);                  \
00606   }while(0)
00607 
00608 
00609 #else
00610 
00611 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do {             \
00612     switch(mydir){                                                      \
00613     case 0:                                                             \
00614       new_mem_idx = ( (x1==X1m1)?idx-X1m1:idx+1)>>1;                    \
00615       break;                                                            \
00616     case 1:                                                             \
00617       new_mem_idx = ( (x2==X2m1)?idx-X2X1mX1:idx+X1)>>1;                \
00618       break;                                                            \
00619     case 2:                                                             \
00620       new_mem_idx = ( (x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1)>>1;          \
00621       break;                                                            \
00622     case 3:                                                             \
00623       new_mem_idx = ( (x4==X4m1)?idx-X4X3X2X1mX3X2X1: idx+X3X2X1)>>1;   \
00624       break;                                                            \
00625     }                                                                   \
00626     UPDATE_COOR_PLUS(mydir, idx);                                       \
00627   }while(0)
00628 
00629 
00630 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do {            \
00631     switch(mydir){                                                      \
00632     case 0:                                                             \
00633       new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1;                     \
00634       break;                                                            \
00635     case 1:                                                             \
00636       new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1;                 \
00637       break;                                                            \
00638     case 2:                                                             \
00639       new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1;           \
00640       break;                                                            \
00641     case 3:                                                             \
00642       new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1;     \
00643       break;                                                            \
00644     }                                                                   \
00645     UPDATE_COOR_MINUS(mydir, idx);                                      \
00646      }while(0)
00647 
00648  
00649 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do {         \
00650     switch(mydir1){                                                     \
00651     case 0:                                                             \
00652       new_mem_idx = ( (x1==0)?X+X1m1:X-1);                              \
00653       break;                                                            \
00654     case 1:                                                             \
00655       new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1);                          \
00656       break;                                                            \
00657     case 2:                                                             \
00658       new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1);                    \
00659       break;                                                            \
00660     case 3:                                                             \
00661       new_mem_idx = ((x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1);               \
00662       break;                                                            \
00663     }                                                                   \
00664     switch(mydir2){                                                     \
00665     case 0:                                                             \
00666       new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1;   \
00667       break;                                                            \
00668     case 1:                                                             \
00669       new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
00670       break;                                                            \
00671     case 2:                                                             \
00672       new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
00673       break;                                                            \
00674     case 3:                                                             \
00675       new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
00676       break;                                                            \
00677     }                                                                   \
00678     UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2);                           \
00679   }while(0)
00680 
00681 #endif
00682 
00683 
00684 #define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, idx) do {                  \
00685     switch(mydir){                                                      \
00686     case 0:                                                             \
00687       new_mem_idx = (idx+1)>>1;                                         \
00688       break;                                                            \
00689     case 1:                                                             \
00690       new_mem_idx = (idx+E1)>>1;                                        \
00691       break;                                                            \
00692     case 2:                                                             \
00693       new_mem_idx = (idx+E2E1)>>1;                                      \
00694       break;                                                            \
00695     case 3:                                                             \
00696       new_mem_idx = (idx+E3E2E1)>>1;                                    \
00697       break;                                                            \
00698     }                                                                   \
00699     UPDATE_COOR_PLUS(mydir, idx);                                       \
00700   }while(0)
00701 
00702 #define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx) do {                 \
00703     switch(mydir){                                                      \
00704     case 0:                                                             \
00705       new_mem_idx = (idx-1) >> 1;                                       \
00706       break;                                                            \
00707     case 1:                                                             \
00708       new_mem_idx = (idx-E1) >> 1;                                      \
00709       break;                                                            \
00710     case 2:                                                             \
00711       new_mem_idx = (idx-E2E1) >> 1;                                    \
00712       break;                                                            \
00713     case 3:                                                             \
00714       new_mem_idx = (idx-E3E2E1) >> 1;                                  \
00715       break;                                                            \
00716     }                                                                   \
00717     UPDATE_COOR_MINUS(mydir, idx);                                      \
00718   }while(0)
00719 
00720 
00721 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2) do {      \
00722     switch(mydir1){                                                     \
00723     case 0:                                                             \
00724       new_mem_idx = X-1;                                                \
00725       break;                                                            \
00726     case 1:                                                             \
00727       new_mem_idx = X-E1;                                               \
00728       break;                                                            \
00729     case 2:                                                             \
00730       new_mem_idx = X-E2E1;                                             \
00731       break;                                                            \
00732     case 3:                                                             \
00733       new_mem_idx = X-E3E2E1;                                           \
00734       break;                                                            \
00735     }                                                                   \
00736     switch(mydir2){                                                     \
00737     case 0:                                                             \
00738       new_mem_idx = (new_mem_idx+1)>> 1;                                \
00739       break;                                                            \
00740     case 1:                                                             \
00741       new_mem_idx = (new_mem_idx+E1) >> 1;                              \
00742       break;                                                            \
00743     case 2:                                                             \
00744       new_mem_idx = (new_mem_idx+E2E1) >> 1;                            \
00745       break;                                                            \
00746     case 3:                                                             \
00747       new_mem_idx = (new_mem_idx+E3E2E1) >> 1;                          \
00748       break;                                                            \
00749     }                                                                   \
00750     UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2);                        \
00751   }while(0)
00752 
00753 
00754 
00755 
00756 template<int mu, int nu, int odd_bit>
00757   __global__ void
00758   LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 
00759                                                            FloatN* sitelink_even, FloatN* sitelink_odd, 
00760                                                            FloatM* fatlink_even, FloatM* fatlink_odd,   
00761                                                            Float mycoeff, llfat_kernel_param_t kparam)
00762 {
00763   __shared__ FloatM sd_data[NUM_FLOATS*64];
00764   
00765   //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
00766   FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;
00767   FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
00768   //FloatM STAPLE6, STAPLE7, STAPLE8;
00769     
00770   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00771     
00772   int z1 = mem_idx / X1h;
00773   short x1h = mem_idx - z1*X1h;
00774   int z2 = z1 / X2;
00775   short x2 = z1 - z2*X2;
00776   short x4 = z2 / X3;
00777   short x3 = z2 - x4*X3;
00778 
00779   short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
00780   short x1 = 2*x1h + x1odd;
00781   int X = 2*mem_idx + x1odd;    
00782 
00783   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
00784   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
00785   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
00786   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
00787   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
00788   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
00789   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
00790   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
00791 
00792   int new_mem_idx;
00793   DECLARE_VAR_SIGN;
00794   DECLARE_NEW_X;
00795   DECLARE_X_ARRAY;
00796   
00797   //int x[4] = {x1,x2,x3, x4};
00798 #ifdef MULTI_GPU
00799   int Z[4] ={X1,X2,X3,X4};
00800   int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
00801   int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
00802   int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
00803   int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
00804 #endif
00805 
00806   /* Upper staple */
00807   /* Computes the staple :
00808    *                 mu (B)
00809    *               +-------+
00810    *       nu      |       | 
00811    *         (A)   |       |(C)
00812    *               X       X
00813    *
00814    */
00815     
00816   {
00817     /* load matrix A*/
00818     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);   
00819     COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
00820     RECONSTRUCT_SITE_LINK(sign, a);
00821 
00822 
00823     /* load matrix B*/  
00824     LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);    
00825     LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
00826     COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);    
00827     RECONSTRUCT_SITE_LINK(sign, b);
00828     
00829 
00830     MULT_SU3_NN(a, b, tempa);    
00831     
00832     /* load matrix C*/
00833         
00834     LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);    
00835     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
00836     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);    
00837     RECONSTRUCT_SITE_LINK(sign, c);
00838 
00839     MULT_SU3_NA(tempa, c, staple);              
00840   }
00841 
00842   /***************lower staple****************
00843    *
00844    *                   X       X
00845    *             nu    |       | 
00846    *             (A)   |       | (C)
00847    *                   +-------+
00848    *                  mu (B)
00849    *
00850    *********************************************/
00851   {
00852     /* load matrix A*/
00853     LLFAT_COMPUTE_NEW_IDX_MINUS(nu,X);    
00854     
00855     LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
00856     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);        
00857     RECONSTRUCT_SITE_LINK(sign, a);
00858     
00859     /* load matrix B*/                          
00860     LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
00861     COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);    
00862     RECONSTRUCT_SITE_LINK(sign, b);
00863     
00864     MULT_SU3_AN(a, b, tempa);
00865     
00866     /* load matrix C*/
00867     //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
00868 #ifdef MULTI_GPU
00869     if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
00870       int idx = nu*4+mu;
00871       LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
00872     }else{
00873       LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
00874     }
00875 #else
00876       LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
00877 #endif
00878 
00879     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
00880    
00881     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);        
00882     RECONSTRUCT_SITE_LINK(sign, c);
00883     
00884     
00885     MULT_SU3_NN(tempa, c, b);           
00886     LLFAT_ADD_SU3_MATRIX(b, staple, staple);
00887   }
00888   
00889   if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
00890     LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
00891     SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
00892     WRITE_FAT_MATRIX(fatlink_even,mu,  mem_idx);        
00893   }
00894   WRITE_STAPLE_MATRIX(staple_even, mem_idx);    
00895     
00896   return;
00897 }
00898 
00899 template<int mu, int nu, int odd_bit, int save_staple>
00900   __global__ void
00901   LLFAT_KERNEL(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 
00902                                                            FloatN* sitelink_even, FloatN* sitelink_odd,
00903                                                            FloatM* fatlink_even, FloatM* fatlink_odd,                       
00904                                                            FloatM* mulink_even, FloatM* mulink_odd, 
00905                                                            Float mycoeff, llfat_kernel_param_t kparam)
00906 {
00907   __shared__ FloatM sd_data[NUM_FLOATS*64];
00908   //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;  
00909   FloatM  TEMPA5, TEMPA6, TEMPA7, TEMPA8;  
00910   FloatM TEMPB0, TEMPB1, TEMPB2, TEMPB3, TEMPB4, TEMPB5, TEMPB6, TEMPB7, TEMPB8;
00911   FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
00912   //FloatM STAPLE6, STAPLE7, STAPLE8;
00913     
00914   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00915     
00916   int z1 = mem_idx / X1h;
00917   int x1h = mem_idx - z1*X1h;
00918   int z2 = z1 / X2;
00919   int x2 = z1 - z2*X2;
00920   int x4 = z2 / X3;
00921   int x3 = z2 - x4*X3;
00922 
00923   int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
00924   int x1 = 2*x1h + x1odd;
00925   int X = 2*mem_idx + x1odd;
00926 
00927   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
00928   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
00929   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
00930   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
00931   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
00932   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
00933   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
00934   if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
00935 
00936   DECLARE_X_ARRAY;
00937 #ifdef MULTI_GPU
00938   int Z[4] ={X1,X2,X3,X4};  
00939   int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
00940   int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
00941   int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
00942   int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
00943 #endif
00944 
00945   int new_mem_idx;
00946   DECLARE_VAR_SIGN;
00947   DECLARE_NEW_X;
00948 
00949   /* Upper staple */
00950   /* Computes the staple :
00951    *                mu (BB)
00952    *               +-------+
00953    *       nu      |       | 
00954    *         (A)   |       |(C)
00955    *               X       X
00956    *
00957    */
00958   {             
00959     /* load matrix A*/
00960     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
00961     COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
00962     RECONSTRUCT_SITE_LINK(sign, a);
00963     
00964     /* load matrix BB*/
00965     LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);    
00966     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
00967     
00968     MULT_SU3_NN(a, bb, tempa);    
00969     
00970     /* load matrix C*/
00971     LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);    
00972     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
00973     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00974     RECONSTRUCT_SITE_LINK(sign, c);
00975     if (save_staple){
00976       MULT_SU3_NA(tempa, c, staple);
00977     }else{
00978       MULT_SU3_NA(tempa, c, tempb);
00979     }
00980   }
00981   
00982   /***************lower staple****************
00983    *
00984    *                   X       X
00985    *             nu    |       | 
00986    *             (A)   |       | (C)
00987    *                   +-------+
00988    *                  mu (B)
00989    *
00990    *********************************************/
00991     
00992 
00993   {
00994     /* load matrix A*/
00995     LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);
00996     
00997     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
00998     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00999     RECONSTRUCT_SITE_LINK(sign, a);
01000     
01001     /* load matrix B*/
01002     LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);                         
01003     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
01004     
01005     MULT_SU3_AN(a, bb, tempa);
01006     
01007     /* load matrix C*/
01008     //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
01009 #ifdef MULTI_GPU
01010     if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
01011       int idx = nu*4+mu; 
01012       LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
01013     }else{
01014       LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
01015     }
01016 #else
01017     LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
01018 #endif
01019 
01020     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
01021     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
01022     RECONSTRUCT_SITE_LINK(sign, c);                             
01023     
01024     MULT_SU3_NN(tempa, c, a);   
01025     if(save_staple){
01026       LLFAT_ADD_SU3_MATRIX(staple, a, staple);
01027     }else{
01028       LLFAT_ADD_SU3_MATRIX(a, tempb, tempb);
01029     }
01030   }
01031 
01032   LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
01033   if(save_staple){
01034     if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
01035       SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
01036     }
01037     WRITE_STAPLE_MATRIX(staple_even, mem_idx);              
01038   }else{
01039     if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
01040       SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat);     
01041     }else{
01042       //The code should never be here
01043       //because it makes no sense to split kernels when no staple is stored
01044       //print error?
01045     }
01046   }
01047 
01048   WRITE_FAT_MATRIX(fatlink_even, mu,  mem_idx); 
01049   
01050   return;
01051 }
01052 
01053 __global__ void 
01054 LLFAT_KERNEL(llfatOneLink, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
01055                                         FloatM* fatlink_even, FloatM* fatlink_odd,
01056                                         Float coeff0, Float coeff5)
01057 {
01058   FloatN* my_sitelink;
01059   FloatM* my_fatlink;
01060   int sid = blockIdx.x*blockDim.x + threadIdx.x;
01061   int mem_idx = sid;
01062 
01063 #if (RECONSTRUCT != 18)
01064   int odd_bit= 0;
01065 #endif
01066 
01067   my_sitelink = sitelink_even;
01068   my_fatlink = fatlink_even;
01069   if (mem_idx >= Vh){
01070 #if (RECONSTRUCT != 18)
01071     odd_bit=1;
01072 #endif
01073     mem_idx = mem_idx - Vh;
01074     my_sitelink = sitelink_odd;
01075     my_fatlink = fatlink_odd;
01076   }
01077    
01078 #if (RECONSTRUCT != 18)
01079   int z1 = mem_idx / X1h;
01080   int x1h = mem_idx - z1*X1h;
01081   int z2 = z1 / X2;
01082   int x2 = z1 - z2*X2;
01083   int x4 = z2 / X3;
01084   int x3 = z2 - x4*X3;
01085   int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
01086   int x1 = 2*x1h + x1odd; 
01087   DECLARE_VAR_SIGN;
01088 #endif
01089 
01090   for(int dir=0;dir < 4; dir++){
01091     LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
01092     COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
01093     RECONSTRUCT_SITE_LINK(sign, a);
01094 
01095     LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);
01096         
01097     SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat); 
01098     
01099     WRITE_FAT_MATRIX(my_fatlink,dir, mem_idx);  
01100   }
01101     
01102   return;
01103 }
01104 
01105 
01106 
01107 
01108 template<int mu, int nu, int odd_bit>
01109   __global__ void
01110   LLFAT_KERNEL_EX(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 
01111                                                               FloatN* sitelink_even, FloatN* sitelink_odd, 
01112                                                               FloatM* fatlink_even, FloatM* fatlink_odd,        
01113                                                               Float mycoeff, llfat_kernel_param_t kparam)
01114 {
01115 #if 1
01116   extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu
01117 
01118   
01119   //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
01120   FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;
01121   FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
01122     
01123   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
01124   if(mem_idx >= kparam.threads) return;
01125 
01126   int z1 = mem_idx/D1h;
01127   short x1h = mem_idx - z1*D1h;
01128   int z2 = z1/D2;
01129   short x2 = z1 - z2*D2;
01130   short x4 = z2/D3;
01131   short x3 = z2 - x4*D3;
01132   
01133   short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
01134   short x1 = 2*x1h + x1odd;
01135   
01136   x1 += kparam.base_idx;
01137   x2 += kparam.base_idx;
01138   x3 += kparam.base_idx;
01139   x4 += kparam.base_idx;
01140   int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
01141   mem_idx = X/2;
01142  
01143   int new_mem_idx;
01144   DECLARE_VAR_SIGN;
01145   DECLARE_NEW_X;
01146 
01147   /* Upper staple */
01148   /* Computes the staple :
01149    *                 mu (B)
01150    *               +-------+
01151    *       nu      |       | 
01152    *         (A)   |       |(C)
01153    *               X       X
01154    *
01155    */
01156     
01157   {
01158     /* load matrix A*/
01159     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);   
01160     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2));
01161     RECONSTRUCT_SITE_LINK(sign, a);
01162 
01163 
01164     
01165     /* load matrix B*/  
01166     LLFAT_COMPUTE_NEW_IDX_PLUS_EX(nu, X);    
01167     LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
01168     COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));    
01169     RECONSTRUCT_SITE_LINK(sign, b);
01170     
01171 
01172     MULT_SU3_NN(a, b, tempa);    
01173     
01174     /* load matrix C*/
01175         
01176     LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mu, X);    
01177     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
01178     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));    
01179     RECONSTRUCT_SITE_LINK(sign, c);
01180 
01181     MULT_SU3_NA(tempa, c, staple);                 
01182  
01183   }
01184 
01185   /***************lower staple****************
01186    *
01187    *                   X       X
01188    *             nu    |       | 
01189    *             (A)   |       | (C)
01190    *                   +-------+
01191    *                  mu (B)
01192    *
01193    *********************************************/
01194     {
01195     /* load matrix A*/
01196     LLFAT_COMPUTE_NEW_IDX_MINUS_EX(nu,X);    
01197     
01198     LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
01199     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));        
01200     RECONSTRUCT_SITE_LINK(sign, a);
01201     
01202     /* load matrix B*/                          
01203     LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
01204     COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));    
01205     RECONSTRUCT_SITE_LINK(sign, b);
01206     
01207     MULT_SU3_AN(a, b, tempa);
01208 
01209 
01210 
01211     /* load matrix C*/
01212     LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(nu, mu);
01213     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
01214    
01215     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));        
01216     RECONSTRUCT_SITE_LINK(sign, c);
01217     
01218     
01219     MULT_SU3_NN(tempa, c, b);           
01220     LLFAT_ADD_SU3_MATRIX(b, staple, staple);
01221 
01222   }
01223    
01224     
01225     if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2
01226         || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){
01227     int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1;
01228     
01229     LOAD_EVEN_FAT_MATRIX(mu, orig_idx);
01230     SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
01231     WRITE_FAT_MATRIX(fatlink_even,mu,  orig_idx);       
01232   }
01233   WRITE_STAPLE_MATRIX(staple_even, mem_idx);    
01234 
01235 #endif
01236     
01237   return;
01238 }
01239 
01240 template<int mu, int nu, int odd_bit, int save_staple>
01241   __global__ void
01242   LLFAT_KERNEL_EX(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 
01243                                                               FloatN* sitelink_even, FloatN* sitelink_odd,
01244                                                               FloatM* fatlink_even, FloatM* fatlink_odd,                            
01245                                                               FloatM* mulink_even, FloatM* mulink_odd, 
01246                                                               Float mycoeff, llfat_kernel_param_t kparam)
01247 {
01248 #if 1
01249   //__shared__ FloatM sd_data[NUM_FLOATS*64];
01250   extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu
01251   //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;  
01252   FloatM  TEMPA5, TEMPA6, TEMPA7, TEMPA8;  
01253   FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
01254   
01255   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
01256   if(mem_idx >= kparam.threads) return;
01257   
01258   int z1 = mem_idx/D1h;
01259   short x1h = mem_idx - z1*D1h;
01260   int z2 = z1/D2;
01261   short x2 = z1 - z2*D2;
01262   short x4 = z2/D3;
01263   short x3 = z2 - x4*D3;
01264 
01265   short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
01266   short x1 = 2*x1h + x1odd;
01267 
01268   x1 += kparam.base_idx;
01269   x2 += kparam.base_idx;
01270   x3 += kparam.base_idx;
01271   x4 += kparam.base_idx;
01272   int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
01273   mem_idx = X/2;
01274 
01275   int new_mem_idx;
01276   DECLARE_VAR_SIGN;
01277   DECLARE_NEW_X;
01278 
01279   /* Upper staple */
01280   /* Computes the staple :
01281    *                mu (BB)
01282    *               +-------+
01283    *       nu      |       | 
01284    *         (A)   |       |(C)
01285    *               X       X
01286    *
01287    */
01288   {             
01289     /* load matrix A*/
01290     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
01291     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2));
01292     RECONSTRUCT_SITE_LINK(sign, a);
01293     
01294     /* load matrix BB*/
01295     LLFAT_COMPUTE_NEW_IDX_PLUS_EX(nu, X);    
01296     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
01297     MULT_SU3_NN(a, bb, tempa);    
01298     
01299     /* load matrix C*/
01300     LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mu, X);    
01301     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
01302     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
01303     RECONSTRUCT_SITE_LINK(sign, c);
01304     
01305     MULT_SU3_NA(tempa, c, staple);
01306   }
01307   
01308   /***************lower staple****************
01309    *
01310    *                   X       X
01311    *             nu    |       | 
01312    *             (A)   |       | (C)
01313    *                   +-------+
01314    *                  mu (B)
01315    *
01316    *********************************************/
01317     
01318   {
01319     /* load matrix A*/
01320     LLFAT_COMPUTE_NEW_IDX_MINUS_EX(nu, X);
01321     
01322     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
01323     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
01324     RECONSTRUCT_SITE_LINK(sign, a);
01325     
01326     /* load matrix B*/
01327     LLFAT_COMPUTE_NEW_IDX_MINUS_EX(nu, X);                              
01328     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
01329     
01330     MULT_SU3_AN(a, bb, tempa);
01331     
01332     /* load matrix C*/
01333  
01334     LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(nu, mu);
01335     
01336     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
01337     COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2));
01338     RECONSTRUCT_SITE_LINK(sign, c);                             
01339     
01340     MULT_SU3_NN(tempa, c, a);   
01341 
01342     LLFAT_ADD_SU3_MATRIX(a, staple, staple);
01343   }
01344 
01345     
01346   if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2
01347         || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){
01348     int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1;
01349     LOAD_EVEN_FAT_MATRIX(mu, orig_idx);    
01350     SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);    
01351     WRITE_FAT_MATRIX(fatlink_even, mu,  orig_idx);      
01352   }
01353 
01354   if(save_staple){
01355     WRITE_STAPLE_MATRIX(staple_even, mem_idx);              
01356   }
01357 #endif
01358   
01359   return;
01360 }
01361 
01362 
01363 __global__ void 
01364 LLFAT_KERNEL_EX(llfatOneLink, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
01365                                            FloatM* fatlink_even, FloatM* fatlink_odd,
01366                                            Float coeff0, Float coeff5, llfat_kernel_param_t kparam)
01367 {
01368 #if 1
01369 
01370   FloatN* my_sitelink;
01371   FloatM* my_fatlink;
01372   int sid = blockIdx.x*blockDim.x + threadIdx.x;
01373   int idx = sid;
01374   
01375   if(sid >= 2*kparam.threads) return;
01376   
01377   short odd_bit= 0;
01378   
01379   my_sitelink = sitelink_even;
01380   my_fatlink = fatlink_even;
01381   if (idx >= kparam.threads){
01382     odd_bit=1;
01383     idx = idx - kparam.threads;
01384     my_sitelink = sitelink_odd;
01385     my_fatlink = fatlink_odd;
01386   }
01387   
01388   int z1 = idx/D1h;
01389   short x1h = idx - z1*D1h;
01390   int z2 = z1/D2;
01391   short x2 = z1 - z2*D2;
01392   int x4 = z2/D3;
01393   short x3 = z2 - x4*D3;
01394   short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
01395   short x1 = 2*x1h + x1odd; 
01396   DECLARE_VAR_SIGN;
01397 
01398   x1 += kparam.base_idx;
01399   x2 += kparam.base_idx;
01400   x3 += kparam.base_idx;
01401   x4 += kparam.base_idx;
01402   int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1;
01403   int mem_idx = X/2;
01404 
01405   for(int dir=0;dir < 4; dir++){
01406     LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
01407     COMPUTE_RECONSTRUCT_SIGN(sign, dir, (x1-2), (x2-2), (x3-2), (x4-2));
01408     RECONSTRUCT_SITE_LINK(sign, a);
01409 
01410     LOAD_FAT_MATRIX(my_fatlink, dir, idx);
01411     
01412     SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat); 
01413     
01414     WRITE_FAT_MATRIX(my_fatlink,dir, idx);       
01415   }
01416 #endif
01417     
01418   return;
01419 }
01420 
01421 
01422 
01423 #undef DECLARE_VAR_SIGN 
01424 #undef DECLARE_NEW_X 
01425 #undef DECLARE_X_ARRAY
01426 
01427 #undef a00_re 
01428 #undef a00_im 
01429 #undef a01_re 
01430 #undef a01_im 
01431 #undef a02_re 
01432 #undef a02_im 
01433 #undef a10_re 
01434 #undef a10_im 
01435 #undef a11_re 
01436 #undef a11_im 
01437 #undef a12_re 
01438 #undef a12_im 
01439 #undef a20_re 
01440 #undef a20_im 
01441 #undef a21_re 
01442 #undef a21_im 
01443 #undef a22_re 
01444 #undef a22_im 
01445 
01446 #undef b00_re 
01447 #undef b00_im 
01448 #undef b01_re 
01449 #undef b01_im 
01450 #undef b02_re 
01451 #undef b02_im 
01452 #undef b10_re 
01453 #undef b10_im 
01454 #undef b11_re 
01455 #undef b11_im 
01456 #undef b12_re 
01457 #undef b12_im 
01458 #undef b20_re 
01459 #undef b20_im 
01460 #undef b21_re 
01461 #undef b21_im 
01462 #undef b22_re 
01463 #undef b22_im 
01464 
01465 #undef bb00_re 
01466 #undef bb00_im 
01467 #undef bb01_re 
01468 #undef bb01_im 
01469 #undef bb02_re 
01470 #undef bb02_im 
01471 #undef bb10_re 
01472 #undef bb10_im 
01473 #undef bb11_re 
01474 #undef bb11_im 
01475 #undef bb12_re 
01476 #undef bb12_im 
01477 #undef bb20_re 
01478 #undef bb20_im 
01479 #undef bb21_re 
01480 #undef bb21_im 
01481 #undef bb22_re 
01482 #undef bb22_im 
01483 
01484 #undef c00_re 
01485 #undef c00_im 
01486 #undef c01_re 
01487 #undef c01_im 
01488 #undef c02_re 
01489 #undef c02_im 
01490 #undef c10_re 
01491 #undef c10_im 
01492 #undef c11_re 
01493 #undef c11_im 
01494 #undef c12_re 
01495 #undef c12_im 
01496 #undef c20_re 
01497 #undef c20_im 
01498 #undef c21_re 
01499 #undef c21_im 
01500 #undef c22_re 
01501 #undef c22_im 
01502 
01503 #undef aT00_re 
01504 #undef aT00_im 
01505 #undef aT01_re 
01506 #undef aT01_im 
01507 #undef aT02_re 
01508 #undef aT02_im 
01509 #undef aT10_re 
01510 #undef aT10_im 
01511 #undef aT11_re 
01512 #undef aT11_im 
01513 #undef aT12_re 
01514 #undef aT12_im 
01515 #undef aT20_re 
01516 #undef aT20_im 
01517 #undef aT21_re 
01518 #undef aT21_im 
01519 #undef aT22_re 
01520 #undef aT22_im 
01521 
01522 #undef bT00_re 
01523 #undef bT00_im 
01524 #undef bT01_re 
01525 #undef bT01_im 
01526 #undef bT02_re 
01527 #undef bT02_im 
01528 #undef bT10_re 
01529 #undef bT10_im 
01530 #undef bT11_re 
01531 #undef bT11_im 
01532 #undef bT12_re 
01533 #undef bT12_im 
01534 #undef bT20_re 
01535 #undef bT20_im 
01536 #undef bT21_re 
01537 #undef bT21_im 
01538 #undef bT22_re 
01539 #undef bT22_im 
01540 
01541 #undef cT00_re 
01542 #undef cT00_im 
01543 #undef cT01_re 
01544 #undef cT01_im 
01545 #undef cT02_re 
01546 #undef cT02_im 
01547 #undef cT10_re 
01548 #undef cT10_im 
01549 #undef cT11_re 
01550 #undef cT11_im 
01551 #undef cT12_re 
01552 #undef cT12_im 
01553 #undef cT20_re 
01554 #undef cT20_im 
01555 #undef cT21_re 
01556 #undef cT21_im 
01557 #undef cT22_re 
01558 #undef cT22_im 
01559 
01560 
01561 #undef tempa00_re 
01562 #undef tempa00_im 
01563 #undef tempa01_re 
01564 #undef tempa01_im 
01565 #undef tempa02_re 
01566 #undef tempa02_im 
01567 #undef tempa10_re 
01568 #undef tempa10_im 
01569 #undef tempa11_re 
01570 #undef tempa11_im 
01571 #undef tempa12_re 
01572 #undef tempa12_im 
01573 #undef tempa20_re 
01574 #undef tempa20_im 
01575 #undef tempa21_re 
01576 #undef tempa21_im 
01577 #undef tempa22_re 
01578 #undef tempa22_im 
01579 
01580 #undef tempb00_re 
01581 #undef tempb00_im 
01582 #undef tempb01_re 
01583 #undef tempb01_im 
01584 #undef tempb02_re 
01585 #undef tempb02_im 
01586 #undef tempb10_re 
01587 #undef tempb10_im 
01588 #undef tempb11_re 
01589 #undef tempb11_im 
01590 #undef tempb12_re 
01591 #undef tempb12_im 
01592 #undef tempb20_re 
01593 #undef tempb20_im 
01594 #undef tempb21_re 
01595 #undef tempb21_im 
01596 #undef tempb22_re 
01597 #undef tempb22_im 
01598 
01599 #undef fat00_re 
01600 #undef fat00_im 
01601 #undef fat01_re 
01602 #undef fat01_im 
01603 #undef fat02_re 
01604 #undef fat02_im 
01605 #undef fat10_re 
01606 #undef fat10_im 
01607 #undef fat11_re 
01608 #undef fat11_im 
01609 #undef fat12_re 
01610 #undef fat12_im 
01611 #undef fat20_re 
01612 #undef fat20_im 
01613 #undef fat21_re 
01614 #undef fat21_im 
01615 #undef fat22_re 
01616 #undef fat22_im 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines