QUDA v0.3.2
A library for QCD on GPUs

quda/lib/llfat_core.h

Go to the documentation of this file.
00001 
/*
 * Component accessors for the three 3x3 complex matrices "a", "b" and "c".
 *
 * A name of the form <m><row><col>_re / <m><row><col>_im expands to one field
 * of a variable A<k>, B<k> or C<k>.  Those variables are not declared in this
 * file; presumably the LOAD_*_SITE_MATRIX macros used by the kernels declare
 * them -- TODO confirm against the including .cu file.
 *
 * Two storage layouts are selected at preprocessing time:
 *
 *  - PRECISION == 1 with RECONSTRUCT == 12 or 8:
 *      two complex entries per variable, using the .x/.y/.z/.w fields, so the
 *      nine entries span A0..A4 (A4 only uses .x/.y for entry 22).
 *  - every other combination:
 *      one complex entry per variable (.x = real part, .y = imaginary part),
 *      so the nine entries span A0..A8.
 */
00002 #if ((PRECISION == 1) && (RECONSTRUCT == 12 || RECONSTRUCT == 8))
/* Packed layout: entries (00,01) (02,10) (11,12) (20,21) (22,-) share a variable. */
00003 #define a00_re A0.x
00004 #define a00_im A0.y
00005 #define a01_re A0.z
00006 #define a01_im A0.w
00007 #define a02_re A1.x
00008 #define a02_im A1.y
00009 #define a10_re A1.z
00010 #define a10_im A1.w
00011 #define a11_re A2.x
00012 #define a11_im A2.y
00013 #define a12_re A2.z
00014 #define a12_im A2.w
00015 #define a20_re A3.x
00016 #define a20_im A3.y
00017 #define a21_re A3.z
00018 #define a21_im A3.w
00019 #define a22_re A4.x
00020 #define a22_im A4.y
00021 
00022 #define b00_re B0.x
00023 #define b00_im B0.y
00024 #define b01_re B0.z
00025 #define b01_im B0.w
00026 #define b02_re B1.x
00027 #define b02_im B1.y
00028 #define b10_re B1.z
00029 #define b10_im B1.w
00030 #define b11_re B2.x
00031 #define b11_im B2.y
00032 #define b12_re B2.z
00033 #define b12_im B2.w
00034 #define b20_re B3.x
00035 #define b20_im B3.y
00036 #define b21_re B3.z
00037 #define b21_im B3.w
00038 #define b22_re B4.x
00039 #define b22_im B4.y
00040 
00041 #define c00_re C0.x
00042 #define c00_im C0.y
00043 #define c01_re C0.z
00044 #define c01_im C0.w
00045 #define c02_re C1.x
00046 #define c02_im C1.y
00047 #define c10_re C1.z
00048 #define c10_im C1.w
00049 #define c11_re C2.x
00050 #define c11_im C2.y
00051 #define c12_re C2.z
00052 #define c12_im C2.w
00053 #define c20_re C3.x
00054 #define c20_im C3.y
00055 #define c21_re C3.z
00056 #define c21_im C3.w
00057 #define c22_re C4.x
00058 #define c22_im C4.y
00059 
00060 #else
00061 
/* Unpacked layout: one complex entry per variable, row-major order 00,01,02,10,...,22. */
00062 #define a00_re A0.x
00063 #define a00_im A0.y
00064 #define a01_re A1.x
00065 #define a01_im A1.y
00066 #define a02_re A2.x
00067 #define a02_im A2.y
00068 #define a10_re A3.x
00069 #define a10_im A3.y
00070 #define a11_re A4.x
00071 #define a11_im A4.y
00072 #define a12_re A5.x
00073 #define a12_im A5.y
00074 #define a20_re A6.x
00075 #define a20_im A6.y
00076 #define a21_re A7.x
00077 #define a21_im A7.y
00078 #define a22_re A8.x
00079 #define a22_im A8.y
00080 
00081 #define b00_re B0.x
00082 #define b00_im B0.y
00083 #define b01_re B1.x
00084 #define b01_im B1.y
00085 #define b02_re B2.x
00086 #define b02_im B2.y
00087 #define b10_re B3.x
00088 #define b10_im B3.y
00089 #define b11_re B4.x
00090 #define b11_im B4.y
00091 #define b12_re B5.x
00092 #define b12_im B5.y
00093 #define b20_re B6.x
00094 #define b20_im B6.y
00095 #define b21_re B7.x
00096 #define b21_im B7.y
00097 #define b22_re B8.x
00098 #define b22_im B8.y
00099 
00100 #define c00_re C0.x
00101 #define c00_im C0.y
00102 #define c01_re C1.x
00103 #define c01_im C1.y
00104 #define c02_re C2.x
00105 #define c02_im C2.y
00106 #define c10_re C3.x
00107 #define c10_im C3.y
00108 #define c11_re C4.x
00109 #define c11_im C4.y
00110 #define c12_re C5.x
00111 #define c12_im C5.y
00112 #define c20_re C6.x
00113 #define c20_im C6.y
00114 #define c21_re C7.x
00115 #define c21_im C7.y
00116 #define c22_re C8.x
00117 #define c22_im C8.y
00118 
00119 #endif
00120 
00121 
/*
 * Component accessors for the matrix "bb", stored in BB0..BB8.
 * Unlike a/b/c, these sit outside the PRECISION/RECONSTRUCT conditional, so
 * "bb" always uses one complex entry per variable (.x = real, .y = imaginary).
 * In this file BB is only filled through LOAD_ODD_MULINK_MATRIX.
 */
00122 #define bb00_re BB0.x
00123 #define bb00_im BB0.y
00124 #define bb01_re BB1.x
00125 #define bb01_im BB1.y
00126 #define bb02_re BB2.x
00127 #define bb02_im BB2.y
00128 #define bb10_re BB3.x
00129 #define bb10_im BB3.y
00130 #define bb11_re BB4.x
00131 #define bb11_im BB4.y
00132 #define bb12_re BB5.x
00133 #define bb12_im BB5.y
00134 #define bb20_re BB6.x
00135 #define bb20_im BB6.y
00136 #define bb21_re BB7.x
00137 #define bb21_im BB7.y
00138 #define bb22_re BB8.x
00139 #define bb22_im BB8.y
00140 
00141 
00142 
/*
 * Conjugate-transpose views of "a", "b" and "c".
 *
 * <m>T<i><j>_re expands to +<m><j><i>_re and <m>T<i><j>_im to -<m><j><i>_im,
 * i.e. entry (i,j) of the view is the complex conjugate of entry (j,i) of the
 * underlying matrix.  No data is copied: the views are built on top of the
 * a/b/c accessors, so they work with either storage layout.
 *
 * They are not referenced by name in this file; presumably the MULT_SU3_*
 * macros with an "A" operand use them -- TODO confirm.
 */
00143 #define aT00_re (+a00_re)
00144 #define aT00_im (-a00_im)
00145 #define aT01_re (+a10_re)
00146 #define aT01_im (-a10_im)
00147 #define aT02_re (+a20_re)
00148 #define aT02_im (-a20_im)
00149 #define aT10_re (+a01_re)
00150 #define aT10_im (-a01_im)
00151 #define aT11_re (+a11_re)
00152 #define aT11_im (-a11_im)
00153 #define aT12_re (+a21_re)
00154 #define aT12_im (-a21_im)
00155 #define aT20_re (+a02_re)
00156 #define aT20_im (-a02_im)
00157 #define aT21_re (+a12_re)
00158 #define aT21_im (-a12_im)
00159 #define aT22_re (+a22_re)
00160 #define aT22_im (-a22_im)
00161 
00162 #define bT00_re (+b00_re)
00163 #define bT00_im (-b00_im)
00164 #define bT01_re (+b10_re)
00165 #define bT01_im (-b10_im)
00166 #define bT02_re (+b20_re)
00167 #define bT02_im (-b20_im)
00168 #define bT10_re (+b01_re)
00169 #define bT10_im (-b01_im)
00170 #define bT11_re (+b11_re)
00171 #define bT11_im (-b11_im)
00172 #define bT12_re (+b21_re)
00173 #define bT12_im (-b21_im)
00174 #define bT20_re (+b02_re)
00175 #define bT20_im (-b02_im)
00176 #define bT21_re (+b12_re)
00177 #define bT21_im (-b12_im)
00178 #define bT22_re (+b22_re)
00179 #define bT22_im (-b22_im)
00180 
00181 #define cT00_re (+c00_re)
00182 #define cT00_im (-c00_im)
00183 #define cT01_re (+c10_re)
00184 #define cT01_im (-c10_im)
00185 #define cT02_re (+c20_re)
00186 #define cT02_im (-c20_im)
00187 #define cT10_re (+c01_re)
00188 #define cT10_im (-c01_im)
00189 #define cT11_re (+c11_re)
00190 #define cT11_im (-c11_im)
00191 #define cT12_re (+c21_re)
00192 #define cT12_im (-c21_im)
00193 #define cT20_re (+c02_re)
00194 #define cT20_im (-c02_im)
00195 #define cT21_re (+c12_re)
00196 #define cT21_im (-c12_im)
00197 #define cT22_re (+c22_re)
00198 #define cT22_im (-c22_im)
00199 
00200 
/*
 * Component accessors for the scratch matrices "tempa" and "tempb".
 * Both use one complex entry per variable (.x = real, .y = imaginary) in
 * TEMPA0..TEMPA8 and TEMPB0..TEMPB8; the kernels in this file declare those
 * variables as FloatM locals.
 */
00201 #define tempa00_re TEMPA0.x
00202 #define tempa00_im TEMPA0.y
00203 #define tempa01_re TEMPA1.x
00204 #define tempa01_im TEMPA1.y
00205 #define tempa02_re TEMPA2.x
00206 #define tempa02_im TEMPA2.y
00207 #define tempa10_re TEMPA3.x
00208 #define tempa10_im TEMPA3.y
00209 #define tempa11_re TEMPA4.x
00210 #define tempa11_im TEMPA4.y
00211 #define tempa12_re TEMPA5.x
00212 #define tempa12_im TEMPA5.y
00213 #define tempa20_re TEMPA6.x
00214 #define tempa20_im TEMPA6.y
00215 #define tempa21_re TEMPA7.x
00216 #define tempa21_im TEMPA7.y
00217 #define tempa22_re TEMPA8.x
00218 #define tempa22_im TEMPA8.y
00219 
00220 #define tempb00_re TEMPB0.x
00221 #define tempb00_im TEMPB0.y
00222 #define tempb01_re TEMPB1.x
00223 #define tempb01_im TEMPB1.y
00224 #define tempb02_re TEMPB2.x
00225 #define tempb02_im TEMPB2.y
00226 #define tempb10_re TEMPB3.x
00227 #define tempb10_im TEMPB3.y
00228 #define tempb11_re TEMPB4.x
00229 #define tempb11_im TEMPB4.y
00230 #define tempb12_re TEMPB5.x
00231 #define tempb12_im TEMPB5.y
00232 #define tempb20_re TEMPB6.x
00233 #define tempb20_im TEMPB6.y
00234 #define tempb21_re TEMPB7.x
00235 #define tempb21_im TEMPB7.y
00236 #define tempb22_re TEMPB8.x
00237 #define tempb22_im TEMPB8.y
00238 
00239 
/*
 * Component accessors for the fat-link matrix "fat", stored in FAT0..FAT8
 * with one complex entry per variable (.x = real, .y = imaginary).
 * The layout does not depend on RECONSTRUCT: all nine entries are always
 * kept, as the original note below states.
 */
00240 //fat link is not compressible
00241 #define fat00_re FAT0.x
00242 #define fat00_im FAT0.y
00243 #define fat01_re FAT1.x
00244 #define fat01_im FAT1.y
00245 #define fat02_re FAT2.x
00246 #define fat02_im FAT2.y
00247 #define fat10_re FAT3.x
00248 #define fat10_im FAT3.y
00249 #define fat11_re FAT4.x
00250 #define fat11_im FAT4.y
00251 #define fat12_re FAT5.x
00252 #define fat12_im FAT5.y
00253 #define fat20_re FAT6.x
00254 #define fat20_im FAT6.y
00255 #define fat21_re FAT7.x
00256 #define fat21_im FAT7.y
00257 #define fat22_re FAT8.x
00258 #define fat22_im FAT8.y
00259 
/*
 * Kernel: builds the (mu, nu) staple out of site links for the site owned by
 * this thread, accumulates it into the mu fat link and also stores the staple.
 *
 * The staple is the sum of the "upper" and "lower" three-link paths drawn in
 * the diagrams inside the body.  After both paths are summed the kernel
 * evaluates SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat) (presumably
 * fat += mycoeff * staple -- the macro is defined elsewhere) and writes both
 * the fat link and the staple back.
 *
 * Template parameters:
 *   mu       direction of the fat link being updated (leg "B" in the diagrams)
 *   nu       direction of the two side legs "A" and "C"
 *   odd_bit  parity offset added when recovering the x1 coordinate
 *
 * Parameters:
 *   staple_even    staple output, written by WRITE_STAPLE_MATRIX at mem_idx
 *   fatlink_even   fat-link field, written by WRITE_FAT_MATRIX for direction mu
 *   mycoeff        scalar weight handed to SCALAR_MULT_ADD_SU3_MATRIX
 *   staple_odd, sitelink_even, sitelink_odd, fatlink_odd
 *                  never named in the body; presumably some of them are
 *                  referenced from inside the LOAD_* macros -- TODO confirm
 *
 * The stapleNN_re/_im accessors used by the macros are not defined in this
 * file and must come from the including translation unit.
 *
 * NOTE(review): there is no range check on mem_idx, so the launch grid has to
 * cover exactly the valid sites -- confirm against the host-side launcher.
 * NOTE(review): this kernel keeps the coordinates in `short` and `sign` in
 * `float`, while the other kernels in this file use `int` for both.
 */
00260 template<int mu, int nu, int odd_bit>
00261   __global__ void
00262   LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 
00263                                                            FloatN* sitelink_even, FloatN* sitelink_odd, 
00264                                                            FloatM* fatlink_even, FloatM* fatlink_odd,   
00265                                                            Float mycoeff)
00266 {
        /* Scratch product and the accumulated staple, nine complex entries each. */
00267   FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
00268   FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
00269     
        /* One thread per site: flat index from block and thread ids. */
00270   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00271     
        /* Peel mem_idx apart into (x1h, x2, x3, x4) using the extents X1h, X2, X3. */
00272   int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
00273   short x1h = mem_idx - z1*X1h;
00274   int z2 = FAST_INT_DIVIDE(z1, X2);
00275   short x2 = z1 - z2*X2;
00276   short x4 = FAST_INT_DIVIDE(z2, X3);
00277   short x3 = z2 - x4*X3;
        /* x1h is half of x1; the low bit comes from the other coordinates plus odd_bit. */
00278   short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
00279   short x1 = 2*x1h + x1odd;
        /* Index with the x1 low bit restored; handed to the neighbour-index macros. */
00280   int X = 2*mem_idx + x1odd;    
00281   float sign =1;    
        /* new_mem_idx is never assigned in the visible code, so the
         * LLFAT_COMPUTE_NEW_IDX_* macros must set it; presumably they also
         * update new_x1..new_x4 -- TODO confirm in the macro definitions. */
00282   int new_mem_idx;
00283   int new_x1 = x1;
00284   int new_x2 = x2;
00285   int new_x3 = x3;
00286   int new_x4 = x4;
00287     
00288     
00289   /* Upper staple */
00290   /* Computes the staple :
00291    *                 mu (B)
00292    *             +-------+
00293    *       nu      |       | 
00294    *         (A)   |       |(C)
00295    *               X       X
00296    *
00297    */
00298     
00299   {
00300     /* load matrix A*/
00301     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);   
00302     COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
00303     RECONSTRUCT_SITE_LINK(nu, mem_idx, sign, a);
00304     
00305     /* load matrix B*/  
00306     LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);    
00307     LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
00308     COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);    
00309     RECONSTRUCT_SITE_LINK(mu, new_mem_idx, sign, b);
00310     
          /* tempa <- a combined with b (presumably the plain product a*b). */
00311     MULT_SU3_NN(a, b, tempa);    
00312     
00313     /* load matrix C*/
00314         
00315     LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);    
00316     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
00317     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);    
00318     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);
00319     
          /* staple <- tempa combined with c; "NA" presumably means the second
           * operand enters as its adjoint -- TODO confirm. */
00320     MULT_SU3_NA(tempa, c, staple);              
00321   }
00322 
00323   /***************lower staple****************
00324    *
00325    *                 X       X
00326    *           nu    |       | 
00327    *             (A)   |       | (C)
00328    *                   +-------+
00329    *                  mu (B)
00330    *
00331    *********************************************/
00332   {
00333     /* load matrix A*/
00334     LLFAT_COMPUTE_NEW_IDX_MINUS(nu,X);    
00335     
00336     LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
00337     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);        
00338     RECONSTRUCT_SITE_LINK(nu, (new_mem_idx), sign, a);
00339     
          /* B is read at the same shifted index as A: no new index is computed in between. */
00340     /* load matrix B*/                          
00341     LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
00342     COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);    
00343     RECONSTRUCT_SITE_LINK(mu, (new_mem_idx), sign, b);
00344     
          /* "AN" presumably means the first operand enters as its adjoint -- TODO confirm. */
00345     MULT_SU3_AN(a, b, tempa);
00346     
00347     /* load matrix C*/
00348     LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
00349     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
00350     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);        
00351     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);
00352     
00353     
          /* b is reused as scratch for the lower path, then folded into staple. */
00354     MULT_SU3_NN(tempa, c, b);           
00355     LLFAT_ADD_SU3_MATRIX(b, staple, staple);
00356   }
00357   
        /* Fold the summed staple into the mu fat link and write both results back. */
00358   LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
00359     SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
00360     WRITE_FAT_MATRIX(fatlink_even,mu,  mem_idx);        
00361     
00362     WRITE_STAPLE_MATRIX(staple_even, mem_idx);  
00363     
00364   return;
00365 }
00366 
00367 
/*
 * Kernel: builds a staple whose middle leg comes from a separate "mulink"
 * field (loaded into BB) instead of a site link, and accumulates it into the
 * mu fat link.  The staple itself is kept only in the local TEMPB variables
 * and is not written out.
 *
 * The side legs A and C are site links in direction nu, handled exactly as in
 * the diagrams inside the body.  At the end the kernel evaluates
 * SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat) (presumably
 * fat += mycoeff * tempb -- the macro is defined elsewhere) and writes the
 * fat link back.
 *
 * Template parameters:
 *   mu       direction of the fat link being updated
 *   nu       direction of the two side legs "A" and "C"
 *   odd_bit  parity offset added when recovering the x1 coordinate
 *
 * Parameters:
 *   fatlink_even   fat-link field, written by WRITE_FAT_MATRIX for direction mu
 *   mycoeff        scalar weight handed to SCALAR_MULT_ADD_SU3_MATRIX
 *   sitelink_even, sitelink_odd, fatlink_odd, mulink_even, mulink_odd
 *                  never named in the body; presumably some of them are
 *                  referenced from inside the LOAD_* macros -- TODO confirm
 *
 * NOTE(review): there is no range check on mem_idx, so the launch grid has to
 * cover exactly the valid sites -- confirm against the host-side launcher.
 */
00368 template<int mu, int nu, int odd_bit>
00369   __global__ void
00370   LLFAT_KERNEL(do_computeGenStapleFieldParity, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
00371                                                             FloatM* fatlink_even, FloatM* fatlink_odd,                      
00372                                                             FloatM* mulink_even, FloatM* mulink_odd, 
00373                                                             Float mycoeff)
00374 {
        /* TEMPA: scratch product; TEMPB: the accumulated staple. */
00375   FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
00376   FloatM TEMPB0, TEMPB1, TEMPB2, TEMPB3, TEMPB4, TEMPB5, TEMPB6, TEMPB7, TEMPB8;
00377 
        /* One thread per site: flat index from block and thread ids. */
00378   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00379     
        /* Peel mem_idx apart into (x1h, x2, x3, x4) using the extents X1h, X2, X3. */
00380   int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
00381   int x1h = mem_idx - z1*X1h;
00382   int z2 = FAST_INT_DIVIDE(z1, X2);
00383   int x2 = z1 - z2*X2;
00384   int x4 = FAST_INT_DIVIDE(z2, X3);
00385   int x3 = z2 - x4*X3;
        /* x1h is half of x1; the low bit comes from the other coordinates plus odd_bit. */
00386   int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
00387   int x1 = 2*x1h + x1odd;
        /* Index with the x1 low bit restored; handed to the neighbour-index macros. */
00388   int X = 2*mem_idx + x1odd;
00389     
00390   int sign =1;
00391 
        /* new_mem_idx is never assigned in the visible code, so the
         * LLFAT_COMPUTE_NEW_IDX_* macros must set it; presumably they also
         * update new_x1..new_x4 -- TODO confirm in the macro definitions. */
00392   int new_mem_idx;
00393   int new_x1 = x1;
00394   int new_x2 = x2;
00395   int new_x3 = x3;
00396   int new_x4 = x4;
00397 
00398     
00399   /* Upper staple */
00400   /* Computes the staple :
00401    *                mu (BB)
00402    *               +-------+
00403    *       nu      |       | 
00404    *         (A)   |       |(C)
00405    *               X       X
00406    *
00407    */
00408   {
00409     /* load matrix A*/
00410     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
00411     COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
00412     RECONSTRUCT_SITE_LINK(nu, mem_idx, sign, a);
00413     
00414     /* load matrix BB*/
00415     
          /* BB is loaded with direction argument 0 and gets no sign/reconstruct step. */
00416     LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);
00417     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
00418     
00419     MULT_SU3_NN(a, bb, tempa);
00420     /* load matrix C*/
00421     
00422     LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);    
00423     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
00424     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00425     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);
00426     
00427     
          /* tempb <- tempa combined with c; "NA" presumably means the second
           * operand enters as its adjoint -- TODO confirm. */
00428     MULT_SU3_NA(tempa, c, tempb);               
00429   }
00430  
00431   /***************lower staple****************
00432    *
00433    *                   X       X
00434    *             nu    |       | 
00435    *             (A)   |       | (C)
00436    *                   +-------+
00437    *                  mu (BB)
00438    *
00439    *********************************************/
00440                 
00441   {             
00442     /* load matrix A*/
00443     LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);
00444     
00445     LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
00446     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00447     RECONSTRUCT_SITE_LINK(nu, (new_mem_idx), sign, a);
00448     
00449     /* load matrix BB (read at the same shifted index as A) */                          
00450     LOAD_ODD_MULINK_MATRIX(0, (new_mem_idx), BB);
00451     
          /* "AN" presumably means the first operand enters as its adjoint -- TODO confirm. */
00452     MULT_SU3_AN(a, bb, tempa);
00453     
00454     /* load matrix C*/
00455     LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
00456     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
00457     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00458     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);
00459     
00460     
          /* a is reused as scratch for the lower path, then folded into tempb. */
00461     MULT_SU3_NN(tempa, c, a);
00462         
00463     LLFAT_ADD_SU3_MATRIX(a, tempb, tempb);
00464   }
00465 
        /* Fold the summed staple into the mu fat link and write it back. */
00466   LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
00467   SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat); 
00468   
00469   WRITE_FAT_MATRIX(fatlink_even, mu,  mem_idx); 
00470   
00471   return;
00472 }
00473 
/*
 * Kernel: builds a staple whose middle leg comes from a separate "mulink"
 * field (loaded into BB), accumulates it into the mu fat link, and also
 * writes the staple itself to staple_even.
 *
 * The side legs A and C are site links in direction nu, handled exactly as in
 * the diagrams inside the body.  At the end the kernel evaluates
 * SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat) (presumably
 * fat += mycoeff * staple -- the macro is defined elsewhere).
 *
 * Template parameters:
 *   mu       direction of the fat link being updated
 *   nu       direction of the two side legs "A" and "C"
 *   odd_bit  parity offset added when recovering the x1 coordinate
 *
 * Parameters:
 *   staple_even    staple output, written by WRITE_STAPLE_MATRIX at mem_idx
 *   fatlink_even   fat-link field, written by WRITE_FAT_MATRIX for direction mu
 *   mycoeff        scalar weight handed to SCALAR_MULT_ADD_SU3_MATRIX
 *   staple_odd, sitelink_even, sitelink_odd, fatlink_odd, mulink_even, mulink_odd
 *                  never named in the body; presumably some of them are
 *                  referenced from inside the LOAD_* macros -- TODO confirm
 *
 * The stapleNN_re/_im accessors used by the macros are not defined in this
 * file and must come from the including translation unit.
 *
 * NOTE(review): there is no range check on mem_idx, so the launch grid has to
 * cover exactly the valid sites -- confirm against the host-side launcher.
 */
00474 template<int mu, int nu, int odd_bit>
00475   __global__ void
00476 LLFAT_KERNEL(do_computeGenStapleFieldSaveParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 
00477                                                              FloatN* sitelink_even, FloatN* sitelink_odd,
00478                                                              FloatM* fatlink_even, FloatM* fatlink_odd,                     
00479                                                              FloatM* mulink_even, FloatM* mulink_odd, 
00480                                                              Float mycoeff)
00481 {
        /* Scratch product and the accumulated staple, nine complex entries each. */
00482   FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;  
00483   FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
00484     
        /* One thread per site: flat index from block and thread ids. */
00485   int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;
00486     
        /* Peel mem_idx apart into (x1h, x2, x3, x4) using the extents X1h, X2, X3. */
00487   int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
00488   int x1h = mem_idx - z1*X1h;
00489   int z2 = FAST_INT_DIVIDE(z1, X2);
00490   int x2 = z1 - z2*X2;
00491   int x4 = FAST_INT_DIVIDE(z2, X3);
00492   int x3 = z2 - x4*X3;
        /* x1h is half of x1; the low bit comes from the other coordinates plus odd_bit. */
00493   int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
00494   int x1 = 2*x1h + x1odd;
        /* Index with the x1 low bit restored; handed to the neighbour-index macros. */
00495   int X = 2*mem_idx + x1odd;
00496     
00497   int sign =1;
00498     
        /* new_mem_idx is never assigned in the visible code, so the
         * LLFAT_COMPUTE_NEW_IDX_* macros must set it; presumably they also
         * update new_x1..new_x4 -- TODO confirm in the macro definitions. */
00499   int new_mem_idx;
00500   int new_x1 = x1;
00501   int new_x2 = x2;
00502   int new_x3 = x3;
00503   int new_x4 = x4;
00504 
00505     
00506   /* Upper staple */
00507   /* Computes the staple :
00508    *                mu (BB)
00509    *             +-------+
00510    *       nu      |       | 
00511    *         (A)   |       |(C)
00512    *               X       X
00513    *
00514    */
00515   {             
00516     /* load matrix A*/
00517     LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
00518     COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
00519     RECONSTRUCT_SITE_LINK(nu, mem_idx, sign, a);
00520     
00521     /* load matrix BB*/
          /* BB is loaded with direction argument 0 and gets no sign/reconstruct step. */
00522     LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);    
00523     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
00524     
00525     MULT_SU3_NN(a, bb, tempa);    
00526     
00527     /* load matrix C*/
00528     LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);    
00529     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
00530     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00531     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);
00532     
          /* staple <- tempa combined with c; "NA" presumably means the second
           * operand enters as its adjoint -- TODO confirm. */
00533     MULT_SU3_NA(tempa, c, staple);
00534   }
00535   
00536   /***************lower staple****************
00537    *
00538    *                 X       X
00539    *           nu    |       | 
00540    *             (A)   |       | (C)
00541    *                   +-------+
00542    *                  mu (BB)
00543    *
00544    *********************************************/
00545     
00546 
00547   {
00548     /* load matrix A*/
00549     LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);
00550     
00551     LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
00552     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00553     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, a);
00554     
00555     /* load matrix BB (read at the same shifted index as A) */                          
00556     LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);
00557     
          /* "AN" presumably means the first operand enters as its adjoint -- TODO confirm. */
00558     MULT_SU3_AN(a, bb, tempa);
00559     
00560     /* load matrix C*/
00561     LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
00562     
00563     LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
00564     COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
00565     RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);                            
00566     
          /* a is reused as scratch for the lower path, then folded into staple. */
00567     MULT_SU3_NN(tempa, c, a);   
00568     LLFAT_ADD_SU3_MATRIX(staple, a, staple);
00569   }
00570 
        /* Fold the summed staple into the mu fat link and write both results back. */
00571   LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
00572   SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
00573     
00574   WRITE_FAT_MATRIX(fatlink_even, mu,  mem_idx); 
00575   WRITE_STAPLE_MATRIX(staple_even, mem_idx);                
00576   
00577   return;
00578 }
00579 
/*
 * Kernel: one-link term.  For every site and each of the four directions it
 * loads the site link, reconstructs it, and writes the fat link produced by
 * SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat) (presumably
 * fat = (coeff0 - 6*coeff5) * a -- the macro is defined elsewhere).
 *
 * Unlike the staple kernels in this file, the thread index covers both
 * parities: indices below Vh use the *_even arrays, indices from Vh upwards
 * use the *_odd arrays with Vh subtracted.
 *
 * Parameters:
 *   sitelink_even, sitelink_odd   site-link fields, one per parity (input)
 *   fatlink_even,  fatlink_odd    fat-link fields, one per parity (output)
 *   coeff0, coeff5                scalars combined as coeff0 - 6.0*coeff5
 *
 * NOTE(review): there is no upper range check on the thread index, so the
 * launch grid has to cover exactly the valid sites -- confirm against the
 * host-side launcher.
 * NOTE(review): 6.0 is a double literal, so when Float is a single-precision
 * type the coefficient is evaluated in double precision -- confirm intended.
 */
00580 __global__ void 
00581 LLFAT_KERNEL(llfatOneLink, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
00582                                         FloatM* fatlink_even, FloatM* fatlink_odd,
00583                                         Float coeff0, Float coeff5)
00584 {
00585 
00586   FloatN* my_sitelink;
00587   FloatM* my_fatlink;
00588   int sid = blockIdx.x*blockDim.x + threadIdx.x;
00589   int mem_idx = sid;
00590 
        /* Default to the even parity; switch to odd for the upper half of the index range. */
00591   int odd_bit= 0;
00592   my_sitelink = sitelink_even;
00593   my_fatlink = fatlink_even;
00594   if (mem_idx >= Vh){
00595     odd_bit=1;
00596     mem_idx = mem_idx - Vh;
00597     my_sitelink = sitelink_odd;
00598     my_fatlink = fatlink_odd;
00599   }
00600    
        /* Peel mem_idx apart into (x1h, x2, x3, x4) using the extents X1h, X2, X3. */
00601   int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
00602   int x1h = mem_idx - z1*X1h;
00603   int z2 = FAST_INT_DIVIDE(z1, X2);
00604   int x2 = z1 - z2*X2;
00605   int x4 = FAST_INT_DIVIDE(z2, X3);
00606   int x3 = z2 - x4*X3;
        /* x1h is half of x1; the low bit comes from the other coordinates plus odd_bit. */
00607   int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
00608   int x1 = 2*x1h + x1odd;
00609   int sign =1;          
00610 
00611   for(int dir=0;dir < 4; dir++){
00612     LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
00613     COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
00614     RECONSTRUCT_SITE_LINK(dir, mem_idx, sign, a);
00615 
          /* NOTE(review): fat is the destination argument of the next macro,
           * so this load looks redundant; presumably it is also what
           * declares FAT0..FAT8 -- confirm before removing. */
00616     LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);
00617         
00618     SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat); 
00619     
00620     WRITE_FAT_MATRIX(my_fatlink,dir, mem_idx);  
00621   }
00622     
00623   return;
00624 }
00625 
00626 
00627 
/*
 * Cleanup: remove every component-accessor macro defined at the top of this
 * file (a, b, bb, c, their conjugate-transpose views aT/bT/cT, tempa, tempb
 * and fat).  Undefining them here means a later inclusion of this file can
 * redefine them without redefinition clashes; presumably the file is included
 * once per PRECISION/RECONSTRUCT combination -- TODO confirm in the includer.
 */
00628 #undef a00_re 
00629 #undef a00_im 
00630 #undef a01_re 
00631 #undef a01_im 
00632 #undef a02_re 
00633 #undef a02_im 
00634 #undef a10_re 
00635 #undef a10_im 
00636 #undef a11_re 
00637 #undef a11_im 
00638 #undef a12_re 
00639 #undef a12_im 
00640 #undef a20_re 
00641 #undef a20_im 
00642 #undef a21_re 
00643 #undef a21_im 
00644 #undef a22_re 
00645 #undef a22_im 
00646 
00647 #undef b00_re 
00648 #undef b00_im 
00649 #undef b01_re 
00650 #undef b01_im 
00651 #undef b02_re 
00652 #undef b02_im 
00653 #undef b10_re 
00654 #undef b10_im 
00655 #undef b11_re 
00656 #undef b11_im 
00657 #undef b12_re 
00658 #undef b12_im 
00659 #undef b20_re 
00660 #undef b20_im 
00661 #undef b21_re 
00662 #undef b21_im 
00663 #undef b22_re 
00664 #undef b22_im 
00665 
00666 #undef bb00_re 
00667 #undef bb00_im 
00668 #undef bb01_re 
00669 #undef bb01_im 
00670 #undef bb02_re 
00671 #undef bb02_im 
00672 #undef bb10_re 
00673 #undef bb10_im 
00674 #undef bb11_re 
00675 #undef bb11_im 
00676 #undef bb12_re 
00677 #undef bb12_im 
00678 #undef bb20_re 
00679 #undef bb20_im 
00680 #undef bb21_re 
00681 #undef bb21_im 
00682 #undef bb22_re 
00683 #undef bb22_im 
00684 
00685 #undef c00_re 
00686 #undef c00_im 
00687 #undef c01_re 
00688 #undef c01_im 
00689 #undef c02_re 
00690 #undef c02_im 
00691 #undef c10_re 
00692 #undef c10_im 
00693 #undef c11_re 
00694 #undef c11_im 
00695 #undef c12_re 
00696 #undef c12_im 
00697 #undef c20_re 
00698 #undef c20_im 
00699 #undef c21_re 
00700 #undef c21_im 
00701 #undef c22_re 
00702 #undef c22_im 
00703 
00704 #undef aT00_re 
00705 #undef aT00_im 
00706 #undef aT01_re 
00707 #undef aT01_im 
00708 #undef aT02_re 
00709 #undef aT02_im 
00710 #undef aT10_re 
00711 #undef aT10_im 
00712 #undef aT11_re 
00713 #undef aT11_im 
00714 #undef aT12_re 
00715 #undef aT12_im 
00716 #undef aT20_re 
00717 #undef aT20_im 
00718 #undef aT21_re 
00719 #undef aT21_im 
00720 #undef aT22_re 
00721 #undef aT22_im 
00722 
00723 #undef bT00_re 
00724 #undef bT00_im 
00725 #undef bT01_re 
00726 #undef bT01_im 
00727 #undef bT02_re 
00728 #undef bT02_im 
00729 #undef bT10_re 
00730 #undef bT10_im 
00731 #undef bT11_re 
00732 #undef bT11_im 
00733 #undef bT12_re 
00734 #undef bT12_im 
00735 #undef bT20_re 
00736 #undef bT20_im 
00737 #undef bT21_re 
00738 #undef bT21_im 
00739 #undef bT22_re 
00740 #undef bT22_im 
00741 
00742 #undef cT00_re 
00743 #undef cT00_im 
00744 #undef cT01_re 
00745 #undef cT01_im 
00746 #undef cT02_re 
00747 #undef cT02_im 
00748 #undef cT10_re 
00749 #undef cT10_im 
00750 #undef cT11_re 
00751 #undef cT11_im 
00752 #undef cT12_re 
00753 #undef cT12_im 
00754 #undef cT20_re 
00755 #undef cT20_im 
00756 #undef cT21_re 
00757 #undef cT21_im 
00758 #undef cT22_re 
00759 #undef cT22_im 
00760 
00761 
00762 #undef tempa00_re 
00763 #undef tempa00_im 
00764 #undef tempa01_re 
00765 #undef tempa01_im 
00766 #undef tempa02_re 
00767 #undef tempa02_im 
00768 #undef tempa10_re 
00769 #undef tempa10_im 
00770 #undef tempa11_re 
00771 #undef tempa11_im 
00772 #undef tempa12_re 
00773 #undef tempa12_im 
00774 #undef tempa20_re 
00775 #undef tempa20_im 
00776 #undef tempa21_re 
00777 #undef tempa21_im 
00778 #undef tempa22_re 
00779 #undef tempa22_im 
00780 
00781 #undef tempb00_re 
00782 #undef tempb00_im 
00783 #undef tempb01_re 
00784 #undef tempb01_im 
00785 #undef tempb02_re 
00786 #undef tempb02_im 
00787 #undef tempb10_re 
00788 #undef tempb10_im 
00789 #undef tempb11_re 
00790 #undef tempb11_im 
00791 #undef tempb12_re 
00792 #undef tempb12_im 
00793 #undef tempb20_re 
00794 #undef tempb20_im 
00795 #undef tempb21_re 
00796 #undef tempb21_im 
00797 #undef tempb22_re 
00798 #undef tempb22_im 
00799 
00800 #undef fat00_re 
00801 #undef fat00_im 
00802 #undef fat01_re 
00803 #undef fat01_im 
00804 #undef fat02_re 
00805 #undef fat02_im 
00806 #undef fat10_re 
00807 #undef fat10_im 
00808 #undef fat11_re 
00809 #undef fat11_im 
00810 #undef fat12_re 
00811 #undef fat12_im 
00812 #undef fat20_re 
00813 #undef fat20_im 
00814 #undef fat21_re 
00815 #undef fat21_im 
00816 #undef fat22_re 
00817 #undef fat22_im 
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines