QUDA v0.3.2
A library for QCD on GPUs

quda/lib/llfat_quda.cu

Go to the documentation of this file.
00001 #include <stdio.h>
00002 
00003 #include <quda_internal.h>
00004 #include <llfat_quda.h>
00005 #include <read_gauge.h>
00006 #include <gauge_quda.h>
00007 #include <cuda_runtime.h>
00008 #include <cuda.h>
00009 
00010 #define WRITE_FAT_MATRIX(gauge, dir, idx)do {           \
00011     gauge[idx + dir*Vhx9] = FAT0;                       \
00012     gauge[idx + dir*Vhx9 + Vh  ] = FAT1;                \
00013     gauge[idx + dir*Vhx9 + Vhx2] = FAT2;                \
00014     gauge[idx + dir*Vhx9 + Vhx3] = FAT3;                \
00015     gauge[idx + dir*Vhx9 + Vhx4] = FAT4;                \
00016     gauge[idx + dir*Vhx9 + Vhx5] = FAT5;                \
00017     gauge[idx + dir*Vhx9 + Vhx6] = FAT6;                \
00018     gauge[idx + dir*Vhx9 + Vhx7] = FAT7;                \
00019     gauge[idx + dir*Vhx9 + Vhx8] = FAT8;} while(0)                      
00020 
00021 
00022 #define WRITE_STAPLE_MATRIX(gauge, idx)         \
00023   gauge[idx] = STAPLE0;                         \
00024   gauge[idx + Vh] = STAPLE1;                    \
00025   gauge[idx + Vhx2] = STAPLE2;                  \
00026   gauge[idx + Vhx3] = STAPLE3;                  \
00027   gauge[idx + Vhx4] = STAPLE4;                  \
00028   gauge[idx + Vhx5] = STAPLE5;                  \
00029   gauge[idx + Vhx6] = STAPLE6;                  \
00030   gauge[idx + Vhx7] = STAPLE7;                  \
00031   gauge[idx + Vhx8] = STAPLE8;                                  
00032     
00033 
00034 #define SCALAR_MULT_SU3_MATRIX(a, b, c) \
00035   c##00_re = a*b##00_re;                \
00036   c##00_im = a*b##00_im;                \
00037   c##01_re = a*b##01_re;                \
00038   c##01_im = a*b##01_im;                \
00039   c##02_re = a*b##02_re;                \
00040   c##02_im = a*b##02_im;                \
00041   c##10_re = a*b##10_re;                \
00042   c##10_im = a*b##10_im;                \
00043   c##11_re = a*b##11_re;                \
00044   c##11_im = a*b##11_im;                \
00045   c##12_re = a*b##12_re;                \
00046   c##12_im = a*b##12_im;                \
00047   c##20_re = a*b##20_re;                \
00048   c##20_im = a*b##20_im;                \
00049   c##21_re = a*b##21_re;                \
00050   c##21_im = a*b##21_im;                \
00051   c##22_re = a*b##22_re;                \
00052   c##22_im = a*b##22_im;                \
00053     
00054 
00055 #define LOAD_MATRIX_18_SINGLE(gauge, dir, idx, var)                     \
00056   float2 var##0 = gauge[idx + dir*Vhx9];                                \
00057   float2 var##1 = gauge[idx + dir*Vhx9 + Vh];                           \
00058   float2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];                         \
00059   float2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];                         \
00060   float2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];                         \
00061   float2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];                         \
00062   float2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];                         \
00063   float2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];                         \
00064   float2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];
00065 
00066 #define LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var)                 \
00067   float2 var##0 = tex1Dfetch(gauge, idx + dir*Vhx9);                    \
00068   float2 var##1 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vh);               \
00069   float2 var##2 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx2);             \
00070   float2 var##3 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx3);             \
00071   float2 var##4 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx4);             \
00072   float2 var##5 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx5);             \
00073   float2 var##6 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx6);             \
00074   float2 var##7 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx7);             \
00075   float2 var##8 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx8); 
00076 
00077 #define LOAD_MATRIX_18_DOUBLE(gauge, dir, idx, var)                     \
00078   double2 var##0 = gauge[idx + dir*Vhx9];                               \
00079   double2 var##1 = gauge[idx + dir*Vhx9 + Vh];                          \
00080   double2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];                                \
00081   double2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];                                \
00082   double2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];                                \
00083   double2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];                                \
00084   double2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];                                \
00085   double2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];                                \
00086   double2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];
00087 
00088 #define LOAD_MATRIX_18_DOUBLE_TEX(gauge, dir, idx, var)                 \
00089   double2 var##0 = fetch_double2(gauge, idx + dir*Vhx9);                        \
00090   double2 var##1 = fetch_double2(gauge, idx + dir*Vhx9 + Vh);           \
00091   double2 var##2 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx2);         \
00092   double2 var##3 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx3);         \
00093   double2 var##4 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx4);         \
00094   double2 var##5 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx5);         \
00095   double2 var##6 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx6);         \
00096   double2 var##7 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx7);         \
00097   double2 var##8 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx8); 
00098 
00099 
00100 #define SITE_MATRIX_LOAD_TEX 0
00101 #define MULINK_LOAD_TEX 0
00102 #define FATLINK_LOAD_TEX 0
00103 
00104 
00105 #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var)             \
00106   float4 var##0 = gauge[idx + dir*Vhx3];                                \
00107   float4 var##1 = gauge[idx + dir*Vhx3 + Vh];                           \
00108   float4 var##2 = gauge[idx + dir*Vhx3 + Vhx2];                         \
00109   float4 var##3, var##4;
00110 
00111 #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var)         \
00112   float4 var##0 = tex1Dfetch(gauge, idx + dir* Vhx3);                   \
00113   float4 var##1 = tex1Dfetch(gauge, idx + dir*Vhx3 + Vh);               \
00114   float4 var##2 = tex1Dfetch(gauge, idx + dir*Vhx3 + Vhx2);             \
00115   float4 var##3, var##4;
00116 
00117 #define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var)             \
00118   float2 var##0 = gauge[idx + dir*Vhx9];                                \
00119   float2 var##1 = gauge[idx + dir*Vhx9 + Vh];                           \
00120   float2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];                         \
00121   float2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];                         \
00122   float2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];                         \
00123   float2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];                         \
00124   float2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];                         \
00125   float2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];                         \
00126   float2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];                         
00127 
00128 
00129 #define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var)         \
00130   float2 var##0 = tex1Dfetch(gauge, idx + dir*Vhx9);                    \
00131   float2 var##1 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vh);               \
00132   float2 var##2 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx2);             \
00133   float2 var##3 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx3);             \
00134   float2 var##4 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx4);             \
00135   float2 var##5 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx5);             \
00136   float2 var##6 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx6);             \
00137   float2 var##7 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx7);             \
00138   float2 var##8 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx8);             
00139 
00140 
00141 
00142 #define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var)             \
00143   double2 var##0 = gauge[idx + dir*Vhx9];                               \
00144   double2 var##1 = gauge[idx + dir*Vhx9 + Vh];                          \
00145   double2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];                        \
00146   double2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];                        \
00147   double2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];                        \
00148   double2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];                        \
00149   double2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];                        \
00150   double2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];                        \
00151   double2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];                                
00152 
00153 
00154 #define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge, dir, idx, var)         \
00155   double2 var##0 = fetch_double2(gauge, idx + dir*Vhx9);                \
00156   double2 var##1 = fetch_double2(gauge, idx + dir*Vhx9 + Vh);           \
00157   double2 var##2 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx2);         \
00158   double2 var##3 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx3);         \
00159   double2 var##4 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx4);         \
00160   double2 var##5 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx5);         \
00161   double2 var##6 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx6);         \
00162   double2 var##7 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx7);         \
00163   double2 var##8 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx8);         
00164 
00165 
00166 #define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var)             \
00167   double2 var##0 = gauge[idx + dir*Vhx6];                               \
00168   double2 var##1 = gauge[idx + dir*Vhx6 + Vh];                          \
00169   double2 var##2 = gauge[idx + dir*Vhx6 + Vhx2];                        \
00170   double2 var##3 = gauge[idx + dir*Vhx6 + Vhx3];                        \
00171   double2 var##4 = gauge[idx + dir*Vhx6 + Vhx4];                        \
00172   double2 var##5 = gauge[idx + dir*Vhx6 + Vhx5];                        \
00173   double2 var##6, var##7, var##8;
00174 
00175 
00176 #define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge, dir, idx, var)         \
00177   double2 var##0 = fetch_double2(gauge, idx + dir*Vhx6);                \
00178   double2 var##1 = fetch_double2(gauge, idx + dir*Vhx6 + Vh);           \
00179   double2 var##2 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx2);         \
00180   double2 var##3 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx3);         \
00181   double2 var##4 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx4);         \
00182   double2 var##5 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx5);         \
00183   double2 var##6, var##7, var##8;
00184 
00185 #define LLFAT_ADD_SU3_MATRIX(ma, mb, mc)        \
00186   mc##00_re = ma##00_re + mb##00_re;            \
00187   mc##00_im = ma##00_im + mb##00_im;            \
00188   mc##01_re = ma##01_re + mb##01_re;            \
00189   mc##01_im = ma##01_im + mb##01_im;            \
00190   mc##02_re = ma##02_re + mb##02_re;            \
00191   mc##02_im = ma##02_im + mb##02_im;            \
00192   mc##10_re = ma##10_re + mb##10_re;            \
00193   mc##10_im = ma##10_im + mb##10_im;            \
00194   mc##11_re = ma##11_re + mb##11_re;            \
00195   mc##11_im = ma##11_im + mb##11_im;            \
00196   mc##12_re = ma##12_re + mb##12_re;            \
00197   mc##12_im = ma##12_im + mb##12_im;            \
00198   mc##20_re = ma##20_re + mb##20_re;            \
00199   mc##20_im = ma##20_im + mb##20_im;            \
00200   mc##21_re = ma##21_re + mb##21_re;            \
00201   mc##21_im = ma##21_im + mb##21_im;            \
00202   mc##22_re = ma##22_re + mb##22_re;            \
00203   mc##22_im = ma##22_im + mb##22_im;            
00204 
00205 
00206 
00207 void
00208 llfat_init_cuda(QudaGaugeParam* param)
00209 {
00210   static int llfat_init_cuda_flag = 0;
00211   if (llfat_init_cuda_flag){
00212     return;
00213   }
00214     
00215   llfat_init_cuda_flag = 1;
00216   
00217   init_kernel_cuda(param);
00218 
00219 }
00220 
00221 
00222  
00223 #define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do {         \
00224     new_x1 = x1;                                                        \
00225     new_x2 = x2;                                                        \
00226     new_x3 = x3;                                                        \
00227     new_x4 = x4;                                                        \
00228     switch(mydir1){                                                     \
00229     case 0:                                                             \
00230       new_mem_idx = ( (x1==0)?X+X1m1:X-1);                              \
00231       new_x1 = (x1==0)?X1m1:x1 - 1;                                     \
00232       break;                                                            \
00233     case 1:                                                             \
00234       new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1);                          \
00235       new_x2 = (x2==0)?X2m1:x2 - 1;                                     \
00236       break;                                                            \
00237     case 2:                                                             \
00238       new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1);                    \
00239       new_x3 = (x3==0)?X3m1:x3 - 1;                                     \
00240       break;                                                            \
00241     case 3:                                                             \
00242       new_mem_idx = ( (x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1);              \
00243       new_x4 = (x4==0)?X4m1:x4 - 1;                                     \
00244       break;                                                            \
00245     }                                                                   \
00246     switch(mydir2){                                                     \
00247     case 0:                                                             \
00248       new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1;   \
00249       new_x1 = (x1==X1m1)?0:x1+1;                                       \
00250       break;                                                            \
00251     case 1:                                                             \
00252       new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
00253       new_x2 = (x2==X2m1)?0:x2+1;                                       \
00254       break;                                                            \
00255     case 2:                                                             \
00256       new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
00257       new_x3 = (x3==X3m1)?0:x3+1;                                       \
00258       break;                                                            \
00259     case 3:                                                             \
00260       new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
00261       new_x4 = (x4==X4m1)?0:x4+1;                                       \
00262       break;                                                            \
00263     }                                                                   \
00264   }while(0)
00265 
00266 
00267 
00268 #define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do {                     \
00269     new_x1 = x1;                                                        \
00270     new_x2 = x2;                                                        \
00271     new_x3 = x3;                                                        \
00272     new_x4 = x4;                                                        \
00273     switch(mydir){                                                      \
00274     case 0:                                                             \
00275       new_mem_idx = ( (x1==X1m1)?idx-X1m1:idx+1)>>1;                    \
00276       new_x1 = (x1==X1m1)?0:x1+1;                                       \
00277       break;                                                            \
00278     case 1:                                                             \
00279       new_mem_idx = ( (x2==X2m1)?idx-X2X1mX1:idx+X1)>>1;                \
00280       new_x2 = (x2==X2m1)?0:x2+1;                                       \
00281       break;                                                            \
00282     case 2:                                                             \
00283       new_mem_idx = ( (x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1)>>1;          \
00284       new_x3 = (x3==X3m1)?0:x3+1;                                       \
00285       break;                                                            \
00286     case 3:                                                             \
00287       new_mem_idx = ( (x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1)>>1;    \
00288       new_x4 = (x4==X4m1)?0:x4+1;                                       \
00289       break;                                                            \
00290     }                                                                   \
00291   }while(0)
00292 
00293 #define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do {                    \
00294     new_x1 = x1;                                                        \
00295     new_x2 = x2;                                                        \
00296     new_x3 = x3;                                                        \
00297     new_x4 = x4;                                                        \
00298     switch(mydir){                                                      \
00299     case 0:                                                             \
00300       new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1;                     \
00301       new_x1 = (x1==0)?X1m1:x1 - 1;                                     \
00302       break;                                                            \
00303     case 1:                                                             \
00304       new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1;                 \
00305       new_x2 = (x2==0)?X2m1:x2 - 1;                                     \
00306       break;                                                            \
00307     case 2:                                                             \
00308       new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1;           \
00309       new_x3 = (x3==0)?X3m1:x3 - 1;                                     \
00310       break;                                                            \
00311     case 3:                                                             \
00312       new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1;     \
00313       new_x4 = (x4==0)?X4m1:x4 - 1;                                     \
00314       break;                                                            \
00315     }                                                                   \
00316   }while(0)
00317 
00318     
00319 
00320 #define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do {   \
00321     sign =1;                                                    \
00322     switch(dir){                                                \
00323     case XUP:                                                   \
00324       if ( (i4 & 1) == 1){                                      \
00325         sign = -1;                                              \
00326       }                                                         \
00327       break;                                                    \
00328     case YUP:                                                   \
00329       if ( ((i4+i1) & 1) == 1){                                 \
00330         sign = -1;                                              \
00331       }                                                         \
00332       break;                                                    \
00333     case ZUP:                                                   \
00334       if ( ((i4+i1+i2) & 1) == 1){                              \
00335         sign = -1;                                              \
00336       }                                                         \
00337       break;                                                    \
00338     case TUP:                                                   \
00339       if (i4 == X4m1 ){                                         \
00340         sign = -1;                                              \
00341       }                                                         \
00342       break;                                                    \
00343     }                                                           \
00344   }while (0)
00345 
00346 
00347 #define LLFAT_CONCAT(a,b) a##b##Kernel
00348 #define LLFAT_KERNEL(a,b) LLFAT_CONCAT(a,b)
00349 
00350 //precision: 0 is for double, 1 is for single
00351 
00352 //single precision, common macro
00353 #define PRECISION 1
00354 #define Float  float
00355 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE(gauge, dir, idx, FAT)
00356 #if (MULINK_LOAD_TEX == 1)
00357 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(muLink0TexSingle, dir, idx, var)
00358 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(muLink1TexSingle, dir, idx, var)
00359 #else
00360 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE(mulink_even, dir, idx, var)
00361 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE(mulink_odd, dir, idx, var)
00362 #endif
00363 
00364 #if (FATLINK_LOAD_TEX == 1)
00365 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX(fatGauge0TexSingle, dir, idx, FAT)
00366 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX(fatGauge1TexSingle, dir, idx, FAT)
00367 #else
00368 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE(fatlink_even, dir, idx, FAT)
00369 #define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_SINGLE(fatlink_odd, dir, idx, FAT)
00370 #endif
00371 
00372 
00373 //single precision, 12-reconstruct
00374 #define SITELINK0TEX siteLink0TexSingle
00375 #define SITELINK1TEX siteLink1TexSingle
00376 #if (SITE_MATRIX_LOAD_TEX == 1)
00377 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var)
00378 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var)
00379 #else
00380 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var)
00381 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var)
00382 #endif
00383 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var)
00384 
00385 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var)  RECONSTRUCT_LINK_12(dir, idx, sign, var);
00386 #define FloatN float4
00387 #define FloatM float2
00388 #define RECONSTRUCT 12
00389 #include "llfat_core.h"
00390 #undef SITELINK0TEX
00391 #undef SITELINK1TEX
00392 #undef LOAD_EVEN_SITE_MATRIX
00393 #undef LOAD_ODD_SITE_MATRIX
00394 #undef LOAD_SITE_MATRIX
00395 #undef RECONSTRUCT_SITE_LINK
00396 #undef FloatN
00397 #undef FloatM
00398 #undef RECONSTRUCT
00399 
00400 //single precision, 18-reconstruct
00401 #define SITELINK0TEX siteLink0TexSingle_norecon
00402 #define SITELINK1TEX siteLink1TexSingle_norecon
00403 #if (SITE_MATRIX_LOAD_TEX == 1)
00404 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var)
00405 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var)
00406 #else
00407 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var)
00408 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var)
00409 #endif
00410 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE(sitelink, dir, idx, var)
00411 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var)  
00412 #define FloatN float2
00413 #define FloatM float2
00414 #define RECONSTRUCT 18
00415 #include "llfat_core.h"
00416 #undef SITELINK0TEX
00417 #undef SITELINK1TEX
00418 #undef LOAD_EVEN_SITE_MATRIX
00419 #undef LOAD_ODD_SITE_MATRIX
00420 #undef LOAD_SITE_MATRIX
00421 #undef RECONSTRUCT_SITE_LINK
00422 #undef FloatN
00423 #undef FloatM
00424 #undef RECONSTRUCT
00425 
00426 
00427 #undef PRECISION
00428 #undef Float
00429 #undef LOAD_FAT_MATRIX
00430 #undef LOAD_EVEN_MULINK_MATRIX
00431 #undef LOAD_ODD_MULINK_MATRIX
00432 #undef LOAD_EVEN_FAT_MATRIX
00433 #undef LOAD_ODD_FAT_MATRIX
00434 
00435 
00436 //double precision, common macro
00437 #define PRECISION 0
00438 #define Float double
00439 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE(gauge, dir, idx, FAT)
00440 #if (MULINK_LOAD_TEX == 1)
00441 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(muLink0TexDouble, dir, idx, var)
00442 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(muLink1TexDouble, dir, idx, var)
00443 #else
00444 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_even, dir, idx, var)
00445 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_odd, dir, idx, var)
00446 #endif
00447 
00448 #if (FATLINK_LOAD_TEX == 1)
00449 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX(fatGauge0TexDouble, dir, idx, FAT)
00450 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX(fatGauge1TexDouble, dir, idx, FAT)
00451 #else
00452 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE(fatlink_even, dir, idx, FAT)
00453 #define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_DOUBLE(fatlink_odd, dir, idx, FAT)
00454 #endif
00455 
00456 //double precision,  18-reconstruct
00457 #define SITELINK0TEX siteLink0TexDouble
00458 #define SITELINK1TEX siteLink1TexDouble
00459 #if (SITE_MATRIX_LOAD_TEX == 1)
00460 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var)
00461 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var)
00462 #else
00463 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var)
00464 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var)
00465 #endif
00466 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE(sitelink, dir, idx, var)
00467 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var)  
00468 #define FloatN double2
00469 #define FloatM double2
00470 #define RECONSTRUCT 18
00471 #include "llfat_core.h"
00472 #undef SITELINK0TEX
00473 #undef SITELINK1TEX
00474 #undef LOAD_EVEN_SITE_MATRIX
00475 #undef LOAD_ODD_SITE_MATRIX
00476 #undef LOAD_SITE_MATRIX
00477 #undef RECONSTRUCT_SITE_LINK
00478 #undef FloatN
00479 #undef FloatM
00480 #undef RECONSTRUCT
00481 
00482 #if 1
00483 //double precision, 12-reconstruct
00484 #define SITELINK0TEX siteLink0TexDouble
00485 #define SITELINK1TEX siteLink1TexDouble
00486 #if (SITE_MATRIX_LOAD_TEX == 1)
00487 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var)
00488 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var)
00489 #else
00490 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var)
00491 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var)
00492 #endif
00493 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var)
00494 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var)  RECONSTRUCT_LINK_12(dir, idx, sign, var);
00495 #define FloatN double2
00496 #define FloatM double2
00497 #define RECONSTRUCT 12
00498 #include "llfat_core.h"
00499 #undef SITELINK0TEX
00500 #undef SITELINK1TEX
00501 #undef LOAD_EVEN_SITE_MATRIX
00502 #undef LOAD_ODD_SITE_MATRIX
00503 #undef LOAD_SITE_MATRIX
00504 #undef RECONSTRUCT_SITE_LINK
00505 #undef FloatN
00506 #undef FloatM
00507 #undef RECONSTRUCT
00508 #endif
00509 
00510 #undef PRECISION
00511 #undef Float
00512 #undef LOAD_FAT_MATRIX
00513 #undef LOAD_EVEN_MULINK_MATRIX
00514 #undef LOAD_ODD_MULINK_MATRIX
00515 #undef LOAD_EVEN_FAT_MATRIX
00516 #undef LOAD_ODD_FAT_MATRIX
00517 
00518 #undef LLFAT_CONCAT
00519 #undef LLFAT_KERNEL
00520 
00521 #define UNBIND_ALL_TEXTURE do{                                          \
00522     if(prec ==QUDA_DOUBLE_PRECISION){                                   \
00523       cudaUnbindTexture(siteLink0TexDouble);                            \
00524       cudaUnbindTexture(siteLink1TexDouble);                            \
00525       cudaUnbindTexture(fatGauge0TexDouble);                            \
00526       cudaUnbindTexture(fatGauge1TexDouble);                            \
00527       cudaUnbindTexture(muLink0TexDouble);                              \
00528       cudaUnbindTexture(muLink1TexDouble);                              \
00529     }else{                                                              \
00530       if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){              \
00531         cudaUnbindTexture(siteLink0TexSingle_norecon);                  \
00532         cudaUnbindTexture(siteLink1TexSingle_norecon);                  \
00533       }else{                                                            \
00534         cudaUnbindTexture(siteLink0TexSingle);                          \
00535         cudaUnbindTexture(siteLink1TexSingle);                          \
00536       }                                                                 \
00537       cudaUnbindTexture(fatGauge0TexSingle);                            \
00538       cudaUnbindTexture(fatGauge1TexSingle);                            \
00539       cudaUnbindTexture(muLink0TexSingle);                              \
00540       cudaUnbindTexture(muLink1TexSingle);                              \
00541     }                                                                   \
00542   }while(0)
00543 
00544 #define UNBIND_SITE_AND_FAT_LINK do{                                    \
00545     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00546       cudaUnbindTexture(siteLink0TexDouble);                            \
00547       cudaUnbindTexture(siteLink1TexDouble);                            \
00548       cudaUnbindTexture(fatGauge0TexDouble);                            \
00549       cudaUnbindTexture(fatGauge1TexDouble);                            \
00550     }else {                                                             \
00551       if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){              \
00552         cudaUnbindTexture(siteLink0TexSingle_norecon);                  \
00553         cudaUnbindTexture(siteLink1TexSingle_norecon);                  \
00554       }else{                                                            \
00555         cudaUnbindTexture(siteLink0TexSingle);                          \
00556         cudaUnbindTexture(siteLink1TexSingle);                          \
00557       }                                                                 \
00558       cudaUnbindTexture(fatGauge0TexSingle);                            \
00559       cudaUnbindTexture(fatGauge1TexSingle);                            \
00560     }                                                                   \
00561   }while(0)
00562 
00563 #define BIND_SITE_AND_FAT_LINK do {                                     \
00564   if(prec == QUDA_DOUBLE_PRECISION){                                    \
00565     cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \
00566     cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \
00567     cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.even, cudaFatLink.bytes); \
00568     cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.odd,  cudaFatLink.bytes); \
00569   }else{                                                                \
00570     if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){                \
00571       cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \
00572       cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \
00573     }else{                                                              \
00574       cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); \
00575       cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); \
00576     }                                                                   \
00577     cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.even, cudaFatLink.bytes); \
00578     cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.odd,  cudaFatLink.bytes); \
00579     }                                                                   \
00580   }while(0)
00581 
00582 #define BIND_SITE_AND_FAT_LINK_REVERSE do {                             \
00583     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00584       cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \
00585       cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \
00586       cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.even, cudaFatLink.bytes); \
00587       cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.odd,  cudaFatLink.bytes); \
00588     }else{                                                              \
00589       if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){              \
00590         cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \
00591         cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \
00592       }else{                                                            \
00593         cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); \
00594         cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); \
00595       }                                                                 \
00596       cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.even, cudaFatLink.bytes); \
00597       cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.odd,  cudaFatLink.bytes); \
00598     }                                                                   \
00599   }while(0)
00600 
00601 
00602 
00603 #define ENUMERATE_FUNCS(mu,nu,odd_bit)  switch(mu) {                    \
00604   case 0:                                                               \
00605     switch(nu){                                                         \
00606     case 0:                                                             \
00607       printf("ERROR: invalid direction combination\n"); exit(1);        \
00608       break;                                                            \
00609     case 1:                                                             \
00610       if (!odd_bit) { CALL_FUNCTION(0,1,0); }                           \
00611       else {CALL_FUNCTION(0,1,1); }                                     \
00612       break;                                                            \
00613     case 2:                                                             \
00614       if (!odd_bit) { CALL_FUNCTION(0,2,0); }                           \
00615       else {CALL_FUNCTION(0,2,1); }                                     \
00616       break;                                                            \
00617     case 3:                                                             \
00618       if (!odd_bit) { CALL_FUNCTION(0,3,0); }                           \
00619       else {CALL_FUNCTION(0,3,1); }                                     \
00620       break;                                                            \
00621     }                                                                   \
00622     break;                                                              \
00623   case 1:                                                               \
00624     switch(nu){                                                         \
00625     case 0:                                                             \
00626       if (!odd_bit) { CALL_FUNCTION(1,0,0); }                           \
00627       else {CALL_FUNCTION(1,0,1); }                                     \
00628       break;                                                            \
00629     case 1:                                                             \
00630       printf("ERROR: invalid direction combination\n"); exit(1);        \
00631       break;                                                            \
00632     case 2:                                                             \
00633       if (!odd_bit) { CALL_FUNCTION(1,2,0); }                           \
00634       else {CALL_FUNCTION(1,2,1); }                                     \
00635       break;                                                            \
00636     case 3:                                                             \
00637       if (!odd_bit) { CALL_FUNCTION(1,3,0); }                           \
00638       else {CALL_FUNCTION(1,3,1); }                                     \
00639       break;                                                            \
00640     }                                                                   \
00641     break;                                                              \
00642   case 2:                                                               \
00643     switch(nu){                                                         \
00644     case 0:                                                             \
00645       if (!odd_bit) { CALL_FUNCTION(2,0,0); }                           \
00646       else {CALL_FUNCTION(2,0,1); }                                     \
00647       break;                                                            \
00648     case 1:                                                             \
00649       if (!odd_bit) { CALL_FUNCTION(2,1,0); }                           \
00650       else {CALL_FUNCTION(2,1,1); }                                     \
00651       break;                                                            \
00652     case 2:                                                             \
00653       printf("ERROR: invalid direction combination\n"); exit(1);        \
00654       break;                                                            \
00655     case 3:                                                             \
00656       if (!odd_bit) { CALL_FUNCTION(2,3,0); }                           \
00657       else {CALL_FUNCTION(2,3,1); }                                     \
00658       break;                                                            \
00659     }                                                                   \
00660     break;                                                              \
00661   case 3:                                                               \
00662     switch(nu){                                                         \
00663     case 0:                                                             \
00664       if (!odd_bit) { CALL_FUNCTION(3,0,0); }                           \
00665       else {CALL_FUNCTION(3,0,1); }                                     \
00666       break;                                                            \
00667     case 1:                                                             \
00668       if (!odd_bit) { CALL_FUNCTION(3,1,0); }                           \
00669       else {CALL_FUNCTION(3,1,1); }                                     \
00670       break;                                                            \
00671     case 2:                                                             \
00672       if (!odd_bit) { CALL_FUNCTION(3,2,0); }                           \
00673       else {CALL_FUNCTION(3,2,1); }                                     \
00674       break;                                                            \
00675     case 3:                                                             \
00676       printf("ERROR: invalid direction combination\n"); exit(1);        \
00677       break;                                                            \
00678     }                                                                   \
00679     break;                                                              \
00680   }
00681 
00682 void siteComputeGenStapleParityKernel(void* staple_even, void* staple_odd, 
00683                                       void* sitelink_even, void* sitelink_odd, 
00684                                       void* fatlink_even, void* fatlink_odd,    
00685                                       int mu, int nu,int odd_bit,
00686                                       double mycoeff,
00687                                       dim3 halfGridDim, dim3 blockDim, 
00688                                       QudaReconstructType recon, QudaPrecision prec)
00689 {
00690     
00691 #define  CALL_FUNCTION(mu, nu, odd_bit)                                 \
00692   if (prec == QUDA_DOUBLE_PRECISION){                                   \
00693     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00694       do_siteComputeGenStapleParity18Kernel<mu,nu, odd_bit>             \
00695         <<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \
00696                                     (double2*)sitelink_even, (double2*)sitelink_odd, \
00697                                     (double2*)fatlink_even, (double2*)fatlink_odd, \
00698                                     (double)mycoeff);                   \
00699     }else{                                                              \
00700       do_siteComputeGenStapleParity12Kernel<mu,nu, odd_bit>             \
00701         <<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \
00702                                     (double2*)sitelink_even, (double2*)sitelink_odd, \
00703                                     (double2*)fatlink_even, (double2*)fatlink_odd, \
00704                                     (double)mycoeff);                   \
00705     }                                                                   \
00706   }else {                                                               \
00707     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00708       do_siteComputeGenStapleParity18Kernel<mu,nu, odd_bit>             \
00709         <<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \
00710                                     (float2*)sitelink_even, (float2*)sitelink_odd, \
00711                                     (float2*)fatlink_even, (float2*)fatlink_odd, \
00712                                     (float)mycoeff);                    \
00713     }else{                                                              \
00714       do_siteComputeGenStapleParity12Kernel<mu,nu, odd_bit>             \
00715         <<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \
00716                                     (float4*)sitelink_even, (float4*)sitelink_odd, \
00717                                     (float2*)fatlink_even, (float2*)fatlink_odd, \
00718                                     (float)mycoeff);                    \
00719     }                                                                   \
00720   }
00721   
00722   ENUMERATE_FUNCS(mu,nu,odd_bit);
00723   
00724 #undef CALL_FUNCTION
00725     
00726     
00727 }
00728 
00729 void
00730 computeGenStapleFieldParityKernel(void* sitelink_even, void* sitelink_odd,
00731                                   void* fatlink_even, void* fatlink_odd,                            
00732                                   void* mulink_even, void* mulink_odd, 
00733                                   int mu, int nu, int odd_bit,
00734                                   double mycoeff,
00735                                   dim3 halfGridDim, dim3 blockDim, 
00736                                   QudaReconstructType recon, QudaPrecision prec)
00737 {    
00738 #define  CALL_FUNCTION(mu, nu, odd_bit)                                 \
00739   if (prec == QUDA_DOUBLE_PRECISION){                                   \
00740     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00741       do_computeGenStapleFieldParity18Kernel<mu,nu, odd_bit>            \
00742         <<<halfGridDim, blockDim>>>((double2*)sitelink_even, (double2*)sitelink_odd, \
00743                                     (double2*)fatlink_even, (double2*)fatlink_odd, \
00744                                     (double2*)mulink_even, (double2*)mulink_odd, \
00745                                     (double)mycoeff);                   \
00746     }else{                                                              \
00747       do_computeGenStapleFieldParity12Kernel<mu,nu, odd_bit>            \
00748         <<<halfGridDim, blockDim>>>((double2*)sitelink_even, (double2*)sitelink_odd, \
00749                                     (double2*)fatlink_even, (double2*)fatlink_odd, \
00750                                     (double2*)mulink_even, (double2*)mulink_odd, \
00751                                     (double)mycoeff);                   \
00752     }                                                                   \
00753   }else{                                                                \
00754     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00755       do_computeGenStapleFieldParity18Kernel<mu,nu, odd_bit>            \
00756         <<<halfGridDim, blockDim>>>((float2*)sitelink_even, (float2*)sitelink_odd, \
00757                                     (float2*)fatlink_even, (float2*)fatlink_odd, \
00758                                     (float2*)mulink_even, (float2*)mulink_odd, \
00759                                     (float)mycoeff);                    \
00760     }else{                                                              \
00761       do_computeGenStapleFieldParity12Kernel<mu,nu, odd_bit>            \
00762         <<<halfGridDim, blockDim>>>((float4*)sitelink_even, (float4*)sitelink_odd, \
00763                                     (float2*)fatlink_even, (float2*)fatlink_odd, \
00764                                     (float2*)mulink_even, (float2*)mulink_odd, \
00765                                     (float)mycoeff);                    \
00766     }                                                                   \
00767   }
00768   
00769   ENUMERATE_FUNCS(mu,nu,odd_bit);
00770 
00771 #undef CALL_FUNCTION 
00772     
00773 }
00774 
00775 void
00776 computeGenStapleFieldSaveParityKernel(void* staple_even, void* staple_odd, 
00777                                       void* sitelink_even, void* sitelink_odd,
00778                                       void* fatlink_even, void* fatlink_odd,                        
00779                                       void* mulink_even, void* mulink_odd, 
00780                                       int mu, int nu, int odd_bit,
00781                                       double mycoeff,
00782                                       dim3 halfGridDim, dim3 blockDim,
00783                                       QudaReconstructType recon, QudaPrecision prec)
00784 {
00785 #define  CALL_FUNCTION(mu, nu, odd_bit)                                 \
00786   if (prec == QUDA_DOUBLE_PRECISION){                                   \
00787     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00788       do_computeGenStapleFieldSaveParity18Kernel<mu,nu, odd_bit>        \
00789         <<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \
00790                                     (double2*)sitelink_even, (double2*)sitelink_odd, \
00791                                     (double2*)fatlink_even, (double2*)fatlink_odd, \
00792                                     (double2*)mulink_even, (double2*)mulink_odd, \
00793                                     (double)mycoeff);                   \
00794     }else{                                                              \
00795       do_computeGenStapleFieldSaveParity12Kernel<mu,nu, odd_bit>        \
00796         <<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \
00797                                     (double2*)sitelink_even, (double2*)sitelink_odd, \
00798                                     (double2*)fatlink_even, (double2*)fatlink_odd, \
00799                                     (double2*)mulink_even, (double2*)mulink_odd, \
00800                                     (double)mycoeff);                   \
00801     }                                                                   \
00802   }else{                                                                \
00803     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00804       do_computeGenStapleFieldSaveParity18Kernel<mu,nu, odd_bit>        \
00805         <<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \
00806                                     (float2*)sitelink_even, (float2*)sitelink_odd, \
00807                                     (float2*)fatlink_even, (float2*)fatlink_odd, \
00808                                     (float2*)mulink_even, (float2*)mulink_odd, \
00809                                     (float)mycoeff);                    \
00810   }else{                                                                \
00811       do_computeGenStapleFieldSaveParity12Kernel<mu,nu, odd_bit>        \
00812         <<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \
00813                                     (float4*)sitelink_even, (float4*)sitelink_odd, \
00814                                     (float2*)fatlink_even, (float2*)fatlink_odd, \
00815                                     (float2*)mulink_even, (float2*)mulink_odd, \
00816                                     (float)mycoeff);                    \
00817     }                                                                   \
00818   }
00819 
00820   ENUMERATE_FUNCS(mu,nu,odd_bit);
00821 
00822 #undef CALL_FUNCTION 
00823     
00824 }
00825 
00826 void
00827 llfat_cuda(void* fatLink, void* siteLink,
00828            FullGauge cudaFatLink, FullGauge cudaSiteLink, 
00829            FullStaple cudaStaple, FullStaple cudaStaple1,
00830            QudaGaugeParam* param, double* act_path_coeff)
00831 {
00832 
00833   int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
00834   dim3 gridDim(volume/BLOCK_DIM,1,1);
00835   dim3 halfGridDim(volume/(2*BLOCK_DIM),1,1);
00836   dim3 blockDim(BLOCK_DIM , 1, 1);
00837   
00838   QudaPrecision prec = cudaSiteLink.precision;
00839   QudaReconstructType recon = cudaSiteLink.reconstruct;
00840   BIND_SITE_AND_FAT_LINK;
00841   if(prec == QUDA_DOUBLE_PRECISION){
00842     if(recon == QUDA_RECONSTRUCT_NO){
00843       llfatOneLink18Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.even, (double2*)cudaSiteLink.odd,
00844                                                   (double2*)cudaFatLink.even, (double2*)cudaFatLink.odd,
00845                                                   (double)act_path_coeff[0], (double)act_path_coeff[5]);    
00846     }else{
00847       
00848       llfatOneLink12Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.even, (double2*)cudaSiteLink.odd,
00849                                                   (double2*)cudaFatLink.even, (double2*)cudaFatLink.odd,
00850                                                   (double)act_path_coeff[0], (double)act_path_coeff[5]);    
00851       
00852     }
00853   }else{ //single precision
00854     if(recon == QUDA_RECONSTRUCT_NO){    
00855       llfatOneLink18Kernel<<<gridDim, blockDim>>>((float2*)cudaSiteLink.even, (float2*)cudaSiteLink.odd,
00856                                                   (float2*)cudaFatLink.even, (float2*)cudaFatLink.odd,
00857                                                   (float)act_path_coeff[0], (float)act_path_coeff[5]);                                                    
00858     }else{
00859       llfatOneLink12Kernel<<<gridDim, blockDim>>>((float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd,
00860                                                   (float2*)cudaFatLink.even, (float2*)cudaFatLink.odd,
00861                                                   (float)act_path_coeff[0], (float)act_path_coeff[5]);    
00862     }
00863   }
00864   checkCudaError();         
00865 UNBIND_SITE_AND_FAT_LINK;
00866 
00867   for(int dir = 0;dir < 4; dir++){
00868     for(int nu = 0; nu < 4; nu++){
00869       if (nu != dir){
00870                 
00871         //even
00872         BIND_SITE_AND_FAT_LINK;         
00873         siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
00874                                          (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
00875                                          (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
00876                                          dir, nu,0,
00877                                          act_path_coeff[2],
00878                                          halfGridDim, blockDim, recon, prec);
00879         checkCudaError();           
00880         UNBIND_SITE_AND_FAT_LINK;
00881         
00882         //odd
00883         BIND_SITE_AND_FAT_LINK_REVERSE;
00884         siteComputeGenStapleParityKernel((void*)cudaStaple.odd, (void*)cudaStaple.even,
00885                                          (void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
00886                                          (void*)cudaFatLink.odd, (void*)cudaFatLink.even, 
00887                                          dir, nu,1,
00888                                          act_path_coeff[2],
00889                                          halfGridDim, blockDim, recon, prec);   
00890         checkCudaError();           
00891         UNBIND_SITE_AND_FAT_LINK;       
00892         
00893         
00894         //even
00895         BIND_SITE_AND_FAT_LINK;         
00896         cudaBindTexture(0, muLink0TexSingle, cudaStaple.even, cudaStaple.bytes);
00897         cudaBindTexture(0, muLink1TexSingle, cudaStaple.odd, cudaStaple.bytes);
00898         computeGenStapleFieldParityKernel((void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
00899                                           (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
00900                                           (void*)cudaStaple.even, (void*)cudaStaple.odd,
00901                                           dir, nu,0,
00902                                           act_path_coeff[5],
00903                                           halfGridDim, blockDim, recon, prec);                                                    
00904         checkCudaError();           
00905         UNBIND_ALL_TEXTURE;
00906         
00907         //odd
00908         BIND_SITE_AND_FAT_LINK_REVERSE;         
00909         cudaBindTexture(0, muLink1TexSingle, cudaStaple.even, cudaStaple.bytes);
00910         cudaBindTexture(0, muLink0TexSingle, cudaStaple.odd, cudaStaple.bytes);
00911         computeGenStapleFieldParityKernel((void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
00912                                           (void*)cudaFatLink.odd, (void*)cudaFatLink.even, 
00913                                           (void*)cudaStaple.odd, (void*)cudaStaple.even,
00914                                           dir, nu,1,
00915                                           act_path_coeff[5],
00916                                           halfGridDim, blockDim, recon, prec);  
00917         checkCudaError();           
00918         UNBIND_ALL_TEXTURE;
00919         
00920         
00921         for(int rho = 0; rho < 4; rho++){
00922           if (rho != dir && rho != nu){
00923             
00924             //even
00925             BIND_SITE_AND_FAT_LINK;             
00926             cudaBindTexture(0, muLink0TexSingle, cudaStaple.even, cudaStaple.bytes);
00927             cudaBindTexture(0, muLink1TexSingle, cudaStaple.odd, cudaStaple.bytes);                     
00928             computeGenStapleFieldSaveParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
00929                                                   (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
00930                                                   (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
00931                                                   (void*)cudaStaple.even, (void*)cudaStaple.odd,
00932                                                   dir, rho,0,
00933                                                   act_path_coeff[3],
00934                                                   halfGridDim, blockDim, recon, prec);                                                                
00935             checkCudaError();       
00936             UNBIND_ALL_TEXTURE;
00937             
00938             //odd
00939             BIND_SITE_AND_FAT_LINK_REVERSE;             
00940             cudaBindTexture(0, muLink1TexSingle, cudaStaple.even, cudaStaple.bytes);
00941             cudaBindTexture(0, muLink0TexSingle, cudaStaple.odd, cudaStaple.bytes);                                             
00942             computeGenStapleFieldSaveParityKernel((void*)cudaStaple1.odd, (void*)cudaStaple1.even,
00943                                                   (void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
00944                                                   (void*)cudaFatLink.odd, (void*)cudaFatLink.even, 
00945                                                   (void*)cudaStaple.odd, (void*)cudaStaple.even,
00946                                                   dir, rho,1,
00947                                                   act_path_coeff[3],
00948                                                   halfGridDim, blockDim, recon, prec);                                                                
00949             checkCudaError();
00950             UNBIND_ALL_TEXTURE;
00951 
00952                         
00953             for(int sig = 0; sig < 4; sig++){
00954               if (sig != dir && sig != nu && sig != rho){                               
00955                                 
00956                 //even                          
00957                 BIND_SITE_AND_FAT_LINK;         
00958                 cudaBindTexture(0, muLink0TexSingle, cudaStaple1.even, cudaStaple1.bytes);
00959                 cudaBindTexture(0, muLink1TexSingle, cudaStaple1.odd,  cudaStaple1.bytes);
00960                 computeGenStapleFieldParityKernel((void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
00961                                                   (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
00962                                                   (void*)cudaStaple1.even, (void*)cudaStaple1.odd,
00963                                                   dir, sig, 0, 
00964                                                   act_path_coeff[4],
00965                                                   halfGridDim, blockDim, recon, prec);  
00966                 checkCudaError();
00967                 UNBIND_ALL_TEXTURE;
00968   
00969                 
00970                 //odd
00971                 BIND_SITE_AND_FAT_LINK_REVERSE;         
00972                 cudaBindTexture(0, muLink1TexSingle, cudaStaple1.even, cudaStaple1.bytes);
00973                 cudaBindTexture(0, muLink0TexSingle, cudaStaple1.odd,  cudaStaple1.bytes);
00974                 computeGenStapleFieldParityKernel((void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
00975                                                   (void*)cudaFatLink.odd, (void*)cudaFatLink.even, 
00976                                                   (void*)cudaStaple1.odd, (void*)cudaStaple1.even,
00977                                                   dir, sig, 1, 
00978                                                   act_path_coeff[4],
00979                                                   halfGridDim, blockDim, recon, prec);  
00980                 checkCudaError();
00981                 UNBIND_ALL_TEXTURE;                     
00982   
00983               }                     
00984             }//sig
00985           }
00986         }//rho
00987 
00988 
00989 
00990       }
00991     }//nu
00992   }//dir
00993   
00994   checkCudaError();
00995   return;
00996 }
00997 
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines