QUDA v0.4.0
A library for QCD on GPUs
quda/lib/llfat_quda.cu
Go to the documentation of this file.
00001 #include <stdio.h>
00002 #include <cuda_runtime.h>
00003 #include <cuda.h>
00004 
00005 #include <quda_internal.h>
00006 #include <llfat_quda.h>
00007 #include <read_gauge.h>
00008 #include "gauge_field.h"
00009 #include <force_common.h>
00010 
00011 #if (__COMPUTE_CAPABILITY__ >= 200)
00012 #define SITE_MATRIX_LOAD_TEX 1
00013 #define MULINK_LOAD_TEX 1
00014 #define FATLINK_LOAD_TEX 1
00015 #else
00016 #define SITE_MATRIX_LOAD_TEX 0
00017 #define MULINK_LOAD_TEX 1
00018 #define FATLINK_LOAD_TEX 1
00019 #endif
00020 
00021 #define WRITE_FAT_MATRIX(gauge, dir, idx)do {                   \
00022     gauge[idx + dir*9*llfat_ga_stride] = FAT0;                  \
00023     gauge[idx + (dir*9+1) * llfat_ga_stride] = FAT1;                    \
00024     gauge[idx + (dir*9+2) * llfat_ga_stride] = FAT2;                    \
00025     gauge[idx + (dir*9+3) * llfat_ga_stride] = FAT3;                    \
00026     gauge[idx + (dir*9+4) * llfat_ga_stride] = FAT4;            \
00027     gauge[idx + (dir*9+5) * llfat_ga_stride] = FAT5;            \
00028     gauge[idx + (dir*9+6) * llfat_ga_stride] = FAT6;            \
00029     gauge[idx + (dir*9+7) * llfat_ga_stride] = FAT7;            \
00030     gauge[idx + (dir*9+8) * llfat_ga_stride] = FAT8;} while(0)                  
00031 
00032 
00033 #define WRITE_STAPLE_MATRIX(gauge, idx)                         \
00034   gauge[idx] = STAPLE0;                                         \
00035   gauge[idx + staple_stride] = STAPLE1;                         \
00036   gauge[idx + 2*staple_stride] = STAPLE2;                       \
00037   gauge[idx + 3*staple_stride] = STAPLE3;                       \
00038   gauge[idx + 4*staple_stride] = STAPLE4;                       \
00039   gauge[idx + 5*staple_stride] = STAPLE5;                       \
00040   gauge[idx + 6*staple_stride] = STAPLE6;                       \
00041   gauge[idx + 7*staple_stride] = STAPLE7;                       \
00042   gauge[idx + 8*staple_stride] = STAPLE8;                                       
00043     
00044 
00045 #define SCALAR_MULT_SU3_MATRIX(a, b, c) \
00046   c##00_re = a*b##00_re;                \
00047   c##00_im = a*b##00_im;                \
00048   c##01_re = a*b##01_re;                \
00049   c##01_im = a*b##01_im;                \
00050   c##02_re = a*b##02_re;                \
00051   c##02_im = a*b##02_im;                \
00052   c##10_re = a*b##10_re;                \
00053   c##10_im = a*b##10_im;                \
00054   c##11_re = a*b##11_re;                \
00055   c##11_im = a*b##11_im;                \
00056   c##12_re = a*b##12_re;                \
00057   c##12_im = a*b##12_im;                \
00058   c##20_re = a*b##20_re;                \
00059   c##20_im = a*b##20_im;                \
00060   c##21_re = a*b##21_re;                \
00061   c##21_im = a*b##21_im;                \
00062   c##22_re = a*b##22_re;                \
00063   c##22_im = a*b##22_im;                \
00064   
00065 /*
00066 #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride)     \
00067   float2 var##0 = gauge[idx + dir*6*stride];                            \
00068   float2 var##1 = gauge[idx + dir*6*stride + stride];                   \
00069   float2 var##2 = gauge[idx + dir*6*stride + 2*stride];                 \
00070   float2 var##3 = gauge[idx + dir*6*stride + 3*stride];                 \
00071   float2 var##4 = gauge[idx + dir*6*stride + 4*stride];                 \
00072   float2 var##5 = gauge[idx + dir*6*stride + 5*stride];                 \
00073   float2 var##6, var##7, var##8;
00074 
00075 #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
00076   float2 var##0 = tex1Dfetch(gauge, idx + dir*6*stride);                \
00077   float2 var##1 = tex1Dfetch(gauge, idx + dir*6*stride + stride);       \
00078   float2 var##2 = tex1Dfetch(gauge, idx + dir*6*stride + 2*stride);     \
00079   float2 var##3 = tex1Dfetch(gauge, idx + dir*6*stride + 3*stride);     \
00080   float2 var##4 = tex1Dfetch(gauge, idx + dir*6*stride + 4*stride);     \
00081   float2 var##5 = tex1Dfetch(gauge, idx + dir*6*stride + 5*stride);     \
00082   float2 var##6, var##7, var##8;
00083 */
00084 #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride)     \
00085   float4 var##0 = gauge[idx + dir*3*stride];                            \
00086   float4 var##1 = gauge[idx + dir*3*stride + stride];                   \
00087   float4 var##2 = gauge[idx + dir*3*stride + 2*stride];                 \
00088   float4 var##3, var##4;                                        
00089 
00090 #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
00091   float4 var##0 = tex1Dfetch(gauge, idx + dir*3*stride);                \
00092   float4 var##1 = tex1Dfetch(gauge, idx + dir*3*stride + stride);       \
00093   float4 var##2 = tex1Dfetch(gauge, idx + dir*3*stride + 2*stride);     \
00094   float4 var##3, var##4;
00095 
00096 #define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var, stride)     \
00097   float2 var##0 = gauge[idx + dir*9*stride];                            \
00098   float2 var##1 = gauge[idx + dir*9*stride + stride];                   \
00099   float2 var##2 = gauge[idx + dir*9*stride + 2*stride];                 \
00100   float2 var##3 = gauge[idx + dir*9*stride + 3*stride];                 \
00101   float2 var##4 = gauge[idx + dir*9*stride + 4*stride];                 \
00102   float2 var##5 = gauge[idx + dir*9*stride + 5*stride];                 \
00103   float2 var##6 = gauge[idx + dir*9*stride + 6*stride];                 \
00104   float2 var##7 = gauge[idx + dir*9*stride + 7*stride];                 \
00105   float2 var##8 = gauge[idx + dir*9*stride + 8*stride];                 
00106 
00107 
00108 #define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
00109   float2 var##0 = tex1Dfetch(gauge, idx + dir*9*stride);                \
00110   float2 var##1 = tex1Dfetch(gauge, idx + dir*9*stride + stride);       \
00111   float2 var##2 = tex1Dfetch(gauge, idx + dir*9*stride + 2*stride);     \
00112   float2 var##3 = tex1Dfetch(gauge, idx + dir*9*stride + 3*stride);     \
00113   float2 var##4 = tex1Dfetch(gauge, idx + dir*9*stride + 4*stride);     \
00114   float2 var##5 = tex1Dfetch(gauge, idx + dir*9*stride + 5*stride);     \
00115   float2 var##6 = tex1Dfetch(gauge, idx + dir*9*stride + 6*stride);     \
00116   float2 var##7 = tex1Dfetch(gauge, idx + dir*9*stride + 7*stride);     \
00117   float2 var##8 = tex1Dfetch(gauge, idx + dir*9*stride + 8*stride);                     
00118 
00119 
00120 
00121 #define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var, stride)     \
00122   double2 var##0 = gauge[idx + dir*9*stride];                           \
00123   double2 var##1 = gauge[idx + dir*9*stride + stride];                  \
00124   double2 var##2 = gauge[idx + dir*9*stride + 2*stride];                \
00125   double2 var##3 = gauge[idx + dir*9*stride + 3*stride];                \
00126   double2 var##4 = gauge[idx + dir*9*stride + 4*stride];                \
00127   double2 var##5 = gauge[idx + dir*9*stride + 5*stride];                \
00128   double2 var##6 = gauge[idx + dir*9*stride + 6*stride];                \
00129   double2 var##7 = gauge[idx + dir*9*stride + 7*stride];                \
00130   double2 var##8 = gauge[idx + dir*9*stride + 8*stride];                        
00131 
00132 
00133 #define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
00134   double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride); \
00135   double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + stride); \
00136   double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 2*stride); \
00137   double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 3*stride); \
00138   double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 4*stride); \
00139   double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 5*stride); \
00140   double2 var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 6*stride); \
00141   double2 var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 7*stride); \
00142   double2 var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 8*stride);       
00143 
00144 
00145 #define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var, stride)             \
00146   double2 var##0 = gauge[idx + dir*6*stride];                           \
00147   double2 var##1 = gauge[idx + dir*6*stride + stride];                  \
00148   double2 var##2 = gauge[idx + dir*6*stride + 2*stride];                \
00149   double2 var##3 = gauge[idx + dir*6*stride + 3*stride];                \
00150   double2 var##4 = gauge[idx + dir*6*stride + 4*stride];                \
00151   double2 var##5 = gauge[idx + dir*6*stride + 5*stride];                \
00152   double2 var##6, var##7, var##8;
00153 
00154 
00155 #define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
00156   double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride); \
00157   double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + stride); \
00158   double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 2*stride); \
00159   double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 3*stride); \
00160   double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 4*stride); \
00161   double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 5*stride); \
00162   double2 var##6, var##7, var##8;
00163 
00164 #define LLFAT_ADD_SU3_MATRIX(ma, mb, mc)        \
00165   mc##00_re = ma##00_re + mb##00_re;            \
00166   mc##00_im = ma##00_im + mb##00_im;            \
00167   mc##01_re = ma##01_re + mb##01_re;            \
00168   mc##01_im = ma##01_im + mb##01_im;            \
00169   mc##02_re = ma##02_re + mb##02_re;            \
00170   mc##02_im = ma##02_im + mb##02_im;            \
00171   mc##10_re = ma##10_re + mb##10_re;            \
00172   mc##10_im = ma##10_im + mb##10_im;            \
00173   mc##11_re = ma##11_re + mb##11_re;            \
00174   mc##11_im = ma##11_im + mb##11_im;            \
00175   mc##12_re = ma##12_re + mb##12_re;            \
00176   mc##12_im = ma##12_im + mb##12_im;            \
00177   mc##20_re = ma##20_re + mb##20_re;            \
00178   mc##20_im = ma##20_im + mb##20_im;            \
00179   mc##21_re = ma##21_re + mb##21_re;            \
00180   mc##21_im = ma##21_im + mb##21_im;            \
00181   mc##22_re = ma##22_re + mb##22_re;            \
00182   mc##22_im = ma##22_im + mb##22_im;            
00183 
00184 __constant__ int dir1_array[16];
00185 __constant__ int dir2_array[16];
00186 __constant__ int last_proc_in_tdim;
00187 __constant__ int first_proc_in_tdim;
00188 
00189 unsigned long staple_bytes=0;
00190 
00191 void
00192 llfat_init_cuda(QudaGaugeParam* param)
00193 {
00194   static int llfat_init_cuda_flag = 0;
00195   if (llfat_init_cuda_flag){
00196     return;
00197   }
00198   
00199   llfat_init_cuda_flag = 1;
00200   
00201   init_kernel_cuda(param);
00202   int Vh = param->X[0]*param->X[1]*param->X[2]*param->X[3]/2;
00203   int site_ga_stride = param->site_ga_pad + Vh;
00204   int staple_stride = param->staple_pad + Vh;
00205   int llfat_ga_stride = param->llfat_ga_pad + Vh;
00206   
00207   cudaMemcpyToSymbol("site_ga_stride", &site_ga_stride, sizeof(int));  
00208   cudaMemcpyToSymbol("staple_stride", &staple_stride, sizeof(int));  
00209   cudaMemcpyToSymbol("llfat_ga_stride", &llfat_ga_stride, sizeof(int));
00210   int dir1[16];
00211   int dir2[16];
00212   for(int nu =0; nu < 4; nu++)
00213     for(int mu=0; mu < 4; mu++){
00214       if(nu == mu) continue;
00215       int d1, d2;
00216       for(d1=0; d1 < 4; d1 ++){
00217         if(d1 != nu && d1 != mu){
00218           break;
00219         }
00220       }
00221       dir1[nu*4+mu] = d1;
00222 
00223       for(d2=0; d2 < 4; d2 ++){
00224         if(d2 != nu && d2 != mu && d2 != d1){
00225           break;
00226         }
00227       }
00228 
00229       dir2[nu*4+mu] = d2;
00230     }
00231   
00232   cudaMemcpyToSymbol("dir1_array", &dir1, sizeof(dir1));
00233   cudaMemcpyToSymbol("dir2_array", &dir2, sizeof(dir2));   
00234   
00235   int first_proc_in_tdim = 0;
00236   int last_proc_in_tdim = 0;
00237   if(commCoords(3) == (commDim(3) -1)){
00238     last_proc_in_tdim =  1;
00239   }
00240   
00241   if(commCoords(3) == 0){
00242     first_proc_in_tdim =  1;    
00243   }
00244 
00245   cudaMemcpyToSymbol("last_proc_in_tdim", &last_proc_in_tdim, sizeof(int));
00246   cudaMemcpyToSymbol("first_proc_in_tdim", &first_proc_in_tdim, sizeof(int));
00247   
00248 }
00249 
00250 
00251 void
00252 llfat_init_cuda_ex(QudaGaugeParam* param_ex)
00253 {
00254   static int llfat_init_cuda_flag = 0;
00255   if (llfat_init_cuda_flag){
00256     return;
00257   }
00258 
00259   llfat_init_cuda_flag = 1;
00260   
00261   init_kernel_cuda(param_ex);
00262   int Vh_ex = param_ex->X[0]*param_ex->X[1]*param_ex->X[2]*param_ex->X[3]/2;
00263   int Vh = (param_ex->X[0]-4)*(param_ex->X[1]-4)*(param_ex->X[2]-4)*(param_ex->X[3]-4)/2;
00264   int site_ga_stride = param_ex->site_ga_pad + Vh_ex;
00265   int staple_stride = param_ex->staple_pad + Vh_ex;
00266   int llfat_ga_stride = param_ex->llfat_ga_pad + Vh;
00267   
00268   cudaMemcpyToSymbol("site_ga_stride", &site_ga_stride, sizeof(int));
00269   cudaMemcpyToSymbol("staple_stride", &staple_stride, sizeof(int));
00270   cudaMemcpyToSymbol("llfat_ga_stride", &llfat_ga_stride, sizeof(int));
00271   
00272   int first_proc_in_tdim = 0;
00273   int last_proc_in_tdim = 0;
00274   if(commCoords(3) == (commDim(3) -1)){
00275     last_proc_in_tdim =  1;
00276   }
00277   
00278   if(commCoords(3) == 0){
00279     first_proc_in_tdim =  1;
00280   }
00281   
00282   cudaMemcpyToSymbol("last_proc_in_tdim", &last_proc_in_tdim, sizeof(int));
00283   cudaMemcpyToSymbol("first_proc_in_tdim", &first_proc_in_tdim, sizeof(int));
00284   
00285   int E1 = param_ex->X[0];
00286   int E1h = E1/2;
00287   int E2 = param_ex->X[1];
00288   int E3 = param_ex->X[2];
00289   int E4 = param_ex->X[3];
00290   int E2E1 =E2*E1;
00291   int E3E2E1=E3*E2*E1;
00292   
00293   cudaMemcpyToSymbol("E1", &E1, sizeof(int));
00294   cudaMemcpyToSymbol("E1h", &E1h, sizeof(int));
00295   cudaMemcpyToSymbol("E2", &E2, sizeof(int));
00296   cudaMemcpyToSymbol("E3", &E3, sizeof(int));
00297   cudaMemcpyToSymbol("E4", &E4, sizeof(int));
00298   cudaMemcpyToSymbol("E2E1", &E2E1, sizeof(int));
00299   cudaMemcpyToSymbol("E3E2E1", &E3E2E1, sizeof(int));
00300 
00301   cudaMemcpyToSymbol("Vh_ex", &Vh_ex, sizeof(int));
00302   
00303 }
00304 
00305 
00306 
00307 
00308 #define LLFAT_CONCAT(a,b) a##b##Kernel
00309 #define LLFAT_CONCAT_EX(a,b) a##b##Kernel_ex
00310 #define LLFAT_KERNEL(a,b) LLFAT_CONCAT(a,b)
00311 #define LLFAT_KERNEL_EX(a,b) LLFAT_CONCAT_EX(a,b)
00312 
00313 //precision: 0 is for double, 1 is for single
00314 
00315 //single precision, common macro
00316 #define PRECISION 1
00317 #define Float  float
00318 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, FAT, llfat_ga_stride)
00319 #if (MULINK_LOAD_TEX == 1)
00320 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink1TexSingle:muLink0TexSingle), dir, idx, var, staple_stride)
00321 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink0TexSingle:muLink1TexSingle), dir, idx, var, staple_stride)
00322 #else
00323 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_even, dir, idx, var, staple_stride)
00324 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_odd, dir, idx, var, staple_stride)
00325 #endif
00326 
00327 #if (FATLINK_LOAD_TEX == 1)
00328 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge1TexSingle:fatGauge0TexSingle), dir, idx, FAT, llfat_ga_stride);
00329 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge0TexSingle:fatGauge1TexSingle), dir, idx, FAT, llfat_ga_stride);
00330 #else
00331 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_even, dir, idx, FAT, llfat_ga_stride)
00332 #define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_odd, dir, idx, FAT, llfat_ga_stride)
00333 #endif
00334 
00335 
00336 //single precision, 12-reconstruct
00337 #define DECLARE_VAR_SIGN short sign=1
00338 #define SITELINK0TEX siteLink0TexSingle_recon
00339 #define SITELINK1TEX siteLink1TexSingle_recon
00340 #if (SITE_MATRIX_LOAD_TEX == 1)
00341 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, site_ga_stride)
00342 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, site_ga_stride)
00343 #else
00344 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride)
00345 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride)
00346 #endif
00347 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var, site_ga_stride)
00348 
00349 #define RECONSTRUCT_SITE_LINK(sign, var)  RECONSTRUCT_LINK_12(sign, var);
00350 #define FloatN float4
00351 #define FloatM float2
00352 #define RECONSTRUCT 12
00353 #define sd_data float_12_sd_data
00354 #include "llfat_core.h"
00355 #undef DECLARE_VAR_SIGN
00356 #undef SITELINK0TEX
00357 #undef SITELINK1TEX
00358 #undef LOAD_EVEN_SITE_MATRIX
00359 #undef LOAD_ODD_SITE_MATRIX
00360 #undef LOAD_SITE_MATRIX
00361 #undef RECONSTRUCT_SITE_LINK
00362 #undef FloatN
00363 #undef FloatM
00364 #undef RECONSTRUCT
00365 #undef sd_data
00366 
00367 //single precision, 18-reconstruct
00368 #define SITELINK0TEX siteLink0TexSingle_norecon
00369 #define SITELINK1TEX siteLink1TexSingle_norecon
00370 #if (SITE_MATRIX_LOAD_TEX == 1)
00371 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, site_ga_stride)
00372 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, site_ga_stride)
00373 #else
00374 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride)
00375 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride)
00376 #endif
00377 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink, dir, idx, var, site_ga_stride)
00378 #define RECONSTRUCT_SITE_LINK(sign, var)  
00379 #define FloatN float2
00380 #define FloatM float2
00381 #define RECONSTRUCT 18
00382 #define sd_data float_18_sd_data
00383 #include "llfat_core.h"
00384 #undef SITELINK0TEX
00385 #undef SITELINK1TEX
00386 #undef LOAD_EVEN_SITE_MATRIX
00387 #undef LOAD_ODD_SITE_MATRIX
00388 #undef LOAD_SITE_MATRIX
00389 #undef RECONSTRUCT_SITE_LINK
00390 #undef FloatN
00391 #undef FloatM
00392 #undef RECONSTRUCT
00393 #undef sd_data
00394 
00395 
00396 #undef PRECISION
00397 #undef Float
00398 #undef LOAD_FAT_MATRIX
00399 #undef LOAD_EVEN_MULINK_MATRIX
00400 #undef LOAD_ODD_MULINK_MATRIX
00401 #undef LOAD_EVEN_FAT_MATRIX
00402 #undef LOAD_ODD_FAT_MATRIX
00403 
00404 
00405 //double precision, common macro
00406 #define PRECISION 0
00407 #define Float double
00408 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, FAT, llfat_ga_stride)
00409 #if (MULINK_LOAD_TEX == 1)
00410 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(odd_bit?muLink1TexDouble:muLink0TexDouble), mulink_even, dir, idx, var, staple_stride)
00411 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink0TexDouble:muLink1TexDouble), mulink_odd, dir, idx, var, staple_stride)
00412 #else
00413 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_even, dir, idx, var, staple_stride)
00414 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_odd, dir, idx, var, staple_stride)
00415 #endif
00416 
00417 #if (FATLINK_LOAD_TEX == 1)
00418 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge1TexDouble:fatGauge0TexDouble), fatlink_even, dir, idx, FAT, llfat_ga_stride)
00419 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge0TexDouble:fatGauge1TexDouble), fatlink_odd, dir, idx, FAT, llfat_ga_stride)
00420 #else
00421 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_even, dir, idx, FAT, llfat_ga_stride)
00422 #define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_odd, dir, idx, FAT, llfat_ga_stride)
00423 #endif
00424 
00425 //double precision,  18-reconstruct
00426 #define SITELINK0TEX siteLink0TexDouble
00427 #define SITELINK1TEX siteLink1TexDouble
00428 #if (SITE_MATRIX_LOAD_TEX == 1)
00429 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, site_ga_stride)
00430 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, site_ga_stride)
00431 #else
00432 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride)
00433 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride)
00434 #endif
00435 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink, dir, idx, var, site_ga_stride)
00436 #define RECONSTRUCT_SITE_LINK(sign, var)  
00437 #define FloatN double2
00438 #define FloatM double2
00439 #define RECONSTRUCT 18
00440 #define sd_data double_18_sd_data
00441 #include "llfat_core.h"
00442 #undef SITELINK0TEX
00443 #undef SITELINK1TEX
00444 #undef LOAD_EVEN_SITE_MATRIX
00445 #undef LOAD_ODD_SITE_MATRIX
00446 #undef LOAD_SITE_MATRIX
00447 #undef RECONSTRUCT_SITE_LINK
00448 #undef FloatN
00449 #undef FloatM
00450 #undef RECONSTRUCT
00451 #undef sd_data
00452 
00453 
00454 
00455 #if 1
00456 //double precision, 12-reconstruct
00457 #define SITELINK0TEX siteLink0TexDouble
00458 #define SITELINK1TEX siteLink1TexDouble
00459 #if (SITE_MATRIX_LOAD_TEX == 1)
00460 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, site_ga_stride)
00461 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, site_ga_stride)
00462 #else
00463 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride)
00464 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride)
00465 #endif
00466 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var, site_ga_stride)
00467 #define RECONSTRUCT_SITE_LINK(sign, var)  RECONSTRUCT_LINK_12(sign, var);
00468 #define FloatN double2
00469 #define FloatM double2
00470 #define RECONSTRUCT 12
00471 #define sd_data double_12_sd_data
00472 #include "llfat_core.h"
00473 #undef SITELINK0TEX
00474 #undef SITELINK1TEX
00475 #undef LOAD_EVEN_SITE_MATRIX
00476 #undef LOAD_ODD_SITE_MATRIX
00477 #undef LOAD_SITE_MATRIX
00478 #undef RECONSTRUCT_SITE_LINK
00479 #undef FloatN
00480 #undef FloatM
00481 #undef RECONSTRUCT
00482 #undef sd_data
00483 #endif
00484 
00485 #undef PRECISION
00486 #undef Float
00487 #undef LOAD_FAT_MATRIX
00488 #undef LOAD_EVEN_MULINK_MATRIX
00489 #undef LOAD_ODD_MULINK_MATRIX
00490 #undef LOAD_EVEN_FAT_MATRIX
00491 #undef LOAD_ODD_FAT_MATRIX
00492 
00493 #undef LLFAT_CONCAT
00494 #undef LLFAT_KERNEL
00495 
00496 #define UNBIND_ALL_TEXTURE do{                                          \
00497     if(prec ==QUDA_DOUBLE_PRECISION){                                   \
00498       cudaUnbindTexture(siteLink0TexDouble);                            \
00499       cudaUnbindTexture(siteLink1TexDouble);                            \
00500       cudaUnbindTexture(fatGauge0TexDouble);                            \
00501       cudaUnbindTexture(fatGauge1TexDouble);                            \
00502       cudaUnbindTexture(muLink0TexDouble);                              \
00503       cudaUnbindTexture(muLink1TexDouble);                              \
00504     }else{                                                              \
00505       if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){              \
00506         cudaUnbindTexture(siteLink0TexSingle_norecon);                  \
00507         cudaUnbindTexture(siteLink1TexSingle_norecon);                  \
00508       }else{                                                            \
00509         cudaUnbindTexture(siteLink0TexSingle_recon);                    \
00510         cudaUnbindTexture(siteLink1TexSingle_recon);                    \
00511       }                                                                 \
00512       cudaUnbindTexture(fatGauge0TexSingle);                            \
00513       cudaUnbindTexture(fatGauge1TexSingle);                            \
00514       cudaUnbindTexture(muLink0TexSingle);                              \
00515       cudaUnbindTexture(muLink1TexSingle);                              \
00516     }                                                                   \
00517   }while(0)
00518 
00519 #define UNBIND_SITE_AND_FAT_LINK do{                                    \
00520     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00521       cudaUnbindTexture(siteLink0TexDouble);                            \
00522       cudaUnbindTexture(siteLink1TexDouble);                            \
00523       cudaUnbindTexture(fatGauge0TexDouble);                            \
00524       cudaUnbindTexture(fatGauge1TexDouble);                            \
00525     }else {                                                             \
00526       if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){              \
00527         cudaUnbindTexture(siteLink0TexSingle_norecon);                  \
00528         cudaUnbindTexture(siteLink1TexSingle_norecon);                  \
00529       }else{                                                            \
00530         cudaUnbindTexture(siteLink0TexSingle_recon);                    \
00531         cudaUnbindTexture(siteLink1TexSingle_recon);                    \
00532       }                                                                 \
00533       cudaUnbindTexture(fatGauge0TexSingle);                            \
00534       cudaUnbindTexture(fatGauge1TexSingle);                            \
00535     }                                                                   \
00536   }while(0)
00537 
00538 
00539 #define BIND_MU_LINK() do{                                              \
00540     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00541       cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes);  \
00542       cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes);   \
00543     }else{                                                              \
00544       cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes);  \
00545       cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes);   \
00546     }                                                                   \
00547   }while(0)
00548 
00549 #define UNBIND_MU_LINK() do{                      \
00550     if(prec == QUDA_DOUBLE_PRECISION){            \
00551       cudaUnbindTexture(muLink0TexSingle);        \
00552       cudaUnbindTexture(muLink1TexSingle);        \
00553     }else{                                        \
00554       cudaUnbindTexture(muLink0TexDouble);        \
00555       cudaUnbindTexture(muLink1TexDouble);        \
00556     }                                             \
00557   }while(0)                
00558 
00559 
00560 #define BIND_SITE_AND_FAT_LINK do {                                     \
00561   if(prec == QUDA_DOUBLE_PRECISION){                                    \
00562     cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
00563     cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
00564     cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
00565     cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Odd_p(),  cudaFatLink.Bytes()); \
00566   }else{                                                                \
00567     if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){              \
00568       cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
00569       cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
00570     }else{                                                              \
00571       cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
00572       cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
00573     }                                                                   \
00574     cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
00575     cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Odd_p(),  cudaFatLink.Bytes()); \
00576     }                                                                   \
00577   }while(0)
00578 
00579 #define BIND_MU_LINK() do{                                              \
00580     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00581       cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes);  \
00582       cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes);   \
00583     }else{                                                              \
00584       cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes);  \
00585       cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes);   \
00586     }                                                                   \
00587   }while(0)
00588 
00589 #define UNBIND_MU_LINK() do{                                            \
00590     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00591       cudaUnbindTexture(muLink0TexSingle);                              \
00592       cudaUnbindTexture(muLink1TexSingle);                              \
00593     }else{                                                              \
00594       cudaUnbindTexture(muLink0TexDouble);                              \
00595       cudaUnbindTexture(muLink1TexDouble);                              \
00596     }                                                                   \
00597   }while(0)                                                             
00598 
00599 #define BIND_SITE_AND_FAT_LINK_REVERSE do {                             \
00600     if(prec == QUDA_DOUBLE_PRECISION){                                  \
00601       cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \
00602       cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \
00603       cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.even, cudaFatLink.bytes); \
00604       cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.odd,  cudaFatLink.bytes); \
00605     }else{                                                              \
00606       if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){              \
00607         cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \
00608         cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \
00609       }else{                                                            \
00610         cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.even, cudaSiteLink.bytes); \
00611         cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.odd, cudaSiteLink.bytes); \
00612       }                                                                 \
00613       cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.even, cudaFatLink.bytes); \
00614       cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.odd,  cudaFatLink.bytes); \
00615     }                                                                   \
00616   }while(0)
00617 
00618 
00619 
00620 #define ENUMERATE_FUNCS(mu,nu)  switch(mu) {                            \
00621   case 0:                                                               \
00622     switch(nu){                                                         \
00623     case 0:                                                             \
00624       printf("ERROR: invalid direction combination\n"); exit(1);        \
00625       break;                                                            \
00626     case 1:                                                             \
00627       CALL_FUNCTION(0,1);                                               \
00628       break;                                                            \
00629     case 2:                                                             \
00630       CALL_FUNCTION(0,2);                                               \
00631       break;                                                            \
00632     case 3:                                                             \
00633       CALL_FUNCTION(0,3);                                               \
00634       break;                                                            \
00635     }                                                                   \
00636     break;                                                              \
00637   case 1:                                                               \
00638     switch(nu){                                                         \
00639     case 0:                                                             \
00640       CALL_FUNCTION(1,0);                                               \
00641       break;                                                            \
00642     case 1:                                                             \
00643       printf("ERROR: invalid direction combination\n"); exit(1);        \
00644       break;                                                            \
00645     case 2:                                                             \
00646       CALL_FUNCTION(1,2);                                               \
00647       break;                                                            \
00648     case 3:                                                             \
00649       CALL_FUNCTION(1,3);                                               \
00650       break;                                                            \
00651     }                                                                   \
00652     break;                                                              \
00653   case 2:                                                               \
00654     switch(nu){                                                         \
00655     case 0:                                                             \
00656       CALL_FUNCTION(2,0);                                               \
00657       break;                                                            \
00658     case 1:                                                             \
00659       CALL_FUNCTION(2,1);                                               \
00660       break;                                                            \
00661     case 2:                                                             \
00662       printf("ERROR: invalid direction combination\n"); exit(1);        \
00663       break;                                                            \
00664     case 3:                                                             \
00665       CALL_FUNCTION(2,3);                                               \
00666       break;                                                            \
00667     }                                                                   \
00668     break;                                                              \
00669   case 3:                                                               \
00670     switch(nu){                                                         \
00671     case 0:                                                             \
00672       CALL_FUNCTION(3,0);                                               \
00673       break;                                                            \
00674     case 1:                                                             \
00675       CALL_FUNCTION(3,1);                                               \
00676       break;                                                            \
00677     case 2:                                                             \
00678       CALL_FUNCTION(3,2);                                               \
00679       break;                                                            \
00680     case 3:                                                             \
00681       printf("ERROR: invalid direction combination\n"); exit(1);        \
00682       break;                                                            \
00683     }                                                                   \
00684     break;                                                              \
00685   }
00686 
00687 #define ENUMERATE_FUNCS_SAVE(mu,nu, save_staple) if(save_staple){ \
00688     switch(mu) {                                                        \
00689     case 0:                                                             \
00690       switch(nu){                                                       \
00691       case 0:                                                           \
00692         printf("ERROR: invalid direction combination\n"); exit(1);      \
00693         break;                                                          \
00694       case 1:                                                           \
00695         CALL_FUNCTION(0,1,1);                                           \
00696         break;                                                          \
00697       case 2:                                                           \
00698         CALL_FUNCTION(0,2,1);                                           \
00699         break;                                                          \
00700       case 3:                                                           \
00701         CALL_FUNCTION(0,3,1);                                           \
00702         break;                                                          \
00703       }                                                                 \
00704       break;                                                            \
00705     case 1:                                                             \
00706       switch(nu){                                                       \
00707       case 0:                                                           \
00708         CALL_FUNCTION(1,0,1);                                           \
00709         break;                                                          \
00710       case 1:                                                           \
00711         printf("ERROR: invalid direction combination\n"); exit(1);      \
00712         break;                                                          \
00713       case 2:                                                           \
00714         CALL_FUNCTION(1,2,1);                                           \
00715         break;                                                          \
00716       case 3:                                                           \
00717         CALL_FUNCTION(1,3,1);                                           \
00718         break;                                                          \
00719       }                                                                 \
00720       break;                                                            \
00721     case 2:                                                             \
00722       switch(nu){                                                       \
00723       case 0:                                                           \
00724         CALL_FUNCTION(2,0,1);                                           \
00725         break;                                                          \
00726       case 1:                                                           \
00727         CALL_FUNCTION(2,1,1);                                           \
00728         break;                                                          \
00729       case 2:                                                           \
00730         printf("ERROR: invalid direction combination\n"); exit(1);      \
00731         break;                                                          \
00732       case 3:                                                           \
00733         CALL_FUNCTION(2,3,1);                                           \
00734         break;                                                          \
00735       }                                                                 \
00736       break;                                                            \
00737     case 3:                                                             \
00738       switch(nu){                                                       \
00739       case 0:                                                           \
00740         CALL_FUNCTION(3,0,1);                                           \
00741         break;                                                          \
00742       case 1:                                                           \
00743         CALL_FUNCTION(3,1,1);                                           \
00744         break;                                                          \
00745       case 2:                                                           \
00746         CALL_FUNCTION(3,2,1);                                           \
00747         break;                                                          \
00748       case 3:                                                           \
00749         printf("ERROR: invalid direction combination\n"); exit(1);      \
00750         break;                                                          \
00751       }                                                                 \
00752       break;                                                            \
00753     }                                                                   \
00754   }else{                                                                \
00755     switch(mu) {                                                        \
00756     case 0:                                                             \
00757       switch(nu){                                                       \
00758       case 0:                                                           \
00759         printf("ERROR: invalid direction combination\n"); exit(1);      \
00760         break;                                                          \
00761       case 1:                                                           \
00762         CALL_FUNCTION(0,1,0);                                           \
00763         break;                                                          \
00764       case 2:                                                           \
00765         CALL_FUNCTION(0,2,0);                                           \
00766         break;                                                          \
00767       case 3:                                                           \
00768         CALL_FUNCTION(0,3,0);                                           \
00769         break;                                                          \
00770       }                                                                 \
00771       break;                                                            \
00772     case 1:                                                             \
00773       switch(nu){                                                       \
00774       case 0:                                                           \
00775         CALL_FUNCTION(1,0,0);                                           \
00776         break;                                                          \
00777       case 1:                                                           \
00778         printf("ERROR: invalid direction combination\n"); exit(1);      \
00779         break;                                                          \
00780       case 2:                                                           \
00781         CALL_FUNCTION(1,2,0);                                           \
00782         break;                                                          \
00783       case 3:                                                           \
00784         CALL_FUNCTION(1,3,0);                                           \
00785         break;                                                          \
00786       }                                                                 \
00787       break;                                                            \
00788     case 2:                                                             \
00789       switch(nu){                                                       \
00790       case 0:                                                           \
00791         CALL_FUNCTION(2,0,0);                                           \
00792         break;                                                          \
00793       case 1:                                                           \
00794         CALL_FUNCTION(2,1,0);                                           \
00795         break;                                                          \
00796       case 2:                                                           \
00797         printf("ERROR: invalid direction combination\n"); exit(1);      \
00798         break;                                                          \
00799       case 3:                                                           \
00800         CALL_FUNCTION(2,3,0);                                           \
00801         break;                                                          \
00802       }                                                                 \
00803       break;                                                            \
00804     case 3:                                                             \
00805       switch(nu){                                                       \
00806       case 0:                                                           \
00807         CALL_FUNCTION(3,0,0);                                           \
00808         break;                                                          \
00809       case 1:                                                           \
00810         CALL_FUNCTION(3,1,0);                                           \
00811         break;                                                          \
00812       case 2:                                                           \
00813         CALL_FUNCTION(3,2,0);                                           \
00814         break;                                                          \
00815       case 3:                                                           \
00816         printf("ERROR: invalid direction combination\n"); exit(1);      \
00817         break;                                                          \
00818       }                                                                 \
00819       break;                                                            \
00820     }                                                                   \
00821   }
00822 
00823 void siteComputeGenStapleParityKernel(void* staple_even, void* staple_odd, 
00824                                       void* sitelink_even, void* sitelink_odd, 
00825                                       void* fatlink_even, void* fatlink_odd,    
00826                                       int mu, int nu, double mycoeff,
00827                                       QudaReconstructType recon, QudaPrecision prec,
00828                                       dim3 halfGridDim,  llfat_kernel_param_t kparam,
00829                                       cudaStream_t* stream)
00830 {
00831 
00832   //compute even and odd
00833   
00834 #define  CALL_FUNCTION(mu, nu)                                          \
00835   if (prec == QUDA_DOUBLE_PRECISION){                                   \
00836     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00837       do_siteComputeGenStapleParity18Kernel<mu,nu, 0>           \
00838         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
00839                                                 (double2*)sitelink_even, (double2*)sitelink_odd, \
00840                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
00841                                                 (double)mycoeff, kparam);       \
00842       do_siteComputeGenStapleParity18Kernel<mu,nu, 1>           \
00843         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
00844                                                 (double2*)sitelink_odd, (double2*)sitelink_even, \
00845                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
00846                                                 (double)mycoeff, kparam);       \
00847     }else{                                                              \
00848       do_siteComputeGenStapleParity12Kernel<mu,nu, 0>           \
00849         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
00850                                                 (double2*)sitelink_even, (double2*)sitelink_odd, \
00851                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
00852                                                 (double)mycoeff, kparam);       \
00853       do_siteComputeGenStapleParity12Kernel<mu,nu, 1>           \
00854         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
00855                                                 (double2*)sitelink_odd, (double2*)sitelink_even, \
00856                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
00857                                                 (double)mycoeff, kparam);       \
00858     }                                                                   \
00859   }else {                                                               \
00860     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00861       do_siteComputeGenStapleParity18Kernel<mu,nu, 0>           \
00862         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
00863                                                 (float2*)sitelink_even, (float2*)sitelink_odd, \
00864                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
00865                                                 (float)mycoeff, kparam);        \
00866       do_siteComputeGenStapleParity18Kernel<mu,nu, 1>           \
00867         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
00868                                                 (float2*)sitelink_odd, (float2*)sitelink_even, \
00869                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
00870                                                 (float)mycoeff, kparam); \
00871     }else{                                                              \
00872       do_siteComputeGenStapleParity12Kernel<mu,nu, 0>           \
00873         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
00874                                                 (float4*)sitelink_even, (float4*)sitelink_odd, \
00875                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
00876                                                 (float)mycoeff, kparam); \
00877       do_siteComputeGenStapleParity12Kernel<mu,nu, 1>           \
00878         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
00879                                                 (float4*)sitelink_odd, (float4*)sitelink_even, \
00880                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
00881                                                 (float)mycoeff, kparam); \
00882     }                                                                   \
00883   }
00884   
00885 
00886   dim3 blockDim(BLOCK_DIM , 1, 1);  
00887   ENUMERATE_FUNCS(mu,nu);  
00888 
00889 #undef CALL_FUNCTION
00890     
00891     
00892 }
00893 
00894 
00895 void
00896 computeGenStapleFieldParityKernel(void* staple_even, void* staple_odd, 
00897                                   void* sitelink_even, void* sitelink_odd,
00898                                   void* fatlink_even, void* fatlink_odd,                            
00899                                   void* mulink_even, void* mulink_odd, 
00900                                   int mu, int nu, int save_staple,
00901                                   double mycoeff,
00902                                   QudaReconstructType recon, QudaPrecision prec,
00903                                   dim3 halfGridDim, llfat_kernel_param_t kparam,
00904                                   cudaStream_t* stream)
00905 {
00906 
00907 #define  CALL_FUNCTION(mu, nu, save_staple)                             \
00908   if (prec == QUDA_DOUBLE_PRECISION){                                   \
00909     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00910       do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple>     \
00911         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
00912                                                 (double2*)sitelink_even, (double2*)sitelink_odd, \
00913                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
00914                                                 (double2*)mulink_even, (double2*)mulink_odd, \
00915                                                 (double)mycoeff, kparam); \
00916       do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
00917         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
00918                                                 (double2*)sitelink_odd, (double2*)sitelink_even, \
00919                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
00920                                                 (double2*)mulink_odd, (double2*)mulink_even, \
00921                                                 (double)mycoeff, kparam); \
00922     }else{                                                              \
00923       do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
00924         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
00925                                                 (double2*)sitelink_even, (double2*)sitelink_odd, \
00926                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
00927                                                 (double2*)mulink_even, (double2*)mulink_odd, \
00928                                                 (double)mycoeff, kparam); \
00929       do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
00930         <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
00931                                                 (double2*)sitelink_odd, (double2*)sitelink_even, \
00932                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
00933                                                 (double2*)mulink_odd, (double2*)mulink_even, \
00934                                                 (double)mycoeff, kparam); \
00935     }                                                                   \
00936   }else{                                                                \
00937     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00938       do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
00939         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
00940                                                 (float2*)sitelink_even, (float2*)sitelink_odd, \
00941                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
00942                                                 (float2*)mulink_even, (float2*)mulink_odd, \
00943                                                 (float)mycoeff, kparam); \
00944       do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
00945         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
00946                                                 (float2*)sitelink_odd, (float2*)sitelink_even, \
00947                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
00948                                                 (float2*)mulink_odd, (float2*)mulink_even, \
00949                                                 (float)mycoeff, kparam); \
00950     }else{                                                              \
00951       do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple>     \
00952         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
00953                                                 (float4*)sitelink_even, (float4*)sitelink_odd, \
00954                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
00955                                                 (float2*)mulink_even, (float2*)mulink_odd, \
00956                                                 (float)mycoeff, kparam); \
00957       do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
00958         <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
00959                                                 (float4*)sitelink_odd, (float4*)sitelink_even, \
00960                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
00961                                                 (float2*)mulink_odd, (float2*)mulink_even, \
00962                                                 (float)mycoeff, kparam); \
00963     }                                                                   \
00964   }
00965   
00966   BIND_MU_LINK();
00967   dim3 blockDim(BLOCK_DIM , 1, 1);
00968   ENUMERATE_FUNCS_SAVE(mu,nu,save_staple);
00969 
00970   UNBIND_MU_LINK();
00971 
00972 #undef CALL_FUNCTION 
00973     
00974 }
00975 
00976 
00977 void siteComputeGenStapleParityKernel_ex(void* staple_even, void* staple_odd, 
00978                                          void* sitelink_even, void* sitelink_odd, 
00979                                          void* fatlink_even, void* fatlink_odd, 
00980                                          int mu, int nu, double mycoeff,
00981                                          QudaReconstructType recon, QudaPrecision prec,
00982                                          llfat_kernel_param_t kparam)
00983 {
00984   
00985   //compute even and odd
00986   dim3 blockDim = kparam.blockDim;
00987   dim3 halfGridDim = kparam.halfGridDim;
00988   int sbytes_dp = blockDim.x*5*sizeof(double2);
00989   int sbytes_sp = blockDim.x*5*sizeof(float2);
00990   
00991 #define  CALL_FUNCTION(mu, nu)                                          \
00992   if (prec == QUDA_DOUBLE_PRECISION){                                   \
00993     if(recon == QUDA_RECONSTRUCT_NO){                                   \
00994       do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0>                \
00995         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
00996                                                (double2*)sitelink_even, (double2*)sitelink_odd, \
00997                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
00998                                                (double)mycoeff, kparam); \
00999       do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1>                \
01000         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
01001                                                (double2*)sitelink_odd, (double2*)sitelink_even, \
01002                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
01003                                                (double)mycoeff, kparam); \
01004     }else{                                                              \
01005       do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0>                \
01006         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
01007                                                (double2*)sitelink_even, (double2*)sitelink_odd, \
01008                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
01009                                                (double)mycoeff, kparam); \
01010       do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1>                \
01011         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
01012                                                (double2*)sitelink_odd, (double2*)sitelink_even, \
01013                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
01014                                                (double)mycoeff, kparam); \
01015     }                                                                   \
01016   }else {                                                               \
01017     if(recon == QUDA_RECONSTRUCT_NO){                                   \
01018       do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0>                \
01019         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
01020                                                (float2*)sitelink_even, (float2*)sitelink_odd, \
01021                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
01022                                                (float)mycoeff, kparam); \
01023       do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1>                \
01024         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
01025                                                (float2*)sitelink_odd, (float2*)sitelink_even, \
01026                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
01027                                                (float)mycoeff, kparam); \
01028     }else{                                                              \
01029       do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0>                \
01030         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
01031                                                (float4*)sitelink_even, (float4*)sitelink_odd, \
01032                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
01033                                                (float)mycoeff, kparam); \
01034       do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1>                \
01035         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
01036                                                (float4*)sitelink_odd, (float4*)sitelink_even, \
01037                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
01038                                                (float)mycoeff, kparam); \
01039     }                                                                   \
01040   }
01041   
01042   
01043   ENUMERATE_FUNCS(mu,nu);  
01044 
01045 #undef CALL_FUNCTION
01046     
01047     
01048 }
01049 
01050 
01051 
01052 void
01053 computeGenStapleFieldParityKernel_ex(void* staple_even, void* staple_odd, 
01054                                      void* sitelink_even, void* sitelink_odd,
01055                                      void* fatlink_even, void* fatlink_odd,                         
01056                                      void* mulink_even, void* mulink_odd, 
01057                                      int mu, int nu, int save_staple,
01058                                      double mycoeff,
01059                                      QudaReconstructType recon, QudaPrecision prec,
01060                                      llfat_kernel_param_t kparam)
01061 {
01062   
01063   dim3 blockDim = kparam.blockDim;
01064   dim3 halfGridDim= kparam.halfGridDim;
01065   
01066   int sbytes_dp = blockDim.x*5*sizeof(double2);
01067   int sbytes_sp = blockDim.x*5*sizeof(float2);
01068 
01069 #define  CALL_FUNCTION(mu, nu, save_staple)                             \
01070   if (prec == QUDA_DOUBLE_PRECISION){                                   \
01071     if(recon == QUDA_RECONSTRUCT_NO){                                   \
01072       do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple>  \
01073         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
01074                                                (double2*)sitelink_even, (double2*)sitelink_odd, \
01075                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
01076                                                (double2*)mulink_even, (double2*)mulink_odd, \
01077                                                (double)mycoeff, kparam); \
01078       do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple>  \
01079         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
01080                                                (double2*)sitelink_odd, (double2*)sitelink_even, \
01081                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
01082                                                (double2*)mulink_odd, (double2*)mulink_even, \
01083                                                (double)mycoeff, kparam); \
01084     }else{                                                              \
01085       do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple>  \
01086         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
01087                                                (double2*)sitelink_even, (double2*)sitelink_odd, \
01088                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
01089                                                (double2*)mulink_even, (double2*)mulink_odd, \
01090                                                (double)mycoeff, kparam); \
01091       do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple>  \
01092         <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
01093                                                (double2*)sitelink_odd, (double2*)sitelink_even, \
01094                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
01095                                                (double2*)mulink_odd, (double2*)mulink_even, \
01096                                                (double)mycoeff, kparam); \
01097     }                                                                   \
01098   }else{                                                                \
01099     if(recon == QUDA_RECONSTRUCT_NO){                                   \
01100       do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple>  \
01101         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
01102                                                (float2*)sitelink_even, (float2*)sitelink_odd, \
01103                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
01104                                                (float2*)mulink_even, (float2*)mulink_odd, \
01105                                                (float)mycoeff, kparam); \
01106       do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple>  \
01107         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
01108                                                (float2*)sitelink_odd, (float2*)sitelink_even, \
01109                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
01110                                                (float2*)mulink_odd, (float2*)mulink_even, \
01111                                                (float)mycoeff, kparam); \
01112     }else{                                                              \
01113       do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple>  \
01114         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
01115                                                (float4*)sitelink_even, (float4*)sitelink_odd, \
01116                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
01117                                                (float2*)mulink_even, (float2*)mulink_odd, \
01118                                                (float)mycoeff, kparam); \
01119       do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple>  \
01120         <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
01121                                                (float4*)sitelink_odd, (float4*)sitelink_even, \
01122                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
01123                                                (float2*)mulink_odd, (float2*)mulink_even, \
01124                                                (float)mycoeff, kparam); \
01125     }                                                                   \
01126   }
01127   
01128   BIND_MU_LINK();
01129   ENUMERATE_FUNCS_SAVE(mu,nu,save_staple);
01130 
01131   UNBIND_MU_LINK();
01132 
01133 #undef CALL_FUNCTION 
01134     
01135 }
01136 
01137 
01138 
01139 
01140 void llfatOneLinkKernel(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink,
01141                         cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
01142                         QudaGaugeParam* param, double* act_path_coeff)
01143 {  
01144   QudaPrecision prec = cudaSiteLink.Precision();
01145   QudaReconstructType recon = cudaSiteLink.Reconstruct();
01146   
01147   BIND_SITE_AND_FAT_LINK;
01148   int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];  
01149   dim3 gridDim(volume/BLOCK_DIM,1,1);
01150   dim3 blockDim(BLOCK_DIM , 1, 1);
01151 
01152   staple_bytes = cudaStaple.Bytes();
01153 
01154   if(prec == QUDA_DOUBLE_PRECISION){
01155     if(recon == QUDA_RECONSTRUCT_NO){
01156       llfatOneLink18Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(),
01157                                                   (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
01158                                                   (double)act_path_coeff[0], (double)act_path_coeff[5]);    
01159     }else{
01160       
01161       llfatOneLink12Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(),
01162                                                   (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
01163                                                   (double)act_path_coeff[0], (double)act_path_coeff[5]);    
01164       
01165     }
01166   }else{ //single precision
01167     if(recon == QUDA_RECONSTRUCT_NO){    
01168       llfatOneLink18Kernel<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(),
01169                                                   (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
01170                                                   (float)act_path_coeff[0], (float)act_path_coeff[5]);                                                    
01171     }else{
01172       llfatOneLink12Kernel<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(),
01173                                                   (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
01174                                                   (float)act_path_coeff[0], (float)act_path_coeff[5]);    
01175     }
01176   }
01177 }
01178 
01179 
01180 
01181 void llfatOneLinkKernel_ex(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink,
01182                            cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
01183                            QudaGaugeParam* param, double* act_path_coeff,
01184                            llfat_kernel_param_t kparam)
01185 {
01186   QudaPrecision prec = cudaSiteLink.Precision();
01187   QudaReconstructType recon = cudaSiteLink.Reconstruct();
01188 
01189   BIND_SITE_AND_FAT_LINK;
01190 
01191   dim3 gridDim;
01192   dim3 blockDim = kparam.blockDim;
01193   gridDim.x = 2* kparam.halfGridDim.x;
01194   gridDim.y = 1;
01195   gridDim.z = 1;
01196   staple_bytes = cudaStaple.Bytes();
01197 
01198   if(prec == QUDA_DOUBLE_PRECISION){
01199     if(recon == QUDA_RECONSTRUCT_NO){
01200       llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(),
01201                                                      (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
01202                                                      (double)act_path_coeff[0], (double)act_path_coeff[5], kparam);
01203     }else{
01204       
01205       llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(),
01206                                                      (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
01207                                                      (double)act_path_coeff[0], (double)act_path_coeff[5], kparam);
01208 
01209     }
01210   }else{ //single precision
01211     if(recon == QUDA_RECONSTRUCT_NO){
01212       llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(),
01213                                                      (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
01214                                                      (float)act_path_coeff[0], (float)act_path_coeff[5], kparam);
01215     }else{
01216       llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(),
01217                                                      (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
01218                                                      (float)act_path_coeff[0], (float)act_path_coeff[5], kparam);
01219     }
01220   }
01221 }
01222 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines