QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <stdio.h> 00002 #include <cuda_runtime.h> 00003 #include <cuda.h> 00004 00005 #include <quda_internal.h> 00006 #include <llfat_quda.h> 00007 #include <read_gauge.h> 00008 #include "gauge_field.h" 00009 #include <force_common.h> 00010 00011 #if (__COMPUTE_CAPABILITY__ >= 200) 00012 #define SITE_MATRIX_LOAD_TEX 1 00013 #define MULINK_LOAD_TEX 1 00014 #define FATLINK_LOAD_TEX 1 00015 #else 00016 #define SITE_MATRIX_LOAD_TEX 0 00017 #define MULINK_LOAD_TEX 1 00018 #define FATLINK_LOAD_TEX 1 00019 #endif 00020 00021 #define WRITE_FAT_MATRIX(gauge, dir, idx)do { \ 00022 gauge[idx + dir*9*llfat_ga_stride] = FAT0; \ 00023 gauge[idx + (dir*9+1) * llfat_ga_stride] = FAT1; \ 00024 gauge[idx + (dir*9+2) * llfat_ga_stride] = FAT2; \ 00025 gauge[idx + (dir*9+3) * llfat_ga_stride] = FAT3; \ 00026 gauge[idx + (dir*9+4) * llfat_ga_stride] = FAT4; \ 00027 gauge[idx + (dir*9+5) * llfat_ga_stride] = FAT5; \ 00028 gauge[idx + (dir*9+6) * llfat_ga_stride] = FAT6; \ 00029 gauge[idx + (dir*9+7) * llfat_ga_stride] = FAT7; \ 00030 gauge[idx + (dir*9+8) * llfat_ga_stride] = FAT8;} while(0) 00031 00032 00033 #define WRITE_STAPLE_MATRIX(gauge, idx) \ 00034 gauge[idx] = STAPLE0; \ 00035 gauge[idx + staple_stride] = STAPLE1; \ 00036 gauge[idx + 2*staple_stride] = STAPLE2; \ 00037 gauge[idx + 3*staple_stride] = STAPLE3; \ 00038 gauge[idx + 4*staple_stride] = STAPLE4; \ 00039 gauge[idx + 5*staple_stride] = STAPLE5; \ 00040 gauge[idx + 6*staple_stride] = STAPLE6; \ 00041 gauge[idx + 7*staple_stride] = STAPLE7; \ 00042 gauge[idx + 8*staple_stride] = STAPLE8; 00043 00044 00045 #define SCALAR_MULT_SU3_MATRIX(a, b, c) \ 00046 c##00_re = a*b##00_re; \ 00047 c##00_im = a*b##00_im; \ 00048 c##01_re = a*b##01_re; \ 00049 c##01_im = a*b##01_im; \ 00050 c##02_re = a*b##02_re; \ 00051 c##02_im = a*b##02_im; \ 00052 c##10_re = a*b##10_re; \ 00053 c##10_im = a*b##10_im; \ 00054 c##11_re = a*b##11_re; \ 00055 c##11_im = a*b##11_im; \ 00056 c##12_re = a*b##12_re; \ 00057 c##12_im = a*b##12_im; \ 00058 c##20_re = a*b##20_re; \ 00059 c##20_im = a*b##20_im; \ 00060 c##21_re = a*b##21_re; \ 00061 c##21_im = a*b##21_im; \ 00062 c##22_re = a*b##22_re; \ 00063 c##22_im = a*b##22_im; \ 00064 00065 /* 00066 #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride) \ 00067 float2 var##0 = gauge[idx + dir*6*stride]; \ 00068 float2 var##1 = gauge[idx + dir*6*stride + stride]; \ 00069 float2 var##2 = gauge[idx + dir*6*stride + 2*stride]; \ 00070 float2 var##3 = gauge[idx + dir*6*stride + 3*stride]; \ 00071 float2 var##4 = gauge[idx + dir*6*stride + 4*stride]; \ 00072 float2 var##5 = gauge[idx + dir*6*stride + 5*stride]; \ 00073 float2 var##6, var##7, var##8; 00074 00075 #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \ 00076 float2 var##0 = tex1Dfetch(gauge, idx + dir*6*stride); \ 00077 float2 var##1 = tex1Dfetch(gauge, idx + dir*6*stride + stride); \ 00078 float2 var##2 = tex1Dfetch(gauge, idx + dir*6*stride + 2*stride); \ 00079 float2 var##3 = tex1Dfetch(gauge, idx + dir*6*stride + 3*stride); \ 00080 float2 var##4 = tex1Dfetch(gauge, idx + dir*6*stride + 4*stride); \ 00081 float2 var##5 = tex1Dfetch(gauge, idx + dir*6*stride + 5*stride); \ 00082 float2 var##6, var##7, var##8; 00083 */ 00084 #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride) \ 00085 float4 var##0 = gauge[idx + dir*3*stride]; \ 00086 float4 var##1 = gauge[idx + dir*3*stride + stride]; \ 00087 float4 var##2 = gauge[idx + dir*3*stride + 2*stride]; \ 00088 float4 var##3, var##4; 00089 00090 #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \ 00091 float4 var##0 = tex1Dfetch(gauge, idx + dir*3*stride); \ 00092 float4 var##1 = tex1Dfetch(gauge, idx + dir*3*stride + stride); \ 00093 float4 var##2 = tex1Dfetch(gauge, idx + dir*3*stride + 2*stride); \ 00094 float4 var##3, var##4; 00095 00096 #define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var, stride) \ 00097 float2 var##0 = gauge[idx + dir*9*stride]; \ 00098 float2 var##1 = gauge[idx + dir*9*stride + stride]; \ 00099 float2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \ 00100 float2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \ 00101 float2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \ 00102 float2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \ 00103 float2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \ 00104 float2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \ 00105 float2 var##8 = gauge[idx + dir*9*stride + 8*stride]; 00106 00107 00108 #define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \ 00109 float2 var##0 = tex1Dfetch(gauge, idx + dir*9*stride); \ 00110 float2 var##1 = tex1Dfetch(gauge, idx + dir*9*stride + stride); \ 00111 float2 var##2 = tex1Dfetch(gauge, idx + dir*9*stride + 2*stride); \ 00112 float2 var##3 = tex1Dfetch(gauge, idx + dir*9*stride + 3*stride); \ 00113 float2 var##4 = tex1Dfetch(gauge, idx + dir*9*stride + 4*stride); \ 00114 float2 var##5 = tex1Dfetch(gauge, idx + dir*9*stride + 5*stride); \ 00115 float2 var##6 = tex1Dfetch(gauge, idx + dir*9*stride + 6*stride); \ 00116 float2 var##7 = tex1Dfetch(gauge, idx + dir*9*stride + 7*stride); \ 00117 float2 var##8 = tex1Dfetch(gauge, idx + dir*9*stride + 8*stride); 00118 00119 00120 00121 #define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \ 00122 double2 var##0 = gauge[idx + dir*9*stride]; \ 00123 double2 var##1 = gauge[idx + dir*9*stride + stride]; \ 00124 double2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \ 00125 double2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \ 00126 double2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \ 00127 double2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \ 00128 double2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \ 00129 double2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \ 00130 double2 var##8 = gauge[idx + dir*9*stride + 8*stride]; 00131 00132 00133 #define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \ 00134 double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride); \ 00135 double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + stride); \ 00136 double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 2*stride); \ 00137 double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 3*stride); \ 00138 double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 4*stride); \ 00139 double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 5*stride); \ 00140 double2 var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 6*stride); \ 00141 double2 var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 7*stride); \ 00142 double2 var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 8*stride); 00143 00144 00145 #define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \ 00146 double2 var##0 = gauge[idx + dir*6*stride]; \ 00147 double2 var##1 = gauge[idx + dir*6*stride + stride]; \ 00148 double2 var##2 = gauge[idx + dir*6*stride + 2*stride]; \ 00149 double2 var##3 = gauge[idx + dir*6*stride + 3*stride]; \ 00150 double2 var##4 = gauge[idx + dir*6*stride + 4*stride]; \ 00151 double2 var##5 = gauge[idx + dir*6*stride + 5*stride]; \ 00152 double2 var##6, var##7, var##8; 00153 00154 00155 #define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \ 00156 double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride); \ 00157 double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + stride); \ 00158 double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 2*stride); \ 00159 double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 3*stride); \ 00160 double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 4*stride); \ 00161 double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 5*stride); \ 00162 double2 var##6, var##7, var##8; 00163 00164 #define LLFAT_ADD_SU3_MATRIX(ma, mb, mc) \ 00165 mc##00_re = ma##00_re + mb##00_re; \ 00166 mc##00_im = ma##00_im + mb##00_im; \ 00167 mc##01_re = ma##01_re + mb##01_re; \ 00168 mc##01_im = ma##01_im + mb##01_im; \ 00169 mc##02_re = ma##02_re + mb##02_re; \ 00170 mc##02_im = ma##02_im + mb##02_im; \ 00171 mc##10_re = ma##10_re + mb##10_re; \ 00172 mc##10_im = ma##10_im + mb##10_im; \ 00173 mc##11_re = ma##11_re + mb##11_re; \ 00174 mc##11_im = ma##11_im + mb##11_im; \ 00175 mc##12_re = ma##12_re + mb##12_re; \ 00176 mc##12_im = ma##12_im + mb##12_im; \ 00177 mc##20_re = ma##20_re + mb##20_re; \ 00178 mc##20_im = ma##20_im + mb##20_im; \ 00179 mc##21_re = ma##21_re + mb##21_re; \ 00180 mc##21_im = ma##21_im + mb##21_im; \ 00181 mc##22_re = ma##22_re + mb##22_re; \ 00182 mc##22_im = ma##22_im + mb##22_im; 00183 00184 __constant__ int dir1_array[16]; 00185 __constant__ int dir2_array[16]; 00186 __constant__ int last_proc_in_tdim; 00187 __constant__ int first_proc_in_tdim; 00188 00189 unsigned long staple_bytes=0; 00190 00191 void 00192 llfat_init_cuda(QudaGaugeParam* param) 00193 { 00194 static int llfat_init_cuda_flag = 0; 00195 if (llfat_init_cuda_flag){ 00196 return; 00197 } 00198 00199 llfat_init_cuda_flag = 1; 00200 00201 init_kernel_cuda(param); 00202 int Vh = param->X[0]*param->X[1]*param->X[2]*param->X[3]/2; 00203 int site_ga_stride = param->site_ga_pad + Vh; 00204 int staple_stride = param->staple_pad + Vh; 00205 int llfat_ga_stride = param->llfat_ga_pad + Vh; 00206 00207 cudaMemcpyToSymbol("site_ga_stride", &site_ga_stride, sizeof(int)); 00208 cudaMemcpyToSymbol("staple_stride", &staple_stride, sizeof(int)); 00209 cudaMemcpyToSymbol("llfat_ga_stride", &llfat_ga_stride, sizeof(int)); 00210 int dir1[16]; 00211 int dir2[16]; 00212 for(int nu =0; nu < 4; nu++) 00213 for(int mu=0; mu < 4; mu++){ 00214 if(nu == mu) continue; 00215 int d1, d2; 00216 for(d1=0; d1 < 4; d1 ++){ 00217 if(d1 != nu && d1 != mu){ 00218 break; 00219 } 00220 } 00221 dir1[nu*4+mu] = d1; 00222 00223 for(d2=0; d2 < 4; d2 ++){ 00224 if(d2 != nu && d2 != mu && d2 != d1){ 00225 break; 00226 } 00227 } 00228 00229 dir2[nu*4+mu] = d2; 00230 } 00231 00232 cudaMemcpyToSymbol("dir1_array", &dir1, sizeof(dir1)); 00233 cudaMemcpyToSymbol("dir2_array", &dir2, sizeof(dir2)); 00234 00235 int first_proc_in_tdim = 0; 00236 int last_proc_in_tdim = 0; 00237 if(commCoords(3) == (commDim(3) -1)){ 00238 last_proc_in_tdim = 1; 00239 } 00240 00241 if(commCoords(3) == 0){ 00242 first_proc_in_tdim = 1; 00243 } 00244 00245 cudaMemcpyToSymbol("last_proc_in_tdim", &last_proc_in_tdim, sizeof(int)); 00246 cudaMemcpyToSymbol("first_proc_in_tdim", &first_proc_in_tdim, sizeof(int)); 00247 00248 } 00249 00250 00251 void 00252 llfat_init_cuda_ex(QudaGaugeParam* param_ex) 00253 { 00254 static int llfat_init_cuda_flag = 0; 00255 if (llfat_init_cuda_flag){ 00256 return; 00257 } 00258 00259 llfat_init_cuda_flag = 1; 00260 00261 init_kernel_cuda(param_ex); 00262 int Vh_ex = param_ex->X[0]*param_ex->X[1]*param_ex->X[2]*param_ex->X[3]/2; 00263 int Vh = (param_ex->X[0]-4)*(param_ex->X[1]-4)*(param_ex->X[2]-4)*(param_ex->X[3]-4)/2; 00264 int site_ga_stride = param_ex->site_ga_pad + Vh_ex; 00265 int staple_stride = param_ex->staple_pad + Vh_ex; 00266 int llfat_ga_stride = param_ex->llfat_ga_pad + Vh; 00267 00268 cudaMemcpyToSymbol("site_ga_stride", &site_ga_stride, sizeof(int)); 00269 cudaMemcpyToSymbol("staple_stride", &staple_stride, sizeof(int)); 00270 cudaMemcpyToSymbol("llfat_ga_stride", &llfat_ga_stride, sizeof(int)); 00271 00272 int first_proc_in_tdim = 0; 00273 int last_proc_in_tdim = 0; 00274 if(commCoords(3) == (commDim(3) -1)){ 00275 last_proc_in_tdim = 1; 00276 } 00277 00278 if(commCoords(3) == 0){ 00279 first_proc_in_tdim = 1; 00280 } 00281 00282 cudaMemcpyToSymbol("last_proc_in_tdim", &last_proc_in_tdim, sizeof(int)); 00283 cudaMemcpyToSymbol("first_proc_in_tdim", &first_proc_in_tdim, sizeof(int)); 00284 00285 int E1 = param_ex->X[0]; 00286 int E1h = E1/2; 00287 int E2 = param_ex->X[1]; 00288 int E3 = param_ex->X[2]; 00289 int E4 = param_ex->X[3]; 00290 int E2E1 =E2*E1; 00291 int E3E2E1=E3*E2*E1; 00292 00293 cudaMemcpyToSymbol("E1", &E1, sizeof(int)); 00294 cudaMemcpyToSymbol("E1h", &E1h, sizeof(int)); 00295 cudaMemcpyToSymbol("E2", &E2, sizeof(int)); 00296 cudaMemcpyToSymbol("E3", &E3, sizeof(int)); 00297 cudaMemcpyToSymbol("E4", &E4, sizeof(int)); 00298 cudaMemcpyToSymbol("E2E1", &E2E1, sizeof(int)); 00299 cudaMemcpyToSymbol("E3E2E1", &E3E2E1, sizeof(int)); 00300 00301 cudaMemcpyToSymbol("Vh_ex", &Vh_ex, sizeof(int)); 00302 00303 } 00304 00305 00306 00307 00308 #define LLFAT_CONCAT(a,b) a##b##Kernel 00309 #define LLFAT_CONCAT_EX(a,b) a##b##Kernel_ex 00310 #define LLFAT_KERNEL(a,b) LLFAT_CONCAT(a,b) 00311 #define LLFAT_KERNEL_EX(a,b) LLFAT_CONCAT_EX(a,b) 00312 00313 //precision: 0 is for double, 1 is for single 00314 00315 //single precision, common macro 00316 #define PRECISION 1 00317 #define Float float 00318 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, FAT, llfat_ga_stride) 00319 #if (MULINK_LOAD_TEX == 1) 00320 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink1TexSingle:muLink0TexSingle), dir, idx, var, staple_stride) 00321 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink0TexSingle:muLink1TexSingle), dir, idx, var, staple_stride) 00322 #else 00323 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_even, dir, idx, var, staple_stride) 00324 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_odd, dir, idx, var, staple_stride) 00325 #endif 00326 00327 #if (FATLINK_LOAD_TEX == 1) 00328 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge1TexSingle:fatGauge0TexSingle), dir, idx, FAT, llfat_ga_stride); 00329 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge0TexSingle:fatGauge1TexSingle), dir, idx, FAT, llfat_ga_stride); 00330 #else 00331 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_even, dir, idx, FAT, llfat_ga_stride) 00332 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_odd, dir, idx, FAT, llfat_ga_stride) 00333 #endif 00334 00335 00336 //single precision, 12-reconstruct 00337 #define DECLARE_VAR_SIGN short sign=1 00338 #define SITELINK0TEX siteLink0TexSingle_recon 00339 #define SITELINK1TEX siteLink1TexSingle_recon 00340 #if (SITE_MATRIX_LOAD_TEX == 1) 00341 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, site_ga_stride) 00342 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, site_ga_stride) 00343 #else 00344 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride) 00345 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride) 00346 #endif 00347 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var, site_ga_stride) 00348 00349 #define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var); 00350 #define FloatN float4 00351 #define FloatM float2 00352 #define RECONSTRUCT 12 00353 #define sd_data float_12_sd_data 00354 #include "llfat_core.h" 00355 #undef DECLARE_VAR_SIGN 00356 #undef SITELINK0TEX 00357 #undef SITELINK1TEX 00358 #undef LOAD_EVEN_SITE_MATRIX 00359 #undef LOAD_ODD_SITE_MATRIX 00360 #undef LOAD_SITE_MATRIX 00361 #undef RECONSTRUCT_SITE_LINK 00362 #undef FloatN 00363 #undef FloatM 00364 #undef RECONSTRUCT 00365 #undef sd_data 00366 00367 //single precision, 18-reconstruct 00368 #define SITELINK0TEX siteLink0TexSingle_norecon 00369 #define SITELINK1TEX siteLink1TexSingle_norecon 00370 #if (SITE_MATRIX_LOAD_TEX == 1) 00371 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, site_ga_stride) 00372 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, site_ga_stride) 00373 #else 00374 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride) 00375 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride) 00376 #endif 00377 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink, dir, idx, var, site_ga_stride) 00378 #define RECONSTRUCT_SITE_LINK(sign, var) 00379 #define FloatN float2 00380 #define FloatM float2 00381 #define RECONSTRUCT 18 00382 #define sd_data float_18_sd_data 00383 #include "llfat_core.h" 00384 #undef SITELINK0TEX 00385 #undef SITELINK1TEX 00386 #undef LOAD_EVEN_SITE_MATRIX 00387 #undef LOAD_ODD_SITE_MATRIX 00388 #undef LOAD_SITE_MATRIX 00389 #undef RECONSTRUCT_SITE_LINK 00390 #undef FloatN 00391 #undef FloatM 00392 #undef RECONSTRUCT 00393 #undef sd_data 00394 00395 00396 #undef PRECISION 00397 #undef Float 00398 #undef LOAD_FAT_MATRIX 00399 #undef LOAD_EVEN_MULINK_MATRIX 00400 #undef LOAD_ODD_MULINK_MATRIX 00401 #undef LOAD_EVEN_FAT_MATRIX 00402 #undef LOAD_ODD_FAT_MATRIX 00403 00404 00405 //double precision, common macro 00406 #define PRECISION 0 00407 #define Float double 00408 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, FAT, llfat_ga_stride) 00409 #if (MULINK_LOAD_TEX == 1) 00410 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(odd_bit?muLink1TexDouble:muLink0TexDouble), mulink_even, dir, idx, var, staple_stride) 00411 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink0TexDouble:muLink1TexDouble), mulink_odd, dir, idx, var, staple_stride) 00412 #else 00413 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_even, dir, idx, var, staple_stride) 00414 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_odd, dir, idx, var, staple_stride) 00415 #endif 00416 00417 #if (FATLINK_LOAD_TEX == 1) 00418 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge1TexDouble:fatGauge0TexDouble), fatlink_even, dir, idx, FAT, llfat_ga_stride) 00419 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge0TexDouble:fatGauge1TexDouble), fatlink_odd, dir, idx, FAT, llfat_ga_stride) 00420 #else 00421 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_even, dir, idx, FAT, llfat_ga_stride) 00422 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_odd, dir, idx, FAT, llfat_ga_stride) 00423 #endif 00424 00425 //double precision, 18-reconstruct 00426 #define SITELINK0TEX siteLink0TexDouble 00427 #define SITELINK1TEX siteLink1TexDouble 00428 #if (SITE_MATRIX_LOAD_TEX == 1) 00429 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, site_ga_stride) 00430 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, site_ga_stride) 00431 #else 00432 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride) 00433 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride) 00434 #endif 00435 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink, dir, idx, var, site_ga_stride) 00436 #define RECONSTRUCT_SITE_LINK(sign, var) 00437 #define FloatN double2 00438 #define FloatM double2 00439 #define RECONSTRUCT 18 00440 #define sd_data double_18_sd_data 00441 #include "llfat_core.h" 00442 #undef SITELINK0TEX 00443 #undef SITELINK1TEX 00444 #undef LOAD_EVEN_SITE_MATRIX 00445 #undef LOAD_ODD_SITE_MATRIX 00446 #undef LOAD_SITE_MATRIX 00447 #undef RECONSTRUCT_SITE_LINK 00448 #undef FloatN 00449 #undef FloatM 00450 #undef RECONSTRUCT 00451 #undef sd_data 00452 00453 00454 00455 #if 1 00456 //double precision, 12-reconstruct 00457 #define SITELINK0TEX siteLink0TexDouble 00458 #define SITELINK1TEX siteLink1TexDouble 00459 #if (SITE_MATRIX_LOAD_TEX == 1) 00460 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, site_ga_stride) 00461 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, site_ga_stride) 00462 #else 00463 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var, site_ga_stride) 00464 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, site_ga_stride) 00465 #endif 00466 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var, site_ga_stride) 00467 #define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var); 00468 #define FloatN double2 00469 #define FloatM double2 00470 #define RECONSTRUCT 12 00471 #define sd_data double_12_sd_data 00472 #include "llfat_core.h" 00473 #undef SITELINK0TEX 00474 #undef SITELINK1TEX 00475 #undef LOAD_EVEN_SITE_MATRIX 00476 #undef LOAD_ODD_SITE_MATRIX 00477 #undef LOAD_SITE_MATRIX 00478 #undef RECONSTRUCT_SITE_LINK 00479 #undef FloatN 00480 #undef FloatM 00481 #undef RECONSTRUCT 00482 #undef sd_data 00483 #endif 00484 00485 #undef PRECISION 00486 #undef Float 00487 #undef LOAD_FAT_MATRIX 00488 #undef LOAD_EVEN_MULINK_MATRIX 00489 #undef LOAD_ODD_MULINK_MATRIX 00490 #undef LOAD_EVEN_FAT_MATRIX 00491 #undef LOAD_ODD_FAT_MATRIX 00492 00493 #undef LLFAT_CONCAT 00494 #undef LLFAT_KERNEL 00495 00496 #define UNBIND_ALL_TEXTURE do{ \ 00497 if(prec ==QUDA_DOUBLE_PRECISION){ \ 00498 cudaUnbindTexture(siteLink0TexDouble); \ 00499 cudaUnbindTexture(siteLink1TexDouble); \ 00500 cudaUnbindTexture(fatGauge0TexDouble); \ 00501 cudaUnbindTexture(fatGauge1TexDouble); \ 00502 cudaUnbindTexture(muLink0TexDouble); \ 00503 cudaUnbindTexture(muLink1TexDouble); \ 00504 }else{ \ 00505 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00506 cudaUnbindTexture(siteLink0TexSingle_norecon); \ 00507 cudaUnbindTexture(siteLink1TexSingle_norecon); \ 00508 }else{ \ 00509 cudaUnbindTexture(siteLink0TexSingle_recon); \ 00510 cudaUnbindTexture(siteLink1TexSingle_recon); \ 00511 } \ 00512 cudaUnbindTexture(fatGauge0TexSingle); \ 00513 cudaUnbindTexture(fatGauge1TexSingle); \ 00514 cudaUnbindTexture(muLink0TexSingle); \ 00515 cudaUnbindTexture(muLink1TexSingle); \ 00516 } \ 00517 }while(0) 00518 00519 #define UNBIND_SITE_AND_FAT_LINK do{ \ 00520 if(prec == QUDA_DOUBLE_PRECISION){ \ 00521 cudaUnbindTexture(siteLink0TexDouble); \ 00522 cudaUnbindTexture(siteLink1TexDouble); \ 00523 cudaUnbindTexture(fatGauge0TexDouble); \ 00524 cudaUnbindTexture(fatGauge1TexDouble); \ 00525 }else { \ 00526 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00527 cudaUnbindTexture(siteLink0TexSingle_norecon); \ 00528 cudaUnbindTexture(siteLink1TexSingle_norecon); \ 00529 }else{ \ 00530 cudaUnbindTexture(siteLink0TexSingle_recon); \ 00531 cudaUnbindTexture(siteLink1TexSingle_recon); \ 00532 } \ 00533 cudaUnbindTexture(fatGauge0TexSingle); \ 00534 cudaUnbindTexture(fatGauge1TexSingle); \ 00535 } \ 00536 }while(0) 00537 00538 00539 #define BIND_MU_LINK() do{ \ 00540 if(prec == QUDA_DOUBLE_PRECISION){ \ 00541 cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes); \ 00542 cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes); \ 00543 }else{ \ 00544 cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes); \ 00545 cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes); \ 00546 } \ 00547 }while(0) 00548 00549 #define UNBIND_MU_LINK() do{ \ 00550 if(prec == QUDA_DOUBLE_PRECISION){ \ 00551 cudaUnbindTexture(muLink0TexSingle); \ 00552 cudaUnbindTexture(muLink1TexSingle); \ 00553 }else{ \ 00554 cudaUnbindTexture(muLink0TexDouble); \ 00555 cudaUnbindTexture(muLink1TexDouble); \ 00556 } \ 00557 }while(0) 00558 00559 00560 #define BIND_SITE_AND_FAT_LINK do { \ 00561 if(prec == QUDA_DOUBLE_PRECISION){ \ 00562 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \ 00563 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \ 00564 cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \ 00565 cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \ 00566 }else{ \ 00567 if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \ 00568 cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \ 00569 cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \ 00570 }else{ \ 00571 cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \ 00572 cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \ 00573 } \ 00574 cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \ 00575 cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \ 00576 } \ 00577 }while(0) 00578 00579 #define BIND_MU_LINK() do{ \ 00580 if(prec == QUDA_DOUBLE_PRECISION){ \ 00581 cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes); \ 00582 cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes); \ 00583 }else{ \ 00584 cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes); \ 00585 cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes); \ 00586 } \ 00587 }while(0) 00588 00589 #define UNBIND_MU_LINK() do{ \ 00590 if(prec == QUDA_DOUBLE_PRECISION){ \ 00591 cudaUnbindTexture(muLink0TexSingle); \ 00592 cudaUnbindTexture(muLink1TexSingle); \ 00593 }else{ \ 00594 cudaUnbindTexture(muLink0TexDouble); \ 00595 cudaUnbindTexture(muLink1TexDouble); \ 00596 } \ 00597 }while(0) 00598 00599 #define BIND_SITE_AND_FAT_LINK_REVERSE do { \ 00600 if(prec == QUDA_DOUBLE_PRECISION){ \ 00601 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \ 00602 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00603 cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.even, cudaFatLink.bytes); \ 00604 cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.odd, cudaFatLink.bytes); \ 00605 }else{ \ 00606 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00607 cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \ 00608 cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00609 }else{ \ 00610 cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.even, cudaSiteLink.bytes); \ 00611 cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00612 } \ 00613 cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.even, cudaFatLink.bytes); \ 00614 cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.odd, cudaFatLink.bytes); \ 00615 } \ 00616 }while(0) 00617 00618 00619 00620 #define ENUMERATE_FUNCS(mu,nu) switch(mu) { \ 00621 case 0: \ 00622 switch(nu){ \ 00623 case 0: \ 00624 printf("ERROR: invalid direction combination\n"); exit(1); \ 00625 break; \ 00626 case 1: \ 00627 CALL_FUNCTION(0,1); \ 00628 break; \ 00629 case 2: \ 00630 CALL_FUNCTION(0,2); \ 00631 break; \ 00632 case 3: \ 00633 CALL_FUNCTION(0,3); \ 00634 break; \ 00635 } \ 00636 break; \ 00637 case 1: \ 00638 switch(nu){ \ 00639 case 0: \ 00640 CALL_FUNCTION(1,0); \ 00641 break; \ 00642 case 1: \ 00643 printf("ERROR: invalid direction combination\n"); exit(1); \ 00644 break; \ 00645 case 2: \ 00646 CALL_FUNCTION(1,2); \ 00647 break; \ 00648 case 3: \ 00649 CALL_FUNCTION(1,3); \ 00650 break; \ 00651 } \ 00652 break; \ 00653 case 2: \ 00654 switch(nu){ \ 00655 case 0: \ 00656 CALL_FUNCTION(2,0); \ 00657 break; \ 00658 case 1: \ 00659 CALL_FUNCTION(2,1); \ 00660 break; \ 00661 case 2: \ 00662 printf("ERROR: invalid direction combination\n"); exit(1); \ 00663 break; \ 00664 case 3: \ 00665 CALL_FUNCTION(2,3); \ 00666 break; \ 00667 } \ 00668 break; \ 00669 case 3: \ 00670 switch(nu){ \ 00671 case 0: \ 00672 CALL_FUNCTION(3,0); \ 00673 break; \ 00674 case 1: \ 00675 CALL_FUNCTION(3,1); \ 00676 break; \ 00677 case 2: \ 00678 CALL_FUNCTION(3,2); \ 00679 break; \ 00680 case 3: \ 00681 printf("ERROR: invalid direction combination\n"); exit(1); \ 00682 break; \ 00683 } \ 00684 break; \ 00685 } 00686 00687 #define ENUMERATE_FUNCS_SAVE(mu,nu, save_staple) if(save_staple){ \ 00688 switch(mu) { \ 00689 case 0: \ 00690 switch(nu){ \ 00691 case 0: \ 00692 printf("ERROR: invalid direction combination\n"); exit(1); \ 00693 break; \ 00694 case 1: \ 00695 CALL_FUNCTION(0,1,1); \ 00696 break; \ 00697 case 2: \ 00698 CALL_FUNCTION(0,2,1); \ 00699 break; \ 00700 case 3: \ 00701 CALL_FUNCTION(0,3,1); \ 00702 break; \ 00703 } \ 00704 break; \ 00705 case 1: \ 00706 switch(nu){ \ 00707 case 0: \ 00708 CALL_FUNCTION(1,0,1); \ 00709 break; \ 00710 case 1: \ 00711 printf("ERROR: invalid direction combination\n"); exit(1); \ 00712 break; \ 00713 case 2: \ 00714 CALL_FUNCTION(1,2,1); \ 00715 break; \ 00716 case 3: \ 00717 CALL_FUNCTION(1,3,1); \ 00718 break; \ 00719 } \ 00720 break; \ 00721 case 2: \ 00722 switch(nu){ \ 00723 case 0: \ 00724 CALL_FUNCTION(2,0,1); \ 00725 break; \ 00726 case 1: \ 00727 CALL_FUNCTION(2,1,1); \ 00728 break; \ 00729 case 2: \ 00730 printf("ERROR: invalid direction combination\n"); exit(1); \ 00731 break; \ 00732 case 3: \ 00733 CALL_FUNCTION(2,3,1); \ 00734 break; \ 00735 } \ 00736 break; \ 00737 case 3: \ 00738 switch(nu){ \ 00739 case 0: \ 00740 CALL_FUNCTION(3,0,1); \ 00741 break; \ 00742 case 1: \ 00743 CALL_FUNCTION(3,1,1); \ 00744 break; \ 00745 case 2: \ 00746 CALL_FUNCTION(3,2,1); \ 00747 break; \ 00748 case 3: \ 00749 printf("ERROR: invalid direction combination\n"); exit(1); \ 00750 break; \ 00751 } \ 00752 break; \ 00753 } \ 00754 }else{ \ 00755 switch(mu) { \ 00756 case 0: \ 00757 switch(nu){ \ 00758 case 0: \ 00759 printf("ERROR: invalid direction combination\n"); exit(1); \ 00760 break; \ 00761 case 1: \ 00762 CALL_FUNCTION(0,1,0); \ 00763 break; \ 00764 case 2: \ 00765 CALL_FUNCTION(0,2,0); \ 00766 break; \ 00767 case 3: \ 00768 CALL_FUNCTION(0,3,0); \ 00769 break; \ 00770 } \ 00771 break; \ 00772 case 1: \ 00773 switch(nu){ \ 00774 case 0: \ 00775 CALL_FUNCTION(1,0,0); \ 00776 break; \ 00777 case 1: \ 00778 printf("ERROR: invalid direction combination\n"); exit(1); \ 00779 break; \ 00780 case 2: \ 00781 CALL_FUNCTION(1,2,0); \ 00782 break; \ 00783 case 3: \ 00784 CALL_FUNCTION(1,3,0); \ 00785 break; \ 00786 } \ 00787 break; \ 00788 case 2: \ 00789 switch(nu){ \ 00790 case 0: \ 00791 CALL_FUNCTION(2,0,0); \ 00792 break; \ 00793 case 1: \ 00794 CALL_FUNCTION(2,1,0); \ 00795 break; \ 00796 case 2: \ 00797 printf("ERROR: invalid direction combination\n"); exit(1); \ 00798 break; \ 00799 case 3: \ 00800 CALL_FUNCTION(2,3,0); \ 00801 break; \ 00802 } \ 00803 break; \ 00804 case 3: \ 00805 switch(nu){ \ 00806 case 0: \ 00807 CALL_FUNCTION(3,0,0); \ 00808 break; \ 00809 case 1: \ 00810 CALL_FUNCTION(3,1,0); \ 00811 break; \ 00812 case 2: \ 00813 CALL_FUNCTION(3,2,0); \ 00814 break; \ 00815 case 3: \ 00816 printf("ERROR: invalid direction combination\n"); exit(1); \ 00817 break; \ 00818 } \ 00819 break; \ 00820 } \ 00821 } 00822 00823 void siteComputeGenStapleParityKernel(void* staple_even, void* staple_odd, 00824 void* sitelink_even, void* sitelink_odd, 00825 void* fatlink_even, void* fatlink_odd, 00826 int mu, int nu, double mycoeff, 00827 QudaReconstructType recon, QudaPrecision prec, 00828 dim3 halfGridDim, llfat_kernel_param_t kparam, 00829 cudaStream_t* stream) 00830 { 00831 00832 //compute even and odd 00833 00834 #define CALL_FUNCTION(mu, nu) \ 00835 if (prec == QUDA_DOUBLE_PRECISION){ \ 00836 if(recon == QUDA_RECONSTRUCT_NO){ \ 00837 do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \ 00838 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \ 00839 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00840 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00841 (double)mycoeff, kparam); \ 00842 do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \ 00843 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \ 00844 (double2*)sitelink_odd, (double2*)sitelink_even, \ 00845 (double2*)fatlink_odd, (double2*)fatlink_even, \ 00846 (double)mycoeff, kparam); \ 00847 }else{ \ 00848 do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \ 00849 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \ 00850 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00851 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00852 (double)mycoeff, kparam); \ 00853 do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \ 00854 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \ 00855 (double2*)sitelink_odd, (double2*)sitelink_even, \ 00856 (double2*)fatlink_odd, (double2*)fatlink_even, \ 00857 (double)mycoeff, kparam); \ 00858 } \ 00859 }else { \ 00860 if(recon == QUDA_RECONSTRUCT_NO){ \ 00861 do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \ 00862 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \ 00863 (float2*)sitelink_even, (float2*)sitelink_odd, \ 00864 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00865 (float)mycoeff, kparam); \ 00866 do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \ 00867 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \ 00868 (float2*)sitelink_odd, (float2*)sitelink_even, \ 00869 (float2*)fatlink_odd, (float2*)fatlink_even, \ 00870 (float)mycoeff, kparam); \ 00871 }else{ \ 00872 do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \ 00873 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \ 00874 (float4*)sitelink_even, (float4*)sitelink_odd, \ 00875 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00876 (float)mycoeff, kparam); \ 00877 do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \ 00878 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \ 00879 (float4*)sitelink_odd, (float4*)sitelink_even, \ 00880 (float2*)fatlink_odd, (float2*)fatlink_even, \ 00881 (float)mycoeff, kparam); \ 00882 } \ 00883 } 00884 00885 00886 dim3 blockDim(BLOCK_DIM , 1, 1); 00887 ENUMERATE_FUNCS(mu,nu); 00888 00889 #undef CALL_FUNCTION 00890 00891 00892 } 00893 00894 00895 void 00896 computeGenStapleFieldParityKernel(void* staple_even, void* staple_odd, 00897 void* sitelink_even, void* sitelink_odd, 00898 void* fatlink_even, void* fatlink_odd, 00899 void* mulink_even, void* mulink_odd, 00900 int mu, int nu, int save_staple, 00901 double mycoeff, 00902 QudaReconstructType recon, QudaPrecision prec, 00903 dim3 halfGridDim, llfat_kernel_param_t kparam, 00904 cudaStream_t* stream) 00905 { 00906 00907 #define CALL_FUNCTION(mu, nu, save_staple) \ 00908 if (prec == QUDA_DOUBLE_PRECISION){ \ 00909 if(recon == QUDA_RECONSTRUCT_NO){ \ 00910 do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \ 00911 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \ 00912 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00913 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00914 (double2*)mulink_even, (double2*)mulink_odd, \ 00915 (double)mycoeff, kparam); \ 00916 do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \ 00917 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \ 00918 (double2*)sitelink_odd, (double2*)sitelink_even, \ 00919 (double2*)fatlink_odd, (double2*)fatlink_even, \ 00920 (double2*)mulink_odd, (double2*)mulink_even, \ 00921 (double)mycoeff, kparam); \ 00922 }else{ \ 00923 do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \ 00924 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \ 00925 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00926 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00927 (double2*)mulink_even, (double2*)mulink_odd, \ 00928 (double)mycoeff, kparam); \ 00929 do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \ 00930 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \ 00931 (double2*)sitelink_odd, (double2*)sitelink_even, \ 00932 (double2*)fatlink_odd, (double2*)fatlink_even, \ 00933 (double2*)mulink_odd, (double2*)mulink_even, \ 00934 (double)mycoeff, kparam); \ 00935 } \ 00936 }else{ \ 00937 if(recon == QUDA_RECONSTRUCT_NO){ \ 00938 do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \ 00939 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \ 00940 (float2*)sitelink_even, (float2*)sitelink_odd, \ 00941 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00942 (float2*)mulink_even, (float2*)mulink_odd, \ 00943 (float)mycoeff, kparam); \ 00944 do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \ 00945 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \ 00946 (float2*)sitelink_odd, (float2*)sitelink_even, \ 00947 (float2*)fatlink_odd, (float2*)fatlink_even, \ 00948 (float2*)mulink_odd, (float2*)mulink_even, \ 00949 (float)mycoeff, kparam); \ 00950 }else{ \ 00951 do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \ 00952 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \ 00953 (float4*)sitelink_even, (float4*)sitelink_odd, \ 00954 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00955 (float2*)mulink_even, (float2*)mulink_odd, \ 00956 (float)mycoeff, kparam); \ 00957 do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \ 00958 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \ 00959 (float4*)sitelink_odd, (float4*)sitelink_even, \ 00960 (float2*)fatlink_odd, (float2*)fatlink_even, \ 00961 (float2*)mulink_odd, (float2*)mulink_even, \ 00962 (float)mycoeff, kparam); \ 00963 } \ 00964 } 00965 00966 BIND_MU_LINK(); 00967 dim3 blockDim(BLOCK_DIM , 1, 1); 00968 ENUMERATE_FUNCS_SAVE(mu,nu,save_staple); 00969 00970 UNBIND_MU_LINK(); 00971 00972 #undef CALL_FUNCTION 00973 00974 } 00975 00976 00977 void siteComputeGenStapleParityKernel_ex(void* staple_even, void* staple_odd, 00978 void* sitelink_even, void* sitelink_odd, 00979 void* fatlink_even, void* fatlink_odd, 00980 int mu, int nu, double mycoeff, 00981 QudaReconstructType recon, QudaPrecision prec, 00982 llfat_kernel_param_t kparam) 00983 { 00984 00985 //compute even and odd 00986 dim3 blockDim = kparam.blockDim; 00987 dim3 halfGridDim = kparam.halfGridDim; 00988 int sbytes_dp = blockDim.x*5*sizeof(double2); 00989 int sbytes_sp = blockDim.x*5*sizeof(float2); 00990 00991 #define CALL_FUNCTION(mu, nu) \ 00992 if (prec == QUDA_DOUBLE_PRECISION){ \ 00993 if(recon == QUDA_RECONSTRUCT_NO){ \ 00994 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \ 00995 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \ 00996 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00997 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00998 (double)mycoeff, kparam); \ 00999 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \ 01000 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \ 01001 (double2*)sitelink_odd, (double2*)sitelink_even, \ 01002 (double2*)fatlink_odd, (double2*)fatlink_even, \ 01003 (double)mycoeff, kparam); \ 01004 }else{ \ 01005 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \ 01006 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \ 01007 (double2*)sitelink_even, (double2*)sitelink_odd, \ 01008 (double2*)fatlink_even, (double2*)fatlink_odd, \ 01009 (double)mycoeff, kparam); \ 01010 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \ 01011 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \ 01012 (double2*)sitelink_odd, (double2*)sitelink_even, \ 01013 (double2*)fatlink_odd, (double2*)fatlink_even, \ 01014 (double)mycoeff, kparam); \ 01015 } \ 01016 }else { \ 01017 if(recon == QUDA_RECONSTRUCT_NO){ \ 01018 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \ 01019 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \ 01020 (float2*)sitelink_even, (float2*)sitelink_odd, \ 01021 (float2*)fatlink_even, (float2*)fatlink_odd, \ 01022 (float)mycoeff, kparam); \ 01023 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \ 01024 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \ 01025 (float2*)sitelink_odd, (float2*)sitelink_even, \ 01026 (float2*)fatlink_odd, (float2*)fatlink_even, \ 01027 (float)mycoeff, kparam); \ 01028 }else{ \ 01029 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \ 01030 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \ 01031 (float4*)sitelink_even, (float4*)sitelink_odd, \ 01032 (float2*)fatlink_even, (float2*)fatlink_odd, \ 01033 (float)mycoeff, kparam); \ 01034 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \ 01035 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \ 01036 (float4*)sitelink_odd, (float4*)sitelink_even, \ 01037 (float2*)fatlink_odd, (float2*)fatlink_even, \ 01038 (float)mycoeff, kparam); \ 01039 } \ 01040 } 01041 01042 01043 ENUMERATE_FUNCS(mu,nu); 01044 01045 #undef CALL_FUNCTION 01046 01047 01048 } 01049 01050 01051 01052 void 01053 computeGenStapleFieldParityKernel_ex(void* staple_even, void* staple_odd, 01054 void* sitelink_even, void* sitelink_odd, 01055 void* fatlink_even, void* fatlink_odd, 01056 void* mulink_even, void* mulink_odd, 01057 int mu, int nu, int save_staple, 01058 double mycoeff, 01059 QudaReconstructType recon, QudaPrecision prec, 01060 llfat_kernel_param_t kparam) 01061 { 01062 01063 dim3 blockDim = kparam.blockDim; 01064 dim3 halfGridDim= kparam.halfGridDim; 01065 01066 int sbytes_dp = blockDim.x*5*sizeof(double2); 01067 int sbytes_sp = blockDim.x*5*sizeof(float2); 01068 01069 #define CALL_FUNCTION(mu, nu, save_staple) \ 01070 if (prec == QUDA_DOUBLE_PRECISION){ \ 01071 if(recon == QUDA_RECONSTRUCT_NO){ \ 01072 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \ 01073 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \ 01074 (double2*)sitelink_even, (double2*)sitelink_odd, \ 01075 (double2*)fatlink_even, (double2*)fatlink_odd, \ 01076 (double2*)mulink_even, (double2*)mulink_odd, \ 01077 (double)mycoeff, kparam); \ 01078 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \ 01079 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \ 01080 (double2*)sitelink_odd, (double2*)sitelink_even, \ 01081 (double2*)fatlink_odd, (double2*)fatlink_even, \ 01082 (double2*)mulink_odd, (double2*)mulink_even, \ 01083 (double)mycoeff, kparam); \ 01084 }else{ \ 01085 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \ 01086 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \ 01087 (double2*)sitelink_even, (double2*)sitelink_odd, \ 01088 (double2*)fatlink_even, (double2*)fatlink_odd, \ 01089 (double2*)mulink_even, (double2*)mulink_odd, \ 01090 (double)mycoeff, kparam); \ 01091 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \ 01092 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \ 01093 (double2*)sitelink_odd, (double2*)sitelink_even, \ 01094 (double2*)fatlink_odd, (double2*)fatlink_even, \ 01095 (double2*)mulink_odd, (double2*)mulink_even, \ 01096 (double)mycoeff, kparam); \ 01097 } \ 01098 }else{ \ 01099 if(recon == QUDA_RECONSTRUCT_NO){ \ 01100 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \ 01101 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \ 01102 (float2*)sitelink_even, (float2*)sitelink_odd, \ 01103 (float2*)fatlink_even, (float2*)fatlink_odd, \ 01104 (float2*)mulink_even, (float2*)mulink_odd, \ 01105 (float)mycoeff, kparam); \ 01106 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \ 01107 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \ 01108 (float2*)sitelink_odd, (float2*)sitelink_even, \ 01109 (float2*)fatlink_odd, (float2*)fatlink_even, \ 01110 (float2*)mulink_odd, (float2*)mulink_even, \ 01111 (float)mycoeff, kparam); \ 01112 }else{ \ 01113 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \ 01114 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \ 01115 (float4*)sitelink_even, (float4*)sitelink_odd, \ 01116 (float2*)fatlink_even, (float2*)fatlink_odd, \ 01117 (float2*)mulink_even, (float2*)mulink_odd, \ 01118 (float)mycoeff, kparam); \ 01119 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \ 01120 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \ 01121 (float4*)sitelink_odd, (float4*)sitelink_even, \ 01122 (float2*)fatlink_odd, (float2*)fatlink_even, \ 01123 (float2*)mulink_odd, (float2*)mulink_even, \ 01124 (float)mycoeff, kparam); \ 01125 } \ 01126 } 01127 01128 BIND_MU_LINK(); 01129 ENUMERATE_FUNCS_SAVE(mu,nu,save_staple); 01130 01131 UNBIND_MU_LINK(); 01132 01133 #undef CALL_FUNCTION 01134 01135 } 01136 01137 01138 01139 01140 void llfatOneLinkKernel(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink, 01141 cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1, 01142 QudaGaugeParam* param, double* act_path_coeff) 01143 { 01144 QudaPrecision prec = cudaSiteLink.Precision(); 01145 QudaReconstructType recon = cudaSiteLink.Reconstruct(); 01146 01147 BIND_SITE_AND_FAT_LINK; 01148 int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3]; 01149 dim3 gridDim(volume/BLOCK_DIM,1,1); 01150 dim3 blockDim(BLOCK_DIM , 1, 1); 01151 01152 staple_bytes = cudaStaple.Bytes(); 01153 01154 if(prec == QUDA_DOUBLE_PRECISION){ 01155 if(recon == QUDA_RECONSTRUCT_NO){ 01156 llfatOneLink18Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(), 01157 (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(), 01158 (double)act_path_coeff[0], (double)act_path_coeff[5]); 01159 }else{ 01160 01161 llfatOneLink12Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(), 01162 (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(), 01163 (double)act_path_coeff[0], (double)act_path_coeff[5]); 01164 01165 } 01166 }else{ //single precision 01167 if(recon == QUDA_RECONSTRUCT_NO){ 01168 llfatOneLink18Kernel<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(), 01169 (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(), 01170 (float)act_path_coeff[0], (float)act_path_coeff[5]); 01171 }else{ 01172 llfatOneLink12Kernel<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(), 01173 (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(), 01174 (float)act_path_coeff[0], (float)act_path_coeff[5]); 01175 } 01176 } 01177 } 01178 01179 01180 01181 void llfatOneLinkKernel_ex(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink, 01182 cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1, 01183 QudaGaugeParam* param, double* act_path_coeff, 01184 llfat_kernel_param_t kparam) 01185 { 01186 QudaPrecision prec = cudaSiteLink.Precision(); 01187 QudaReconstructType recon = cudaSiteLink.Reconstruct(); 01188 01189 BIND_SITE_AND_FAT_LINK; 01190 01191 dim3 gridDim; 01192 dim3 blockDim = kparam.blockDim; 01193 gridDim.x = 2* kparam.halfGridDim.x; 01194 gridDim.y = 1; 01195 gridDim.z = 1; 01196 staple_bytes = cudaStaple.Bytes(); 01197 01198 if(prec == QUDA_DOUBLE_PRECISION){ 01199 if(recon == QUDA_RECONSTRUCT_NO){ 01200 llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(), 01201 (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(), 01202 (double)act_path_coeff[0], (double)act_path_coeff[5], kparam); 01203 }else{ 01204 01205 llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((double2*)cudaSiteLink.Even_p(), (double2*)cudaSiteLink.Odd_p(), 01206 (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(), 01207 (double)act_path_coeff[0], (double)act_path_coeff[5], kparam); 01208 01209 } 01210 }else{ //single precision 01211 if(recon == QUDA_RECONSTRUCT_NO){ 01212 llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(), 01213 (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(), 01214 (float)act_path_coeff[0], (float)act_path_coeff[5], kparam); 01215 }else{ 01216 llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(), 01217 (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(), 01218 (float)act_path_coeff[0], (float)act_path_coeff[5], kparam); 01219 } 01220 } 01221 } 01222