/*
  QUDA v0.3.2
  A library for QCD on GPUs
*/
#include <stdio.h>

#include <quda_internal.h>
#include <llfat_quda.h>
#include <read_gauge.h>
#include <gauge_quda.h>
#include <cuda_runtime.h>
#include <cuda.h>

/*
  Storage layout used by every macro below: a full (18-real) gauge matrix is
  kept as 9 complex numbers (float2/double2) strided by Vh between consecutive
  entries, with the four directions separated by dir*Vhx9.  The 12-real
  ("12-reconstruct") format stores only the first two rows (float4 stride
  Vhx3, double2 stride Vhx6); the third row is rebuilt later by
  RECONSTRUCT_LINK_12.  Vh, Vhx2..Vhx9 and the lattice-extent constants
  (X, X1m1, X2X1mX1, ...) must already be in scope where these macros
  expand -- presumably provided by quda_internal.h / the including kernel
  file (llfat_core.h); TODO confirm.
*/

/* Write the 9 complex entries FAT0..FAT8 of the fat link for direction
   `dir` at half-lattice index `idx`.  Expands to a single statement. */
#define WRITE_FAT_MATRIX(gauge, dir, idx) do {	\
    gauge[idx + dir*Vhx9       ] = FAT0;	\
    gauge[idx + dir*Vhx9 + Vh  ] = FAT1;	\
    gauge[idx + dir*Vhx9 + Vhx2] = FAT2;	\
    gauge[idx + dir*Vhx9 + Vhx3] = FAT3;	\
    gauge[idx + dir*Vhx9 + Vhx4] = FAT4;	\
    gauge[idx + dir*Vhx9 + Vhx5] = FAT5;	\
    gauge[idx + dir*Vhx9 + Vhx6] = FAT6;	\
    gauge[idx + dir*Vhx9 + Vhx7] = FAT7;	\
    gauge[idx + dir*Vhx9 + Vhx8] = FAT8;} while(0)


/* Write the 9 complex entries STAPLE0..STAPLE8 at half-lattice index `idx`
   (staples carry no direction index).
   NOTE(review): unlike WRITE_FAT_MATRIX this is NOT wrapped in do/while(0),
   so it expands to 9 separate statements; do not use it in an unbraced
   if/else.  Left as-is because call sites in llfat_core.h are not visible
   here and may rely on the trailing semicolon. */
#define WRITE_STAPLE_MATRIX(gauge, idx)	\
  gauge[idx       ] = STAPLE0;		\
  gauge[idx + Vh  ] = STAPLE1;		\
  gauge[idx + Vhx2] = STAPLE2;		\
  gauge[idx + Vhx3] = STAPLE3;		\
  gauge[idx + Vhx4] = STAPLE4;		\
  gauge[idx + Vhx5] = STAPLE5;		\
  gauge[idx + Vhx6] = STAPLE6;		\
  gauge[idx + Vhx7] = STAPLE7;		\
  gauge[idx + Vhx8] = STAPLE8;


/* c = a*b for an SU(3) matrix held in scalar components: scales every
   real/imaginary component b##RC_{re,im} by the scalar `a` into c##RC_{re,im}.
   (Fixed: the original ended with a dangling '\' continuation that would
   silently absorb whatever line followed the expansion point.) */
#define SCALAR_MULT_SU3_MATRIX(a, b, c)	\
  c##00_re = a*b##00_re;		\
  c##00_im = a*b##00_im;		\
  c##01_re = a*b##01_re;		\
  c##01_im = a*b##01_im;		\
  c##02_re = a*b##02_re;		\
  c##02_im = a*b##02_im;		\
  c##10_re = a*b##10_re;		\
  c##10_im = a*b##10_im;		\
  c##11_re = a*b##11_re;		\
  c##11_im = a*b##11_im;		\
  c##12_re = a*b##12_re;		\
  c##12_im = a*b##12_im;		\
  c##20_re = a*b##20_re;		\
  c##20_im = a*b##20_im;		\
  c##21_re = a*b##21_re;		\
  c##21_im = a*b##21_im;		\
  c##22_re = a*b##22_re;		\
  c##22_im = a*b##22_im;


/* Declare float2 var##0..var##8 and load a full 18-real matrix for
   direction `dir` at half-lattice index `idx` from global memory. */
#define LOAD_MATRIX_18_SINGLE(gauge, dir, idx, var)	\
  float2 var##0 = gauge[idx + dir*Vhx9];		\
  float2 var##1 = gauge[idx + dir*Vhx9 + Vh];		\
  float2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];		\
  float2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];		\
  float2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];		\
  float2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];		\
  float2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];		\
  float2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];		\
  float2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];

/* Same as LOAD_MATRIX_18_SINGLE but reads through a texture reference. */
#define LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var)		\
  float2 var##0 = tex1Dfetch(gauge, idx + dir*Vhx9);		\
  float2 var##1 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vh);	\
  float2 var##2 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx2);	\
  float2 var##3 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx3);	\
  float2 var##4 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx4);	\
  float2 var##5 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx5);	\
  float2 var##6 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx6);	\
  float2 var##7 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx7);	\
  float2 var##8 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx8);

/* Double-precision variant of LOAD_MATRIX_18_SINGLE. */
#define LOAD_MATRIX_18_DOUBLE(gauge, dir, idx, var)	\
  double2 var##0 = gauge[idx + dir*Vhx9];		\
  double2 var##1 = gauge[idx + dir*Vhx9 + Vh];		\
  double2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];	\
  double2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];	\
  double2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];	\
  double2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];	\
  double2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];	\
  double2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];	\
  double2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];

/* Double-precision texture variant; fetch_double2 reassembles a double2
   from texture reads (textures cannot return doubles directly). */
#define LOAD_MATRIX_18_DOUBLE_TEX(gauge, dir, idx, var)			\
  double2 var##0 = fetch_double2(gauge, idx + dir*Vhx9);		\
  double2 var##1 = fetch_double2(gauge, idx + dir*Vhx9 + Vh);		\
  double2 var##2 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx2);		\
  double2 var##3 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx3);		\
  double2 var##4 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx4);		\
  double2 var##5 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx5);		\
  double2 var##6 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx6);		\
  double2 var##7 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx7);		\
  double2 var##8 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx8);


/* Compile-time switches selecting texture reads (1) versus plain global
   loads (0) for site links, mu-links and fat links respectively. */
#define SITE_MATRIX_LOAD_TEX 0
#define MULINK_LOAD_TEX 0
#define FATLINK_LOAD_TEX 0


/* 12-reconstruct single-precision load: the first two rows arrive as three
   float4 (stride Vhx3); var##3/var##4 are declared for the reconstructed
   third row. */
#define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var)	\
  float4 var##0 = gauge[idx + dir*Vhx3];			\
  float4 var##1 = gauge[idx + dir*Vhx3 + Vh];			\
  float4 var##2 = gauge[idx + dir*Vhx3 + Vhx2];			\
  float4 var##3, var##4;

/* Texture variant of LOAD_MATRIX_12_SINGLE_DECLARE. */
#define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var)	\
  float4 var##0 = tex1Dfetch(gauge, idx + dir*Vhx3);		\
  float4 var##1 = tex1Dfetch(gauge, idx + dir*Vhx3 + Vh);	\
  float4 var##2 = tex1Dfetch(gauge, idx + dir*Vhx3 + Vhx2);	\
  float4 var##3, var##4;

/* 18-real "declare" variants -- identical loads to the non-DECLARE forms;
   kept as separate names so the per-precision sections below can swap
   them uniformly. */
#define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var)	\
  float2 var##0 = gauge[idx + dir*Vhx9];			\
  float2 var##1 = gauge[idx + dir*Vhx9 + Vh];			\
  float2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];			\
  float2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];			\
  float2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];			\
  float2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];			\
  float2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];			\
  float2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];			\
  float2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];


#define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var)	\
  float2 var##0 = tex1Dfetch(gauge, idx + dir*Vhx9);		\
  float2 var##1 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vh);	\
  float2 var##2 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx2);	\
  float2 var##3 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx3);	\
  float2 var##4 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx4);	\
  float2 var##5 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx5);	\
  float2 var##6 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx6);	\
  float2 var##7 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx7);	\
  float2 var##8 = tex1Dfetch(gauge, idx + dir*Vhx9 + Vhx8);


#define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var)	\
  double2 var##0 = gauge[idx + dir*Vhx9];			\
  double2 var##1 = gauge[idx + dir*Vhx9 + Vh];			\
  double2 var##2 = gauge[idx + dir*Vhx9 + Vhx2];		\
  double2 var##3 = gauge[idx + dir*Vhx9 + Vhx3];		\
  double2 var##4 = gauge[idx + dir*Vhx9 + Vhx4];		\
  double2 var##5 = gauge[idx + dir*Vhx9 + Vhx5];		\
  double2 var##6 = gauge[idx + dir*Vhx9 + Vhx6];		\
  double2 var##7 = gauge[idx + dir*Vhx9 + Vhx7];		\
  double2 var##8 = gauge[idx + dir*Vhx9 + Vhx8];


#define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge, dir, idx, var)		\
  double2 var##0 = fetch_double2(gauge, idx + dir*Vhx9);		\
  double2 var##1 = fetch_double2(gauge, idx + dir*Vhx9 + Vh);		\
  double2 var##2 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx2);		\
  double2 var##3 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx3);		\
  double2 var##4 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx4);		\
  double2 var##5 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx5);		\
  double2 var##6 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx6);		\
  double2 var##7 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx7);		\
  double2 var##8 = fetch_double2(gauge, idx + dir*Vhx9 + Vhx8);


/* 12-reconstruct double-precision load: first two rows as six double2
   (stride Vhx6); var##6..8 declared for the reconstructed third row. */
#define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var)	\
  double2 var##0 = gauge[idx + dir*Vhx6];			\
  double2 var##1 = gauge[idx + dir*Vhx6 + Vh];			\
  double2 var##2 = gauge[idx + dir*Vhx6 + Vhx2];		\
  double2 var##3 = gauge[idx + dir*Vhx6 + Vhx3];		\
  double2 var##4 = gauge[idx + dir*Vhx6 + Vhx4];		\
  double2 var##5 = gauge[idx + dir*Vhx6 + Vhx5];		\
  double2 var##6, var##7, var##8;


#define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge, dir, idx, var)		\
  double2 var##0 = fetch_double2(gauge, idx + dir*Vhx6);		\
  double2 var##1 = fetch_double2(gauge, idx + dir*Vhx6 + Vh);		\
  double2 var##2 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx2);		\
  double2 var##3 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx3);		\
  double2 var##4 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx4);		\
  double2 var##5 = fetch_double2(gauge, idx + dir*Vhx6 + Vhx5);		\
  double2 var##6, var##7, var##8;

/* mc = ma + mb, componentwise, for SU(3) matrices in scalar components. */
#define LLFAT_ADD_SU3_MATRIX(ma, mb, mc)	\
  mc##00_re = ma##00_re + mb##00_re;		\
  mc##00_im = ma##00_im + mb##00_im;		\
  mc##01_re = ma##01_re + mb##01_re;		\
  mc##01_im = ma##01_im + mb##01_im;		\
  mc##02_re = ma##02_re + mb##02_re;		\
  mc##02_im = ma##02_im + mb##02_im;		\
  mc##10_re = ma##10_re + mb##10_re;		\
  mc##10_im = ma##10_im + mb##10_im;		\
  mc##11_re = ma##11_re + mb##11_re;		\
  mc##11_im = ma##11_im + mb##11_im;		\
  mc##12_re = ma##12_re + mb##12_re;		\
  mc##12_im = ma##12_im + mb##12_im;		\
  mc##20_re = ma##20_re + mb##20_re;		\
  mc##20_im = ma##20_im + mb##20_im;		\
  mc##21_re = ma##21_re + mb##21_re;		\
  mc##21_im = ma##21_im + mb##21_im;		\
  mc##22_re = ma##22_re + mb##22_re;		\
  mc##22_im = ma##22_im + mb##22_im;



/* One-time device-side initialization for the fat-link kernels: delegates
   to init_kernel_cuda(param) on the first call only, guarded by a static
   flag.  NOTE(review): the guard is not thread-safe; concurrent first calls
   could both run the initialization -- confirm single-threaded use. */
void
llfat_init_cuda(QudaGaugeParam* param)
{
  static int llfat_init_cuda_flag = 0;
  if (llfat_init_cuda_flag){
    return;
  }

  llfat_init_cuda_flag = 1;

  init_kernel_cuda(param);

}



/* Compute the half-lattice memory index (new_mem_idx) and coordinates
   (new_x1..new_x4) of the site one step BACK in mydir1 then one step
   FORWARD in mydir2, with periodic wrap-around.  The first switch produces
   a full-lattice index relative to X (no >>1 yet); the second applies the
   forward hop and converts to a half-lattice index with >>1.  Expects
   x1..x4, X and the extent constants in scope; writes new_x1..new_x4 and
   new_mem_idx. */
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do {		\
    new_x1 = x1;							\
    new_x2 = x2;							\
    new_x3 = x3;							\
    new_x4 = x4;							\
    switch(mydir1){							\
    case 0:								\
      new_mem_idx = ( (x1==0)?X+X1m1:X-1);				\
      new_x1 = (x1==0)?X1m1:x1 - 1;					\
      break;								\
    case 1:								\
      new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1);				\
      new_x2 = (x2==0)?X2m1:x2 - 1;					\
      break;								\
    case 2:								\
      new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1);			\
      new_x3 = (x3==0)?X3m1:x3 - 1;					\
      break;								\
    case 3:								\
      new_mem_idx = ( (x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1);		\
      new_x4 = (x4==0)?X4m1:x4 - 1;					\
      break;								\
    }									\
    switch(mydir2){							\
    case 0:								\
      new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1;	\
      new_x1 = (x1==X1m1)?0:x1+1;					\
      break;								\
    case 1:								\
      new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
      new_x2 = (x2==X2m1)?0:x2+1;					\
      break;								\
    case 2:								\
      new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
      new_x3 = (x3==X3m1)?0:x3+1;					\
      break;								\
    case 3:								\
      new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
      new_x4 = (x4==X4m1)?0:x4+1;					\
      break;								\
    }									\
  }while(0)



/* Half-lattice index and coordinates of the neighbor one step FORWARD in
   mydir from full-lattice index `idx`, with periodic wrap. */
#define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do {			\
    new_x1 = x1;							\
    new_x2 = x2;							\
    new_x3 = x3;							\
    new_x4 = x4;							\
    switch(mydir){							\
    case 0:								\
      new_mem_idx = ( (x1==X1m1)?idx-X1m1:idx+1)>>1;			\
      new_x1 = (x1==X1m1)?0:x1+1;					\
      break;								\
    case 1:								\
      new_mem_idx = ( (x2==X2m1)?idx-X2X1mX1:idx+X1)>>1;		\
      new_x2 = (x2==X2m1)?0:x2+1;					\
      break;								\
    case 2:								\
      new_mem_idx = ( (x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1)>>1;		\
      new_x3 = (x3==X3m1)?0:x3+1;					\
      break;								\
    case 3:								\
      new_mem_idx = ( (x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1)>>1;	\
      new_x4 = (x4==X4m1)?0:x4+1;					\
      break;								\
    }									\
  }while(0)

/* Half-lattice index and coordinates of the neighbor one step BACK in
   mydir from full-lattice index `idx`, with periodic wrap. */
#define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do {			\
    new_x1 = x1;							\
    new_x2 = x2;							\
    new_x3 = x3;							\
    new_x4 = x4;							\
    switch(mydir){							\
    case 0:								\
      new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1;			\
      new_x1 = (x1==0)?X1m1:x1 - 1;					\
      break;								\
    case 1:								\
      new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1;			\
      new_x2 = (x2==0)?X2m1:x2 - 1;					\
      break;								\
    case 2:								\
      new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1;		\
      new_x3 = (x3==0)?X3m1:x3 - 1;					\
      break;								\
    case 3:								\
      new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1;	\
      new_x4 = (x4==0)?X4m1:x4 - 1;					\
      break;								\
    }									\
  }while(0)



/* Sign applied when reconstructing a 12-real link in direction `dir` at
   coordinates (i1,i2,i3,i4): -1 when i4 is odd (XUP), i4+i1 odd (YUP),
   i4+i1+i2 odd (ZUP), or on the last time slice (TUP); +1 otherwise.
   (These look like staggered-fermion phase / boundary conventions --
   TODO confirm against the convention used when the links were packed.) */
#define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do {	\
    sign =1;							\
    switch(dir){						\
    case XUP:							\
      if ( (i4 & 1) == 1){					\
	sign = -1;						\
      }								\
      break;							\
    case YUP:							\
      if ( ((i4+i1) & 1) == 1){					\
	sign = -1;						\
      }								\
      break;							\
    case ZUP:							\
      if ( ((i4+i1+i2) & 1) == 1){				\
	sign = -1;						\
      }								\
      break;							\
    case TUP:							\
      if (i4 == X4m1 ){						\
	sign = -1;						\
      }								\
      break;							\
    }								\
  }while (0)


/* Token-pasting helpers used by llfat_core.h to form per-precision kernel
   names: LLFAT_KERNEL(foo, 18) -> foo18Kernel. */
#define LLFAT_CONCAT(a,b) a##b##Kernel
#define LLFAT_KERNEL(a,b) LLFAT_CONCAT(a,b)

//precision: 0 is for double, 1 is for single

//single precision, common macro
#define PRECISION 1
#define Float float
#define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE(gauge, dir, idx, FAT)
#if (MULINK_LOAD_TEX == 1)
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(muLink0TexSingle, dir, idx, var)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(muLink1TexSingle, dir, idx, var)
#else
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE(mulink_even, dir, idx, var)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE(mulink_odd, dir, idx, var)
#endif

#if (FATLINK_LOAD_TEX == 1)
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX(fatGauge0TexSingle, dir, idx, FAT)
#define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX(fatGauge1TexSingle, dir, idx, FAT)
#else
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE(fatlink_even, dir, idx, FAT)
#define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE(fatlink_odd, dir, idx, FAT)
#endif

//single precision, 12-reconstruct 00374 #define SITELINK0TEX siteLink0TexSingle 00375 #define SITELINK1TEX siteLink1TexSingle 00376 #if (SITE_MATRIX_LOAD_TEX == 1) 00377 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var) 00378 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var) 00379 #else 00380 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var) 00381 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var) 00382 #endif 00383 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var) 00384 00385 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var) RECONSTRUCT_LINK_12(dir, idx, sign, var); 00386 #define FloatN float4 00387 #define FloatM float2 00388 #define RECONSTRUCT 12 00389 #include "llfat_core.h" 00390 #undef SITELINK0TEX 00391 #undef SITELINK1TEX 00392 #undef LOAD_EVEN_SITE_MATRIX 00393 #undef LOAD_ODD_SITE_MATRIX 00394 #undef LOAD_SITE_MATRIX 00395 #undef RECONSTRUCT_SITE_LINK 00396 #undef FloatN 00397 #undef FloatM 00398 #undef RECONSTRUCT 00399 00400 //single precision, 18-reconstruct 00401 #define SITELINK0TEX siteLink0TexSingle_norecon 00402 #define SITELINK1TEX siteLink1TexSingle_norecon 00403 #if (SITE_MATRIX_LOAD_TEX == 1) 00404 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var) 00405 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var) 00406 #else 00407 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var) 00408 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var) 00409 #endif 00410 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE(sitelink, dir, idx, var) 00411 
#define RECONSTRUCT_SITE_LINK(dir, idx, sign, var) 00412 #define FloatN float2 00413 #define FloatM float2 00414 #define RECONSTRUCT 18 00415 #include "llfat_core.h" 00416 #undef SITELINK0TEX 00417 #undef SITELINK1TEX 00418 #undef LOAD_EVEN_SITE_MATRIX 00419 #undef LOAD_ODD_SITE_MATRIX 00420 #undef LOAD_SITE_MATRIX 00421 #undef RECONSTRUCT_SITE_LINK 00422 #undef FloatN 00423 #undef FloatM 00424 #undef RECONSTRUCT 00425 00426 00427 #undef PRECISION 00428 #undef Float 00429 #undef LOAD_FAT_MATRIX 00430 #undef LOAD_EVEN_MULINK_MATRIX 00431 #undef LOAD_ODD_MULINK_MATRIX 00432 #undef LOAD_EVEN_FAT_MATRIX 00433 #undef LOAD_ODD_FAT_MATRIX 00434 00435 00436 //double precision, common macro 00437 #define PRECISION 0 00438 #define Float double 00439 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE(gauge, dir, idx, FAT) 00440 #if (MULINK_LOAD_TEX == 1) 00441 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(muLink0TexDouble, dir, idx, var) 00442 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(muLink1TexDouble, dir, idx, var) 00443 #else 00444 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_even, dir, idx, var) 00445 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_odd, dir, idx, var) 00446 #endif 00447 00448 #if (FATLINK_LOAD_TEX == 1) 00449 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX(fatGauge0TexDouble, dir, idx, FAT) 00450 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX(fatGauge1TexDouble, dir, idx, FAT) 00451 #else 00452 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE(fatlink_even, dir, idx, FAT) 00453 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE(fatlink_odd, dir, idx, FAT) 00454 #endif 00455 00456 //double precision, 18-reconstruct 00457 #define SITELINK0TEX siteLink0TexDouble 00458 #define SITELINK1TEX siteLink1TexDouble 00459 #if (SITE_MATRIX_LOAD_TEX == 1) 00460 #define 
LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var) 00461 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var) 00462 #else 00463 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var) 00464 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var) 00465 #endif 00466 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE(sitelink, dir, idx, var) 00467 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var) 00468 #define FloatN double2 00469 #define FloatM double2 00470 #define RECONSTRUCT 18 00471 #include "llfat_core.h" 00472 #undef SITELINK0TEX 00473 #undef SITELINK1TEX 00474 #undef LOAD_EVEN_SITE_MATRIX 00475 #undef LOAD_ODD_SITE_MATRIX 00476 #undef LOAD_SITE_MATRIX 00477 #undef RECONSTRUCT_SITE_LINK 00478 #undef FloatN 00479 #undef FloatM 00480 #undef RECONSTRUCT 00481 00482 #if 1 00483 //double precision, 12-reconstruct 00484 #define SITELINK0TEX siteLink0TexDouble 00485 #define SITELINK1TEX siteLink1TexDouble 00486 #if (SITE_MATRIX_LOAD_TEX == 1) 00487 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(SITELINK0TEX, dir, idx, var) 00488 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(SITELINK1TEX, dir, idx, var) 00489 #else 00490 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var) 00491 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var) 00492 #endif 00493 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var) 00494 #define RECONSTRUCT_SITE_LINK(dir, idx, sign, var) RECONSTRUCT_LINK_12(dir, idx, sign, var); 00495 #define FloatN double2 00496 #define FloatM double2 00497 #define RECONSTRUCT 12 00498 #include "llfat_core.h" 00499 #undef SITELINK0TEX 
00500 #undef SITELINK1TEX 00501 #undef LOAD_EVEN_SITE_MATRIX 00502 #undef LOAD_ODD_SITE_MATRIX 00503 #undef LOAD_SITE_MATRIX 00504 #undef RECONSTRUCT_SITE_LINK 00505 #undef FloatN 00506 #undef FloatM 00507 #undef RECONSTRUCT 00508 #endif 00509 00510 #undef PRECISION 00511 #undef Float 00512 #undef LOAD_FAT_MATRIX 00513 #undef LOAD_EVEN_MULINK_MATRIX 00514 #undef LOAD_ODD_MULINK_MATRIX 00515 #undef LOAD_EVEN_FAT_MATRIX 00516 #undef LOAD_ODD_FAT_MATRIX 00517 00518 #undef LLFAT_CONCAT 00519 #undef LLFAT_KERNEL 00520 00521 #define UNBIND_ALL_TEXTURE do{ \ 00522 if(prec ==QUDA_DOUBLE_PRECISION){ \ 00523 cudaUnbindTexture(siteLink0TexDouble); \ 00524 cudaUnbindTexture(siteLink1TexDouble); \ 00525 cudaUnbindTexture(fatGauge0TexDouble); \ 00526 cudaUnbindTexture(fatGauge1TexDouble); \ 00527 cudaUnbindTexture(muLink0TexDouble); \ 00528 cudaUnbindTexture(muLink1TexDouble); \ 00529 }else{ \ 00530 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00531 cudaUnbindTexture(siteLink0TexSingle_norecon); \ 00532 cudaUnbindTexture(siteLink1TexSingle_norecon); \ 00533 }else{ \ 00534 cudaUnbindTexture(siteLink0TexSingle); \ 00535 cudaUnbindTexture(siteLink1TexSingle); \ 00536 } \ 00537 cudaUnbindTexture(fatGauge0TexSingle); \ 00538 cudaUnbindTexture(fatGauge1TexSingle); \ 00539 cudaUnbindTexture(muLink0TexSingle); \ 00540 cudaUnbindTexture(muLink1TexSingle); \ 00541 } \ 00542 }while(0) 00543 00544 #define UNBIND_SITE_AND_FAT_LINK do{ \ 00545 if(prec == QUDA_DOUBLE_PRECISION){ \ 00546 cudaUnbindTexture(siteLink0TexDouble); \ 00547 cudaUnbindTexture(siteLink1TexDouble); \ 00548 cudaUnbindTexture(fatGauge0TexDouble); \ 00549 cudaUnbindTexture(fatGauge1TexDouble); \ 00550 }else { \ 00551 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00552 cudaUnbindTexture(siteLink0TexSingle_norecon); \ 00553 cudaUnbindTexture(siteLink1TexSingle_norecon); \ 00554 }else{ \ 00555 cudaUnbindTexture(siteLink0TexSingle); \ 00556 cudaUnbindTexture(siteLink1TexSingle); \ 00557 } \ 00558 
cudaUnbindTexture(fatGauge0TexSingle); \ 00559 cudaUnbindTexture(fatGauge1TexSingle); \ 00560 } \ 00561 }while(0) 00562 00563 #define BIND_SITE_AND_FAT_LINK do { \ 00564 if(prec == QUDA_DOUBLE_PRECISION){ \ 00565 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \ 00566 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00567 cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.even, cudaFatLink.bytes); \ 00568 cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.odd, cudaFatLink.bytes); \ 00569 }else{ \ 00570 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00571 cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \ 00572 cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00573 }else{ \ 00574 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); \ 00575 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00576 } \ 00577 cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.even, cudaFatLink.bytes); \ 00578 cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.odd, cudaFatLink.bytes); \ 00579 } \ 00580 }while(0) 00581 00582 #define BIND_SITE_AND_FAT_LINK_REVERSE do { \ 00583 if(prec == QUDA_DOUBLE_PRECISION){ \ 00584 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \ 00585 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00586 cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.even, cudaFatLink.bytes); \ 00587 cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.odd, cudaFatLink.bytes); \ 00588 }else{ \ 00589 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \ 00590 cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \ 00591 cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00592 }else{ \ 00593 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, 
cudaSiteLink.bytes); \ 00594 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); \ 00595 } \ 00596 cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.even, cudaFatLink.bytes); \ 00597 cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.odd, cudaFatLink.bytes); \ 00598 } \ 00599 }while(0) 00600 00601 00602 00603 #define ENUMERATE_FUNCS(mu,nu,odd_bit) switch(mu) { \ 00604 case 0: \ 00605 switch(nu){ \ 00606 case 0: \ 00607 printf("ERROR: invalid direction combination\n"); exit(1); \ 00608 break; \ 00609 case 1: \ 00610 if (!odd_bit) { CALL_FUNCTION(0,1,0); } \ 00611 else {CALL_FUNCTION(0,1,1); } \ 00612 break; \ 00613 case 2: \ 00614 if (!odd_bit) { CALL_FUNCTION(0,2,0); } \ 00615 else {CALL_FUNCTION(0,2,1); } \ 00616 break; \ 00617 case 3: \ 00618 if (!odd_bit) { CALL_FUNCTION(0,3,0); } \ 00619 else {CALL_FUNCTION(0,3,1); } \ 00620 break; \ 00621 } \ 00622 break; \ 00623 case 1: \ 00624 switch(nu){ \ 00625 case 0: \ 00626 if (!odd_bit) { CALL_FUNCTION(1,0,0); } \ 00627 else {CALL_FUNCTION(1,0,1); } \ 00628 break; \ 00629 case 1: \ 00630 printf("ERROR: invalid direction combination\n"); exit(1); \ 00631 break; \ 00632 case 2: \ 00633 if (!odd_bit) { CALL_FUNCTION(1,2,0); } \ 00634 else {CALL_FUNCTION(1,2,1); } \ 00635 break; \ 00636 case 3: \ 00637 if (!odd_bit) { CALL_FUNCTION(1,3,0); } \ 00638 else {CALL_FUNCTION(1,3,1); } \ 00639 break; \ 00640 } \ 00641 break; \ 00642 case 2: \ 00643 switch(nu){ \ 00644 case 0: \ 00645 if (!odd_bit) { CALL_FUNCTION(2,0,0); } \ 00646 else {CALL_FUNCTION(2,0,1); } \ 00647 break; \ 00648 case 1: \ 00649 if (!odd_bit) { CALL_FUNCTION(2,1,0); } \ 00650 else {CALL_FUNCTION(2,1,1); } \ 00651 break; \ 00652 case 2: \ 00653 printf("ERROR: invalid direction combination\n"); exit(1); \ 00654 break; \ 00655 case 3: \ 00656 if (!odd_bit) { CALL_FUNCTION(2,3,0); } \ 00657 else {CALL_FUNCTION(2,3,1); } \ 00658 break; \ 00659 } \ 00660 break; \ 00661 case 3: \ 00662 switch(nu){ \ 00663 case 0: \ 00664 if (!odd_bit) { 
CALL_FUNCTION(3,0,0); } \ 00665 else {CALL_FUNCTION(3,0,1); } \ 00666 break; \ 00667 case 1: \ 00668 if (!odd_bit) { CALL_FUNCTION(3,1,0); } \ 00669 else {CALL_FUNCTION(3,1,1); } \ 00670 break; \ 00671 case 2: \ 00672 if (!odd_bit) { CALL_FUNCTION(3,2,0); } \ 00673 else {CALL_FUNCTION(3,2,1); } \ 00674 break; \ 00675 case 3: \ 00676 printf("ERROR: invalid direction combination\n"); exit(1); \ 00677 break; \ 00678 } \ 00679 break; \ 00680 } 00681 00682 void siteComputeGenStapleParityKernel(void* staple_even, void* staple_odd, 00683 void* sitelink_even, void* sitelink_odd, 00684 void* fatlink_even, void* fatlink_odd, 00685 int mu, int nu,int odd_bit, 00686 double mycoeff, 00687 dim3 halfGridDim, dim3 blockDim, 00688 QudaReconstructType recon, QudaPrecision prec) 00689 { 00690 00691 #define CALL_FUNCTION(mu, nu, odd_bit) \ 00692 if (prec == QUDA_DOUBLE_PRECISION){ \ 00693 if(recon == QUDA_RECONSTRUCT_NO){ \ 00694 do_siteComputeGenStapleParity18Kernel<mu,nu, odd_bit> \ 00695 <<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \ 00696 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00697 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00698 (double)mycoeff); \ 00699 }else{ \ 00700 do_siteComputeGenStapleParity12Kernel<mu,nu, odd_bit> \ 00701 <<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \ 00702 (double2*)sitelink_even, (double2*)sitelink_odd, \ 00703 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00704 (double)mycoeff); \ 00705 } \ 00706 }else { \ 00707 if(recon == QUDA_RECONSTRUCT_NO){ \ 00708 do_siteComputeGenStapleParity18Kernel<mu,nu, odd_bit> \ 00709 <<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \ 00710 (float2*)sitelink_even, (float2*)sitelink_odd, \ 00711 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00712 (float)mycoeff); \ 00713 }else{ \ 00714 do_siteComputeGenStapleParity12Kernel<mu,nu, odd_bit> \ 00715 <<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \ 00716 
(float4*)sitelink_even, (float4*)sitelink_odd, \ 00717 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00718 (float)mycoeff); \ 00719 } \ 00720 } 00721 00722 ENUMERATE_FUNCS(mu,nu,odd_bit); 00723 00724 #undef CALL_FUNCTION 00725 00726 00727 } 00728 00729 void 00730 computeGenStapleFieldParityKernel(void* sitelink_even, void* sitelink_odd, 00731 void* fatlink_even, void* fatlink_odd, 00732 void* mulink_even, void* mulink_odd, 00733 int mu, int nu, int odd_bit, 00734 double mycoeff, 00735 dim3 halfGridDim, dim3 blockDim, 00736 QudaReconstructType recon, QudaPrecision prec) 00737 { 00738 #define CALL_FUNCTION(mu, nu, odd_bit) \ 00739 if (prec == QUDA_DOUBLE_PRECISION){ \ 00740 if(recon == QUDA_RECONSTRUCT_NO){ \ 00741 do_computeGenStapleFieldParity18Kernel<mu,nu, odd_bit> \ 00742 <<<halfGridDim, blockDim>>>((double2*)sitelink_even, (double2*)sitelink_odd, \ 00743 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00744 (double2*)mulink_even, (double2*)mulink_odd, \ 00745 (double)mycoeff); \ 00746 }else{ \ 00747 do_computeGenStapleFieldParity12Kernel<mu,nu, odd_bit> \ 00748 <<<halfGridDim, blockDim>>>((double2*)sitelink_even, (double2*)sitelink_odd, \ 00749 (double2*)fatlink_even, (double2*)fatlink_odd, \ 00750 (double2*)mulink_even, (double2*)mulink_odd, \ 00751 (double)mycoeff); \ 00752 } \ 00753 }else{ \ 00754 if(recon == QUDA_RECONSTRUCT_NO){ \ 00755 do_computeGenStapleFieldParity18Kernel<mu,nu, odd_bit> \ 00756 <<<halfGridDim, blockDim>>>((float2*)sitelink_even, (float2*)sitelink_odd, \ 00757 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00758 (float2*)mulink_even, (float2*)mulink_odd, \ 00759 (float)mycoeff); \ 00760 }else{ \ 00761 do_computeGenStapleFieldParity12Kernel<mu,nu, odd_bit> \ 00762 <<<halfGridDim, blockDim>>>((float4*)sitelink_even, (float4*)sitelink_odd, \ 00763 (float2*)fatlink_even, (float2*)fatlink_odd, \ 00764 (float2*)mulink_even, (float2*)mulink_odd, \ 00765 (float)mycoeff); \ 00766 } \ 00767 } 00768 00769 ENUMERATE_FUNCS(mu,nu,odd_bit); 00770 
#undef CALL_FUNCTION

}

/*
 * Host-side dispatcher for the "save" variant of the generalized-staple
 * kernels.  The kernels launched here compute a staple term, accumulate it
 * into the fat link, and additionally write the staple out to
 * staple_even/staple_odd so a later stage can reuse it as its mulink input.
 *
 *   staple_even/odd   - output: the computed staple, saved for reuse
 *   sitelink_even/odd - input thin gauge links
 *   fatlink_even/odd  - fat links the staple term is accumulated into
 *   mulink_even/odd   - input staple produced by the previous level
 *   mu, nu            - staple directions; runtime values are mapped onto
 *                       the matching <mu,nu,odd_bit> template instantiation
 *                       by ENUMERATE_FUNCS (defined elsewhere in this file)
 *   odd_bit           - parity being updated (0 = even sites, 1 = odd sites)
 *   mycoeff           - path coefficient multiplying this staple term
 *   halfGridDim       - grid for a half-volume (single-parity) launch
 *   blockDim          - thread-block dimensions
 *   recon, prec       - select 18- vs 12-reconstruct kernel and double vs
 *                       single precision instantiation
 *
 * NOTE(review): in single precision only the 12-reconstruct kernel reads the
 * site links as float4 while all other fields stay float2; this mirrors the
 * llfatOneLink12Kernel launch in llfat_cuda below -- presumably compressed
 * link storage, confirm against the float4 read_gauge path.
 */
void
computeGenStapleFieldSaveParityKernel(void* staple_even, void* staple_odd,
				      void* sitelink_even, void* sitelink_odd,
				      void* fatlink_even, void* fatlink_odd,
				      void* mulink_even, void* mulink_odd,
				      int mu, int nu, int odd_bit,
				      double mycoeff,
				      dim3 halfGridDim, dim3 blockDim,
				      QudaReconstructType recon, QudaPrecision prec)
{
  /* CALL_FUNCTION is expanded by ENUMERATE_FUNCS once per (mu,nu,odd_bit)
     combination, turning the runtime arguments into compile-time template
     parameters.  No comments inside the macro body: line splicing happens
     before comment removal, so a // comment would swallow the backslash. */
#define CALL_FUNCTION(mu, nu, odd_bit)					\
  if (prec == QUDA_DOUBLE_PRECISION){					\
    if(recon == QUDA_RECONSTRUCT_NO){					\
      do_computeGenStapleFieldSaveParity18Kernel<mu,nu, odd_bit>	\
	<<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \
				    (double2*)sitelink_even, (double2*)sitelink_odd, \
				    (double2*)fatlink_even, (double2*)fatlink_odd, \
				    (double2*)mulink_even, (double2*)mulink_odd, \
				    (double)mycoeff);			\
    }else{								\
      do_computeGenStapleFieldSaveParity12Kernel<mu,nu, odd_bit>	\
	<<<halfGridDim, blockDim>>>((double2*)staple_even, (double2*)staple_odd, \
				    (double2*)sitelink_even, (double2*)sitelink_odd, \
				    (double2*)fatlink_even, (double2*)fatlink_odd, \
				    (double2*)mulink_even, (double2*)mulink_odd, \
				    (double)mycoeff);			\
    }									\
  }else{								\
    if(recon == QUDA_RECONSTRUCT_NO){					\
      do_computeGenStapleFieldSaveParity18Kernel<mu,nu, odd_bit>	\
	<<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \
				    (float2*)sitelink_even, (float2*)sitelink_odd, \
				    (float2*)fatlink_even, (float2*)fatlink_odd, \
				    (float2*)mulink_even, (float2*)mulink_odd, \
				    (float)mycoeff);			\
    }else{								\
      do_computeGenStapleFieldSaveParity12Kernel<mu,nu, odd_bit>	\
	<<<halfGridDim, blockDim>>>((float2*)staple_even, (float2*)staple_odd, \
				    (float4*)sitelink_even, (float4*)sitelink_odd, \
				    (float2*)fatlink_even, (float2*)fatlink_odd, \
				    (float2*)mulink_even, (float2*)mulink_odd, \
				    (float)mycoeff);			\
    }									\
  }

  ENUMERATE_FUNCS(mu,nu,odd_bit);

#undef CALL_FUNCTION

}

/*
 * Top-level fat-link driver: builds cudaFatLink from cudaSiteLink on the GPU
 * by launching the one-link kernel followed by the staple kernels for every
 * combination of distinct directions.  cudaStaple and cudaStaple1 are
 * scratch fields carrying intermediate staples between levels.
 *
 * Coefficient usage visible below (indices into act_path_coeff):
 *   [0], [5] - one-link kernel
 *   [2]      - first-level staple built directly from site links
 *   [5]      - accumulation of that staple into the fat link
 *   [3]      - second-level staple (computed AND saved into cudaStaple1)
 *   [4]      - third-level staple built from the saved cudaStaple1 field
 * NOTE(review): these presumably correspond to the asqtad fat-link path
 * coefficients (1-link, 3-, 5-, 7-staple, Lepage) -- confirm against the
 * caller that fills act_path_coeff.
 *
 * Every stage runs twice: an even-parity pass, then an odd-parity pass with
 * all even/odd buffer arguments swapped and the textures bound in reverse
 * (BIND_SITE_AND_FAT_LINK_REVERSE; muLink0/muLink1 exchanged).
 *
 * NOTE(review): gridDim = volume/BLOCK_DIM assumes the lattice volume is a
 * multiple of BLOCK_DIM; no remainder handling is visible here.
 * NOTE(review): cudaBindTexture return codes are not checked, and the
 * "*TexSingle" mu-link textures are bound in the double-precision path as
 * well -- verify this is intended.
 *
 * The fatLink/siteLink host pointers are not referenced in this visible
 * body; all work uses the device-resident FullGauge/FullStaple fields.
 */
void
llfat_cuda(void* fatLink, void* siteLink,
	   FullGauge cudaFatLink, FullGauge cudaSiteLink,
	   FullStaple cudaStaple, FullStaple cudaStaple1,
	   QudaGaugeParam* param, double* act_path_coeff)
{

  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
  dim3 gridDim(volume/BLOCK_DIM,1,1);            // full-volume launches (one-link)
  dim3 halfGridDim(volume/(2*BLOCK_DIM),1,1);    // single-parity launches
  dim3 blockDim(BLOCK_DIM , 1, 1);

  QudaPrecision prec = cudaSiteLink.precision;
  QudaReconstructType recon = cudaSiteLink.reconstruct;

  /* One-link contribution, dispatched on precision and reconstruction. */
  BIND_SITE_AND_FAT_LINK;
  if(prec == QUDA_DOUBLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      llfatOneLink18Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.even, (double2*)cudaSiteLink.odd,
						  (double2*)cudaFatLink.even, (double2*)cudaFatLink.odd,
						  (double)act_path_coeff[0], (double)act_path_coeff[5]);
    }else{

      llfatOneLink12Kernel<<<gridDim, blockDim>>>((double2*)cudaSiteLink.even, (double2*)cudaSiteLink.odd,
						  (double2*)cudaFatLink.even, (double2*)cudaFatLink.odd,
						  (double)act_path_coeff[0], (double)act_path_coeff[5]);

    }
  }else{ //single precision
    if(recon == QUDA_RECONSTRUCT_NO){
      llfatOneLink18Kernel<<<gridDim, blockDim>>>((float2*)cudaSiteLink.even, (float2*)cudaSiteLink.odd,
						  (float2*)cudaFatLink.even, (float2*)cudaFatLink.odd,
						  (float)act_path_coeff[0], (float)act_path_coeff[5]);
    }else{
      /* 12-reconstruct site links are read as float4 in single precision. */
      llfatOneLink12Kernel<<<gridDim, blockDim>>>((float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd,
						  (float2*)cudaFatLink.even, (float2*)cudaFatLink.odd,
						  (float)act_path_coeff[0], (float)act_path_coeff[5]);
    }
  }
  checkCudaError();
  UNBIND_SITE_AND_FAT_LINK;

  for(int dir = 0;dir < 4; dir++){
    for(int nu = 0; nu < 4; nu++){
      if (nu != dir){

	/* First-level staple in (dir,nu), built directly from the site
	   links, written into cudaStaple, accumulated with coeff[2]. */
	//even
	BIND_SITE_AND_FAT_LINK;
	siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					 (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					 (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
					 dir, nu,0,
					 act_path_coeff[2],
					 halfGridDim, blockDim, recon, prec);
	checkCudaError();
	UNBIND_SITE_AND_FAT_LINK;

	//odd
	BIND_SITE_AND_FAT_LINK_REVERSE;
	siteComputeGenStapleParityKernel((void*)cudaStaple.odd, (void*)cudaStaple.even,
					 (void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
					 (void*)cudaFatLink.odd, (void*)cudaFatLink.even,
					 dir, nu,1,
					 act_path_coeff[2],
					 halfGridDim, blockDim, recon, prec);
	checkCudaError();
	UNBIND_SITE_AND_FAT_LINK;


	/* Accumulate the staple just computed (read back through the
	   muLink textures) into the fat link with coeff[5]. */
	//even
	BIND_SITE_AND_FAT_LINK;
	cudaBindTexture(0, muLink0TexSingle, cudaStaple.even, cudaStaple.bytes);
	cudaBindTexture(0, muLink1TexSingle, cudaStaple.odd, cudaStaple.bytes);
	computeGenStapleFieldParityKernel((void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					  (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
					  (void*)cudaStaple.even, (void*)cudaStaple.odd,
					  dir, nu,0,
					  act_path_coeff[5],
					  halfGridDim, blockDim, recon, prec);
	checkCudaError();
	UNBIND_ALL_TEXTURE;

	//odd -- note the muLink textures are bound in swapped order
	BIND_SITE_AND_FAT_LINK_REVERSE;
	cudaBindTexture(0, muLink1TexSingle, cudaStaple.even, cudaStaple.bytes);
	cudaBindTexture(0, muLink0TexSingle, cudaStaple.odd, cudaStaple.bytes);
	computeGenStapleFieldParityKernel((void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
					  (void*)cudaFatLink.odd, (void*)cudaFatLink.even,
					  (void*)cudaStaple.odd, (void*)cudaStaple.even,
					  dir, nu,1,
					  act_path_coeff[5],
					  halfGridDim, blockDim, recon, prec);
	checkCudaError();
	UNBIND_ALL_TEXTURE;


	for(int rho = 0; rho < 4; rho++){
	  if (rho != dir && rho != nu){

	    /* Second-level staple in (dir,rho) using cudaStaple as input;
	       accumulated with coeff[3] AND saved into cudaStaple1 for the
	       next level. */
	    //even
	    BIND_SITE_AND_FAT_LINK;
	    cudaBindTexture(0, muLink0TexSingle, cudaStaple.even, cudaStaple.bytes);
	    cudaBindTexture(0, muLink1TexSingle, cudaStaple.odd, cudaStaple.bytes);
	    computeGenStapleFieldSaveParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						  (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						  (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
						  (void*)cudaStaple.even, (void*)cudaStaple.odd,
						  dir, rho,0,
						  act_path_coeff[3],
						  halfGridDim, blockDim, recon, prec);
	    checkCudaError();
	    UNBIND_ALL_TEXTURE;

	    //odd
	    BIND_SITE_AND_FAT_LINK_REVERSE;
	    cudaBindTexture(0, muLink1TexSingle, cudaStaple.even, cudaStaple.bytes);
	    cudaBindTexture(0, muLink0TexSingle, cudaStaple.odd, cudaStaple.bytes);
	    computeGenStapleFieldSaveParityKernel((void*)cudaStaple1.odd, (void*)cudaStaple1.even,
						  (void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
						  (void*)cudaFatLink.odd, (void*)cudaFatLink.even,
						  (void*)cudaStaple.odd, (void*)cudaStaple.even,
						  dir, rho,1,
						  act_path_coeff[3],
						  halfGridDim, blockDim, recon, prec);
	    checkCudaError();
	    UNBIND_ALL_TEXTURE;


	    for(int sig = 0; sig < 4; sig++){
	      if (sig != dir && sig != nu && sig != rho){

		/* Third-level staple in (dir,sig) from the saved
		   cudaStaple1 field, accumulated with coeff[4]. */
		//even
		BIND_SITE_AND_FAT_LINK;
		cudaBindTexture(0, muLink0TexSingle, cudaStaple1.even, cudaStaple1.bytes);
		cudaBindTexture(0, muLink1TexSingle, cudaStaple1.odd, cudaStaple1.bytes);
		computeGenStapleFieldParityKernel((void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						  (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
						  (void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						  dir, sig, 0,
						  act_path_coeff[4],
						  halfGridDim, blockDim, recon, prec);
		checkCudaError();
		UNBIND_ALL_TEXTURE;


		//odd
		BIND_SITE_AND_FAT_LINK_REVERSE;
		cudaBindTexture(0, muLink1TexSingle, cudaStaple1.even, cudaStaple1.bytes);
		cudaBindTexture(0, muLink0TexSingle, cudaStaple1.odd, cudaStaple1.bytes);
		computeGenStapleFieldParityKernel((void*)cudaSiteLink.odd, (void*)cudaSiteLink.even,
						  (void*)cudaFatLink.odd, (void*)cudaFatLink.even,
						  (void*)cudaStaple1.odd, (void*)cudaStaple1.even,
						  dir, sig, 1,
						  act_path_coeff[4],
						  halfGridDim, blockDim, recon, prec);
		checkCudaError();
		UNBIND_ALL_TEXTURE;

	      }
	    }//sig
	  }
	}//rho



      }
    }//nu
  }//dir

  checkCudaError();
  return;
}
1.7.3