/*
 * QUDA v0.3.2
 * A library for QCD on GPUs
 */
#include <read_gauge.h>
#include <gauge_quda.h>

#include "gauge_force_quda.h"

/*
 * SU(3) matrix-multiply macros.
 *
 * A 3x3 complex matrix is stored as 18 scalar variables named
 * <prefix><row><col>_{re,im}; the ## operator pastes the matrix prefix
 * (e.g. MULT_SU3_NN_TEST(linka, linkb) operates on linka00_re ... linkb22_im).
 *
 * MULT_SU3_NN_TEST(ma, mb): ma <- ma * mb, computed row by row so the
 * result can overwrite ma in place (fa/fb/fc hold one full row before the
 * row of ma is clobbered).
 */
#define MULT_SU3_NN_TEST(ma, mb) do{					\
	float fa_re,fa_im, fb_re, fb_im, fc_re, fc_im;			\
	/* row 0 of the product */					\
	fa_re =								\
	    ma##00_re * mb##00_re - ma##00_im * mb##00_im +		\
	    ma##01_re * mb##10_re - ma##01_im * mb##10_im +		\
	    ma##02_re * mb##20_re - ma##02_im * mb##20_im;		\
	fa_im =								\
	    ma##00_re * mb##00_im + ma##00_im * mb##00_re +		\
	    ma##01_re * mb##10_im + ma##01_im * mb##10_re +		\
	    ma##02_re * mb##20_im + ma##02_im * mb##20_re;		\
	fb_re =								\
	    ma##00_re * mb##01_re - ma##00_im * mb##01_im +		\
	    ma##01_re * mb##11_re - ma##01_im * mb##11_im +		\
	    ma##02_re * mb##21_re - ma##02_im * mb##21_im;		\
	fb_im =								\
	    ma##00_re * mb##01_im + ma##00_im * mb##01_re +		\
	    ma##01_re * mb##11_im + ma##01_im * mb##11_re +		\
	    ma##02_re * mb##21_im + ma##02_im * mb##21_re;		\
	fc_re =								\
	    ma##00_re * mb##02_re - ma##00_im * mb##02_im +		\
	    ma##01_re * mb##12_re - ma##01_im * mb##12_im +		\
	    ma##02_re * mb##22_re - ma##02_im * mb##22_im;		\
	fc_im =								\
	    ma##00_re * mb##02_im + ma##00_im * mb##02_re +		\
	    ma##01_re * mb##12_im + ma##01_im * mb##12_re +		\
	    ma##02_re * mb##22_im + ma##02_im * mb##22_re;		\
	ma##00_re = fa_re;						\
	ma##00_im = fa_im;						\
	ma##01_re = fb_re;						\
	ma##01_im = fb_im;						\
	ma##02_re = fc_re;						\
	ma##02_im = fc_im;						\
	/* row 1 of the product */					\
	fa_re =								\
	    ma##10_re * mb##00_re - ma##10_im * mb##00_im +		\
	    ma##11_re * mb##10_re - ma##11_im * mb##10_im +		\
	    ma##12_re * mb##20_re - ma##12_im * mb##20_im;		\
	fa_im =								\
	    ma##10_re * mb##00_im + ma##10_im * mb##00_re +		\
	    ma##11_re * mb##10_im + ma##11_im * mb##10_re +		\
	    ma##12_re * mb##20_im + ma##12_im * mb##20_re;		\
	fb_re =								\
	    ma##10_re * mb##01_re - ma##10_im * mb##01_im +		\
	    ma##11_re * mb##11_re - ma##11_im * mb##11_im +		\
	    ma##12_re * mb##21_re - ma##12_im * mb##21_im;		\
	fb_im =								\
	    ma##10_re * mb##01_im + ma##10_im * mb##01_re +		\
	    ma##11_re * mb##11_im + ma##11_im * mb##11_re +		\
	    ma##12_re * mb##21_im + ma##12_im * mb##21_re;		\
	fc_re =								\
	    ma##10_re * mb##02_re - ma##10_im * mb##02_im +		\
	    ma##11_re * mb##12_re - ma##11_im * mb##12_im +		\
	    ma##12_re * mb##22_re - ma##12_im * mb##22_im;		\
	fc_im =								\
	    ma##10_re * mb##02_im + ma##10_im * mb##02_re +		\
	    ma##11_re * mb##12_im + ma##11_im * mb##12_re +		\
	    ma##12_re * mb##22_im + ma##12_im * mb##22_re;		\
	ma##10_re = fa_re;						\
	ma##10_im = fa_im;						\
	ma##11_re = fb_re;						\
	ma##11_im = fb_im;						\
	ma##12_re = fc_re;						\
	ma##12_im = fc_im;						\
	/* row 2 of the product */					\
	fa_re =								\
	    ma##20_re * mb##00_re - ma##20_im * mb##00_im +		\
	    ma##21_re * mb##10_re - ma##21_im * mb##10_im +		\
	    ma##22_re * mb##20_re - ma##22_im * mb##20_im;		\
	fa_im =								\
	    ma##20_re * mb##00_im + ma##20_im * mb##00_re +		\
	    ma##21_re * mb##10_im + ma##21_im * mb##10_re +		\
	    ma##22_re * mb##20_im + ma##22_im * mb##20_re;		\
	fb_re =								\
	    ma##20_re * mb##01_re - ma##20_im * mb##01_im +		\
	    ma##21_re * mb##11_re - ma##21_im * mb##11_im +		\
	    ma##22_re * mb##21_re - ma##22_im * mb##21_im;		\
	fb_im =								\
	    ma##20_re * mb##01_im + ma##20_im * mb##01_re +		\
	    ma##21_re * mb##11_im + ma##21_im * mb##11_re +		\
	    ma##22_re * mb##21_im + ma##22_im * mb##21_re;		\
	fc_re =								\
	    ma##20_re * mb##02_re - ma##20_im * mb##02_im +		\
	    ma##21_re * mb##12_re - ma##21_im * mb##12_im +		\
	    ma##22_re * mb##22_re - ma##22_im * mb##22_im;		\
	fc_im =								\
	    ma##20_re * mb##02_im + ma##20_im * mb##02_re +		\
	    ma##21_re * mb##12_im + ma##21_im * mb##12_re +		\
	    ma##22_re * mb##22_im + ma##22_im * mb##22_re;		\
	ma##20_re = fa_re;						\
	ma##20_im = fa_im;						\
	ma##21_re = fb_re;						\
	ma##21_im = fb_im;						\
	ma##22_re = fc_re;						\
	ma##22_im = fc_im;						\
}while(0)


/*
 * MULT_SU3_NA_TEST(ma, mb): ma <- ma * (adjoint of mb), row by row,
 * overwriting ma in place.
 *
 * The mb##Txy_{re,im} names reference the adjoint (dagger) components of
 * mb — presumably provided by #defines elsewhere in the project
 * (e.g. linkbT00_re mapped to the conjugate transpose of linkb); those
 * definitions are not visible in this file — TODO confirm.
 */
#define MULT_SU3_NA_TEST(ma, mb) do{					\
	float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im;			\
	/* row 0 */							\
	fa_re =								\
	    ma##00_re * mb##T00_re - ma##00_im * mb##T00_im +		\
	    ma##01_re * mb##T10_re - ma##01_im * mb##T10_im +		\
	    ma##02_re * mb##T20_re - ma##02_im * mb##T20_im;		\
	fa_im =								\
	    ma##00_re * mb##T00_im + ma##00_im * mb##T00_re +		\
	    ma##01_re * mb##T10_im + ma##01_im * mb##T10_re +		\
	    ma##02_re * mb##T20_im + ma##02_im * mb##T20_re;		\
	fb_re =								\
	    ma##00_re * mb##T01_re - ma##00_im * mb##T01_im +		\
	    ma##01_re * mb##T11_re - ma##01_im * mb##T11_im +		\
	    ma##02_re * mb##T21_re - ma##02_im * mb##T21_im;		\
	fb_im =								\
	    ma##00_re * mb##T01_im + ma##00_im * mb##T01_re +		\
	    ma##01_re * mb##T11_im + ma##01_im * mb##T11_re +		\
	    ma##02_re * mb##T21_im + ma##02_im * mb##T21_re;		\
	fc_re =								\
	    ma##00_re * mb##T02_re - ma##00_im * mb##T02_im +		\
	    ma##01_re * mb##T12_re - ma##01_im * mb##T12_im +		\
	    ma##02_re * mb##T22_re - ma##02_im * mb##T22_im;		\
	fc_im =								\
	    ma##00_re * mb##T02_im + ma##00_im * mb##T02_re +		\
	    ma##01_re * mb##T12_im + ma##01_im * mb##T12_re +		\
	    ma##02_re * mb##T22_im + ma##02_im * mb##T22_re;		\
	ma##00_re = fa_re;						\
	ma##00_im = fa_im;						\
	ma##01_re = fb_re;						\
	ma##01_im = fb_im;						\
	ma##02_re = fc_re;						\
	ma##02_im = fc_im;						\
	/* row 1 */							\
	fa_re =								\
	    ma##10_re * mb##T00_re - ma##10_im * mb##T00_im +		\
	    ma##11_re * mb##T10_re - ma##11_im * mb##T10_im +		\
	    ma##12_re * mb##T20_re - ma##12_im * mb##T20_im;		\
	fa_im =								\
	    ma##10_re * mb##T00_im + ma##10_im * mb##T00_re +		\
	    ma##11_re * mb##T10_im + ma##11_im * mb##T10_re +		\
	    ma##12_re * mb##T20_im + ma##12_im * mb##T20_re;		\
	fb_re =								\
	    ma##10_re * mb##T01_re - ma##10_im * mb##T01_im +		\
	    ma##11_re * mb##T11_re - ma##11_im * mb##T11_im +		\
	    ma##12_re * mb##T21_re - ma##12_im * mb##T21_im;		\
	fb_im =								\
	    ma##10_re * mb##T01_im + ma##10_im * mb##T01_re +		\
	    ma##11_re * mb##T11_im + ma##11_im * mb##T11_re +		\
	    ma##12_re * mb##T21_im + ma##12_im * mb##T21_re;		\
	fc_re =								\
	    ma##10_re * mb##T02_re - ma##10_im * mb##T02_im +		\
	    ma##11_re * mb##T12_re - ma##11_im * mb##T12_im +		\
	    ma##12_re * mb##T22_re - ma##12_im * mb##T22_im;		\
	fc_im =								\
	    ma##10_re * mb##T02_im + ma##10_im * mb##T02_re +		\
	    ma##11_re * mb##T12_im + ma##11_im * mb##T12_re +		\
	    ma##12_re * mb##T22_im + ma##12_im * mb##T22_re;		\
	ma##10_re = fa_re;						\
	ma##10_im = fa_im;						\
	ma##11_re = fb_re;						\
	ma##11_im = fb_im;						\
	ma##12_re = fc_re;						\
	ma##12_im = fc_im;						\
	/* row 2 */							\
	fa_re =								\
	    ma##20_re * mb##T00_re - ma##20_im * mb##T00_im +		\
	    ma##21_re * mb##T10_re - ma##21_im * mb##T10_im +		\
	    ma##22_re * mb##T20_re - ma##22_im * mb##T20_im;		\
	fa_im =								\
	    ma##20_re * mb##T00_im + ma##20_im * mb##T00_re +		\
	    ma##21_re * mb##T10_im + ma##21_im * mb##T10_re +		\
	    ma##22_re * mb##T20_im + ma##22_im * mb##T20_re;		\
	fb_re =								\
	    ma##20_re * mb##T01_re - ma##20_im * mb##T01_im +		\
	    ma##21_re * mb##T11_re - ma##21_im * mb##T11_im +		\
	    ma##22_re * mb##T21_re - ma##22_im * mb##T21_im;		\
	fb_im =								\
	    ma##20_re * mb##T01_im + ma##20_im * mb##T01_re +		\
	    ma##21_re * mb##T11_im + ma##21_im * mb##T11_re +		\
	    ma##22_re * mb##T21_im + ma##22_im * mb##T21_re;		\
	fc_re =								\
	    ma##20_re * mb##T02_re - ma##20_im * mb##T02_im +		\
	    ma##21_re * mb##T12_re - ma##21_im * mb##T12_im +		\
	    ma##22_re * mb##T22_re - ma##22_im * mb##T22_im;		\
	fc_im =								\
	    ma##20_re * mb##T02_im + ma##20_im * mb##T02_re +		\
	    ma##21_re * mb##T12_im + ma##21_im * mb##T12_re +		\
	    ma##22_re * mb##T22_im + ma##22_im * mb##T22_re;		\
	ma##20_re = fa_re;						\
	ma##20_im = fa_im;						\
	ma##21_re = fb_re;						\
	ma##21_im = fb_im;						\
	ma##22_re = fc_re;						\
	ma##22_im = fc_im;						\
}while(0)



/*
 * MULT_SU3_AN_TEST(ma, mb): mb <- (adjoint of ma) * mb, computed column by
 * column so the result can overwrite mb in place.  As above, the
 * ma##Txy_{re,im} names are the adjoint components of ma, assumed to be
 * provided by #defines elsewhere — TODO confirm.
 */
#define MULT_SU3_AN_TEST(ma, mb) do{					\
	float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im;			\
	/* column 0 */							\
	fa_re =								\
	    ma##T00_re * mb##00_re - ma##T00_im * mb##00_im +		\
	    ma##T01_re * mb##10_re - ma##T01_im * mb##10_im +		\
	    ma##T02_re * mb##20_re - ma##T02_im * mb##20_im;		\
	fa_im =								\
	    ma##T00_re * mb##00_im + ma##T00_im * mb##00_re +		\
	    ma##T01_re * mb##10_im + ma##T01_im * mb##10_re +		\
	    ma##T02_re * mb##20_im + ma##T02_im * mb##20_re;		\
	fb_re =								\
	    ma##T10_re * mb##00_re - ma##T10_im * mb##00_im +		\
	    ma##T11_re * mb##10_re - ma##T11_im * mb##10_im +		\
	    ma##T12_re * mb##20_re - ma##T12_im * mb##20_im;		\
	fb_im =								\
	    ma##T10_re * mb##00_im + ma##T10_im * mb##00_re +		\
	    ma##T11_re * mb##10_im + ma##T11_im * mb##10_re +		\
	    ma##T12_re * mb##20_im + ma##T12_im * mb##20_re;		\
	fc_re =								\
	    ma##T20_re * mb##00_re - ma##T20_im * mb##00_im +		\
	    ma##T21_re * mb##10_re - ma##T21_im * mb##10_im +		\
	    ma##T22_re * mb##20_re - ma##T22_im * mb##20_im;		\
	fc_im =								\
	    ma##T20_re * mb##00_im + ma##T20_im * mb##00_re +		\
	    ma##T21_re * mb##10_im + ma##T21_im * mb##10_re +		\
	    ma##T22_re * mb##20_im + ma##T22_im * mb##20_re;		\
	mb##00_re = fa_re;						\
	mb##00_im = fa_im;						\
	mb##10_re = fb_re;						\
	mb##10_im = fb_im;						\
	mb##20_re = fc_re;						\
	mb##20_im = fc_im;						\
	/* column 1 */							\
	fa_re =								\
	    ma##T00_re * mb##01_re - ma##T00_im * mb##01_im +		\
	    ma##T01_re * mb##11_re - ma##T01_im * mb##11_im +		\
	    ma##T02_re * mb##21_re - ma##T02_im * mb##21_im;		\
	fa_im =								\
	    ma##T00_re * mb##01_im + ma##T00_im * mb##01_re +		\
	    ma##T01_re * mb##11_im + ma##T01_im * mb##11_re +		\
	    ma##T02_re * mb##21_im + ma##T02_im * mb##21_re;		\
	fb_re =								\
	    ma##T10_re * mb##01_re - ma##T10_im * mb##01_im +		\
	    ma##T11_re * mb##11_re - ma##T11_im * mb##11_im +		\
	    ma##T12_re * mb##21_re - ma##T12_im * mb##21_im;		\
	fb_im =								\
	    ma##T10_re * mb##01_im + ma##T10_im * mb##01_re +		\
	    ma##T11_re * mb##11_im + ma##T11_im * mb##11_re +		\
	    ma##T12_re * mb##21_im + ma##T12_im * mb##21_re;		\
	fc_re =								\
	    ma##T20_re * mb##01_re - ma##T20_im * mb##01_im +		\
	    ma##T21_re * mb##11_re - ma##T21_im * mb##11_im +		\
	    ma##T22_re * mb##21_re - ma##T22_im * mb##21_im;		\
	fc_im =								\
	    ma##T20_re * mb##01_im + ma##T20_im * mb##01_re +		\
	    ma##T21_re * mb##11_im + ma##T21_im * mb##11_re +		\
	    ma##T22_re * mb##21_im + ma##T22_im * mb##21_re;		\
	mb##01_re = fa_re;						\
	mb##01_im = fa_im;						\
	mb##11_re = fb_re;						\
	mb##11_im = fb_im;						\
	mb##21_re = fc_re;						\
	mb##21_im = fc_im;						\
	/* column 2 */							\
	fa_re =								\
	    ma##T00_re * mb##02_re - ma##T00_im * mb##02_im +		\
	    ma##T01_re * mb##12_re - ma##T01_im * mb##12_im +		\
	    ma##T02_re * mb##22_re - ma##T02_im * mb##22_im;		\
	fa_im =								\
	    ma##T00_re * mb##02_im + ma##T00_im * mb##02_re +		\
	    ma##T01_re * mb##12_im + ma##T01_im * mb##12_re +		\
	    ma##T02_re * mb##22_im + ma##T02_im * mb##22_re;		\
	fb_re =								\
	    ma##T10_re * mb##02_re - ma##T10_im * mb##02_im +		\
	    ma##T11_re * mb##12_re - ma##T11_im * mb##12_im +		\
	    ma##T12_re * mb##22_re - ma##T12_im * mb##22_im;		\
	fb_im =								\
	    ma##T10_re * mb##02_im + ma##T10_im * mb##02_re +		\
	    ma##T11_re * mb##12_im + ma##T11_im * mb##12_re +		\
	    ma##T12_re * mb##22_im + ma##T12_im * mb##22_re;		\
	fc_re =								\
	    ma##T20_re * mb##02_re - ma##T20_im * mb##02_im +		\
	    ma##T21_re * mb##12_re - ma##T21_im * mb##12_im +		\
	    ma##T22_re * mb##22_re - ma##T22_im * mb##22_im;		\
	fc_im =								\
	    ma##T20_re * mb##02_im + ma##T20_im * mb##02_re +		\
	    ma##T21_re * mb##12_im + ma##T21_im * mb##12_re +		\
	    ma##T22_re * mb##22_im + ma##T22_im * mb##22_re;		\
	mb##02_re = fa_re;						\
	mb##02_im = fa_im;						\
	mb##12_re = fb_re;						\
	mb##12_im = fb_im;						\
	mb##22_re = fc_re;						\
	mb##22_im = fc_im;						\
}while(0)


/* Select textured vs. direct global-memory loads for the site-link matrices.
   When GF_SITE_MATRIX_LOAD_TEX == 1 the even/odd links are fetched through
   the siteLink{0,1}TexSingle texture references bound in gauge_force_cuda(). */
#define GF_SITE_MATRIX_LOAD_TEX 1

#if (GF_SITE_MATRIX_LOAD_TEX == 1)

#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink0TexSingle, dir, idx, var)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink1TexSingle, dir, idx, var)
#else
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkEven, dir, idx, var)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkOdd, dir, idx, var)
#endif


/* Fixed to 12-float reconstruction and single precision (see note below). */
#define LOAD_MATRIX LOAD_MATRIX_12_SINGLE
#define LOAD_ANTI_HERMITIAN LOAD_ANTI_HERMITIAN_SINGLE
#define WRITE_ANTI_HERMITIAN WRITE_ANTI_HERMITIAN_SINGLE
#define RECONSTRUCT_MATRIX RECONSTRUCT_LINK_12


/* Maximum number of steps in any gauge path, set once from the host in
   gauge_force_init_cuda(); used to stride into the flattened path array. */
__constant__ int path_max_length;

/*
 * One-time initialization for the gauge-force kernels: runs the generic
 * kernel init and uploads the maximum path length to constant memory.
 * Subsequent calls are no-ops (guarded by a function-local static flag —
 * not thread-safe, but matches the file's single-threaded host usage).
 *
 * param           - QUDA gauge parameters forwarded to init_kernel_cuda()
 * path_max_length - longest path length across all input paths
 */
void
gauge_force_init_cuda(QudaGaugeParam* param, int path_max_length)
{
    static int gauge_force_init_cuda_flag = 0;
    if (gauge_force_init_cuda_flag){
	return;
    }
    gauge_force_init_cuda_flag=1;

    init_kernel_cuda(param);

    /* NOTE(review): the string-name form of cudaMemcpyToSymbol is deprecated
       and was removed in CUDA 5.0; newer toolkits require passing the symbol
       itself (cudaMemcpyToSymbol(path_max_length, ...)). No error check here. */
    cudaMemcpyToSymbol("path_max_length", &path_max_length, sizeof(int));

}

/* Advance one lattice step in the +mydir direction from full-lattice index
   idx: updates new_mem_idx (with periodic wraparound using the precomputed
   X* geometry constants) and the corresponding new_x<n> coordinate.
   Relies on new_x1..new_x4 and new_mem_idx being in scope at expansion. */
#define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do {		\
	switch(mydir){							\
	case 0:								\
	    new_mem_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1);		\
	    new_x1 = (new_x1==X1m1)?0:new_x1+1;				\
	    break;							\
	case 1:								\
	    new_mem_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1);		\
	    new_x2 = (new_x2==X2m1)?0:new_x2+1;				\
	    break;							\
	case 2:								\
	    new_mem_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1);	\
	    new_x3 = (new_x3==X3m1)?0:new_x3+1;				\
	    break;							\
	case 3:								\
	    new_mem_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
	    new_x4 = (new_x4==X4m1)?0:new_x4+1;				\
	    break;							\
	}								\
    }while(0)

/* Mirror of the macro above: advance one lattice step in the -mydir
   direction with periodic wraparound. */
#define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do {		\
	switch(mydir){							\
	case 0:								\
	    new_mem_idx = ( (new_x1==0)?idx+X1m1:idx-1);		\
	    new_x1 = (new_x1==0)?X1m1:new_x1 - 1;			\
	    break;							\
	case 1:								\
	    new_mem_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1);		\
	    new_x2 = (new_x2==0)?X2m1:new_x2 - 1;			\
	    break;							\
	case 2:								\
	    new_mem_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1);	\
	    new_x3 = (new_x3==0)?X3m1:new_x3 - 1;			\
	    break;							\
	case 3:								\
	    new_mem_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
	    new_x4 = (new_x4==0)?X4m1:new_x4 - 1;			\
	    break;							\
	}								\
    }while(0)

/* Compute the sign used when reconstructing the third row of a
   12-float-compressed link (RECONSTRUCT_LINK_12) at site (i1,i2,i3,i4)
   for direction dir.
   NOTE(review): as written, every branch assigns sign = 1, which is a
   no-op because sign is already initialized to 1 above — the per-direction
   branch structure strongly suggests these were meant to assign -1
   (phase/boundary sign). Verify against RECONSTRUCT_LINK_12's use of sign
   and the link-storage convention before changing. */
#define GF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do {	\
	sign =1;							\
	switch(dir){							\
	case XUP:							\
	    if ( (i4 & 1) == 1){					\
		sign = 1;						\
	    }								\
	    break;							\
	case YUP:							\
	    if ( ((i4+i1) & 1) == 1){					\
		sign = 1;						\
	    }								\
	    break;							\
	case ZUP:							\
	    if ( ((i4+i1+i2) & 1) == 1){				\
		sign = 1;						\
	    }								\
	    break;							\
	case TUP:							\
	    if (i4 == X4m1 ){						\
		sign = 1;						\
	    }								\
	    break;							\
	}								\
    }while (0)



//for now we only consider 12-reconstruct and single precision

/*
 * Gauge-force kernel for one checkerboard parity (oddBit = 0 even sites,
 * oddBit = 1 odd sites).  One thread per site of that parity; launched
 * with a half-volume 1-D grid (halfGridDim x BLOCK_DIM threads) and no
 * bounds check, so the grid must exactly cover volume/2 sites.
 *
 * For each of the num_paths gauge paths it walks the path starting one
 * step in +dir from the site, accumulating the path-ordered product of
 * links (linka) — forward steps multiply by U, backward steps by U^dagger
 * — and adds path_coeff[i] times the result into the staple.  Finally the
 * local link U_dir(x) times the staple is scaled by eb3 and folded into
 * the anti-Hermitian momentum field for this parity.
 *
 * momEven/momOdd - momentum fields (anti-Hermitian, compressed float2)
 * dir            - direction of the force component being computed
 * eb3            - step-size coefficient applied to the force
 * linkEven/linkOdd - site links, 12-float compressed (used directly when
 *                  texture loads are disabled)
 * input_path     - num_paths x path_max_length flattened path directions
 * length         - length of each path
 * path_coeff     - coefficient of each path (single precision)
 * num_paths      - number of paths
 */
template<int oddBit>
__global__ void
parity_compute_gauge_force_kernel(float2* momEven, float2* momOdd,
				  int dir, double eb3,
				  float4* linkEven, float4* linkOdd,
				  int* input_path,
				  int* length, float* path_coeff, int num_paths)
{
    int i,j=0;
    int sid = blockIdx.x * blockDim.x + threadIdx.x;

    /* Decompose the half-volume thread index into checkerboard lattice
       coordinates (x1h,x2,x3,x4); x1odd restores the full x1 coordinate
       for this parity, and X is the full-lattice linear index. */
    int z1 = FAST_INT_DIVIDE(sid, X1h);
    int x1h = sid - z1*X1h;
    int z2 = FAST_INT_DIVIDE(z1, X2);
    int x2 = z1 - z2*X2;
    int x4 = FAST_INT_DIVIDE(z2, X3);
    int x3 = z2 - x4*X3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;
    int X = 2*sid + x1odd;

    int sign = 1;

    /* Momentum field for this thread's parity. */
    float2* mymom=momEven;
    if (oddBit){
	mymom = momOdd;
    }

    /* Register storage named for the LOAD/RECONSTRUCT macros:
       LINKA*/ /*linka* = running path product, LINKB*/ /*linkb* = the link
       loaded this step, STAPLE* = accumulated staple, AH* = momentum. */
    float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
    float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
    float2 STAPLE0, STAPLE1, STAPLE2, STAPLE3,STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
    float2 AH0, AH1, AH2, AH3, AH4;

    int new_mem_idx;


    SET_SU3_MATRIX(staple, 0);
    for(i=0;i < num_paths; i++){
	/* Every path starts one step in +dir from the site, so the first
	   neighbor has opposite parity. */
	int nbr_oddbit = (oddBit^1 );

	int new_x1 =x1;
	int new_x2 =x2;
	int new_x3 =x3;
	int new_x4 =x4;
	COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(dir, X);

	//linka: current matrix
	//linkb: the loaded matrix in this round
	SET_UNIT_SU3_MATRIX(linka);
	int* path = input_path + i*path_max_length;

	/* First path step, unrolled out of the j-loop: a backward step
	   first moves to the neighbor in the opposite direction and then
	   uses the adjoint of the link that points back. */
	int lnkdir;
	int path0 = path[0];
	if (GOES_FORWARDS(path0)){
	    lnkdir=path0;
	}else{
	    lnkdir=OPP_DIR(path0);
	    COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(path0), new_mem_idx);
	    nbr_oddbit = nbr_oddbit^1;

	}

	/* Full-lattice index -> checkerboard index for the load. */
	int nbr_idx = new_mem_idx >>1;
	if (nbr_oddbit){
	    LOAD_ODD_MATRIX( lnkdir, nbr_idx, LINKB);
	}else{
	    LOAD_EVEN_MATRIX( lnkdir, nbr_idx, LINKB);
	}

	GF_COMPUTE_RECONSTRUCT_SIGN(sign, lnkdir, new_x1, new_x2, new_x3, new_x4);
	RECONSTRUCT_MATRIX(lnkdir, nbr_idx, sign, linkb);
	if (GOES_FORWARDS(path0)){
	    COPY_SU3_MATRIX(linkb, linka);
	    COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(path0, new_mem_idx);
	    nbr_oddbit = nbr_oddbit^1;
	}else{
	    SU3_ADJOINT(linkb, linka);
	}

	/* Remaining steps: multiply the running product by U (forward) or
	   U^dagger (backward), advancing the site index as we go. */
	for(j=1; j < length[i]; j++){

	    int lnkdir;
	    int pathj = path[j];
	    if (GOES_FORWARDS(pathj)){
		lnkdir=pathj;
	    }else{
		lnkdir=OPP_DIR(pathj);
		COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(pathj), new_mem_idx);
		nbr_oddbit = nbr_oddbit^1;

	    }

	    int nbr_idx = new_mem_idx >>1;
	    if (nbr_oddbit){
		LOAD_ODD_MATRIX(lnkdir, nbr_idx, LINKB);
	    }else{
		LOAD_EVEN_MATRIX(lnkdir, nbr_idx, LINKB);
	    }

	    GF_COMPUTE_RECONSTRUCT_SIGN(sign, lnkdir, new_x1, new_x2, new_x3, new_x4);
	    RECONSTRUCT_MATRIX(lnkdir, nbr_idx, sign, linkb);
	    if (GOES_FORWARDS(pathj)){
		MULT_SU3_NN_TEST(linka, linkb);

		COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(pathj, new_mem_idx);
		nbr_oddbit = nbr_oddbit^1;


	    }else{
		MULT_SU3_NA_TEST(linka, linkb);
	    }

	}//j
	/* staple += path_coeff[i] * (path product) */
	SCALAR_MULT_ADD_SU3_MATRIX(staple, linka, path_coeff[i], staple);
    }//i


    //update mom
    /* Load and reconstruct the local link U_dir(x) for this site/parity,
       form U * staple, then fold eb3 times its anti-Hermitian projection
       into the stored momentum (load -> uncompress -> combine -> project
       -> write back). */
    if (oddBit){
	LOAD_ODD_MATRIX(dir, sid, LINKA);
    }else{
	LOAD_EVEN_MATRIX(dir, sid, LINKA);
    }
    GF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
    RECONSTRUCT_MATRIX(dir, sid, sign, linka);
    MULT_SU3_NN_TEST(linka, staple);
    LOAD_ANTI_HERMITIAN(mymom, dir, sid, AH);
    UNCOMPRESS_ANTI_HERMITIAN(ah, linkb);
    /* presumably linka <- linkb - eb3 * linka; exact operand order is
       defined by SCALAR_MULT_SUB_SU3_MATRIX elsewhere — TODO confirm */
    SCALAR_MULT_SUB_SU3_MATRIX(linkb, linka, eb3, linka);
    MAKE_ANTI_HERMITIAN(linka, ah);

    WRITE_ANTI_HERMITIAN(mymom, dir, sid, AH);

    return;
}

/*
 * Host driver for the gauge-force computation.
 *
 * Flattens the host path list into a dense num_paths x max_length array,
 * uploads paths / lengths / coefficients to the device, binds the even
 * and odd site-link textures, launches the even- and odd-parity kernels,
 * then unbinds the textures and releases all temporaries.
 *
 * cudaMom      - momentum field updated in place
 * dir          - force direction
 * eb3          - step-size coefficient
 * cudaSiteLink - gauge field (12-float compressed, single precision)
 * param        - lattice geometry and precision parameters
 * input_path   - num_paths jagged arrays of path directions
 * length       - per-path lengths (length[i] <= max_length)
 * path_coeff   - per-path coefficients (host precision per param->cuda_prec)
 * num_paths    - number of paths
 * max_length   - maximum path length (must match gauge_force_init_cuda)
 */
void
gauge_force_cuda(FullMom cudaMom, int dir, double eb3, FullGauge cudaSiteLink,
		 QudaGaugeParam* param, int** input_path,
		 int* length, void* path_coeff, int num_paths, int max_length)
{

    int i, j;
    //input_path
    int bytes = num_paths*max_length* sizeof(int);
    int* input_path_d;
    cudaMalloc((void**)&input_path_d, bytes); checkCudaError();
    cudaMemset(input_path_d, 0, bytes);checkCudaError();

    int* input_path_h = (int*)malloc(bytes);
    if (input_path_h == NULL){
	printf("ERROR: malloc failed for input_path_h in function %s\n", __FUNCTION__);
	exit(1);
    }

    /* Flatten the jagged host paths; unused tail entries stay zero. */
    memset(input_path_h, 0, bytes);
    for(i=0;i < num_paths;i++){
	for(j=0; j < length[i]; j++){
	    input_path_h[i*max_length + j] =input_path[i][j];
	}
    }

    cudaMemcpy(input_path_d, input_path_h, bytes, cudaMemcpyHostToDevice); checkCudaError();

    //length
    int* length_d;
    cudaMalloc((void**)&length_d, num_paths*sizeof(int)); checkCudaError();
    cudaMemcpy(length_d, length, num_paths*sizeof(int), cudaMemcpyHostToDevice); checkCudaError();

    //path_coeff
    /* NOTE(review): gsize honors param->cuda_prec, but the kernel takes the
       coefficients as float* (see the launches below) — a double-precision
       path_coeff buffer would be misread. The file states only single
       precision is supported for now; confirm callers never pass double. */
    int gsize;
    if (param->cuda_prec == QUDA_DOUBLE_PRECISION){
	gsize = sizeof(double);
    }else{
	gsize= sizeof(float);
    }
    void* path_coeff_d;
    cudaMalloc((void**)&path_coeff_d, num_paths*gsize); checkCudaError();
    cudaMemcpy(path_coeff_d, path_coeff, num_paths*gsize, cudaMemcpyHostToDevice); checkCudaError();

    //compute the gauge forces
    int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
    dim3 blockDim(BLOCK_DIM, 1,1);
    /* NOTE(review): gridDim (full volume) is never used; only halfGridDim
       is launched (one kernel per parity). volume is assumed divisible by
       2*BLOCK_DIM — no remainder handling. */
    dim3 gridDim(volume/blockDim.x, 1, 1);
    dim3 halfGridDim(volume/(2*blockDim.x), 1, 1);

    float2* momEven = (float2*)cudaMom.even;
    float2* momOdd = (float2*)cudaMom.odd;
    float4* linkEven = (float4*)cudaSiteLink.even;
    float4* linkOdd = (float4*)cudaSiteLink.odd;

    cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.even, cudaSiteLink.bytes);
    cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes);
    //even
    parity_compute_gauge_force_kernel<0><<<halfGridDim, blockDim>>>(momEven, momOdd,
								    dir, eb3,
								    linkEven, linkOdd,
								    input_path_d, length_d, (float*)path_coeff_d,
								    num_paths);
    //odd
    /* The reason we do not switch the even/odd function input paramemters and the texture binding
     * is that we use the oddbit to decided where to load, in the kernel function
     */
    parity_compute_gauge_force_kernel<1><<<halfGridDim, blockDim>>>(momEven, momOdd,
								    dir, eb3,
								    linkEven, linkOdd,
								    input_path_d, length_d, (float*)path_coeff_d,
								    num_paths);



    cudaUnbindTexture(siteLink0TexSingle);
    cudaUnbindTexture(siteLink1TexSingle);

    checkCudaError();

    cudaFree(input_path_d); checkCudaError();
    free(input_path_h);
    cudaFree(length_d);
    cudaFree(path_coeff_d);



}


#undef LOAD_EVEN_MATRIX
#undef LOAD_ODD_MATRIX
#undef LOAD_MATRIX
#undef LOAD_ANTI_HERMITIAN
#undef WRITE_ANTI_HERMITIAN
#undef RECONSTRUCT_MATRIX
/* (HTML-export footer artifact: "1.7.3" — not source code) */