QUDA v0.4.0
A library for QCD on GPUs
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include <typeinfo>
#include <quda.h>
#include "fat_force_quda.h"
#include <quda_internal.h>
#include <face_quda.h>
#include "misc_helpers.h"
#include <assert.h>
#include <cuda.h>

#define MAX(a,b) ((a)>(b)?(a):(b))
#define ALIGNMENT 4096

#ifdef MPI_COMMS
#include "face_quda.h"
#endif

static double anisotropy_;
extern float fat_link_max_;
static int X_[4];
static QudaTboundary t_boundary_;

#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))

#include <pack_gauge.h>
#include "gauge_field.h"

/********************** Staple code, used by link fattening **************/

#if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE)


template <typename Float>
void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack, Float **cpuGhostFwd, int nFace, int* X) {
  int XY = X[0]*X[1];
  int XYZ = X[0]*X[1]*X[2];
  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
  int faceVolumeCB[4] = {
    X[1]*X[2]*X[3]/2,
    X[0]*X[2]*X[3]/2,
    X[0]*X[1]*X[3]/2,
    X[0]*X[1]*X[2]/2
  };

  // loop variables: a, b, c with a the most significant and c the least significant
  // A, B, C are the maximum values
  // we also need to loop in d, whose values are dims[dir]-3, dims[dir]-2, dims[dir]-1
  int A[4], B[4], C[4];

  // X dimension
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];

  // Y dimension
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];

  // Z dimension
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];

  // T dimension
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];


  // multiplication factors used to compute the index in the original cpu memory
  int f[4][4] = {
    {XYZ, XY, X[0],   1},
    {XYZ, XY,    1, X[0]},
    {XYZ, X[0],  1,   XY},
    { XY, X[0],  1,  XYZ}
  };


  for (int ite = 0; ite < 2; ite++) {
    // ite == 0: back
    // ite == 1: fwd
    Float** dst;
    if (ite == 0) {
      dst = cpuGhostBack;
    } else {
      dst = cpuGhostFwd;
    }

    // collect back ghost staple
    for (int dir = 0; dir < 4; dir++) {
      int d;
      int a, b, c;

      // there is only one staple in the same location
      for (int linkdir = 0; linkdir < 1; linkdir++) {
        Float* even_src = cpuStaple;
        Float* odd_src = cpuStaple + volumeCB*gaugeSiteSize;

        Float* even_dst;
        Float* odd_dst;

        // switch the odd and even ghost buffers when that dimension size is odd;
        // only switch if X[dir] is odd and the grid size in that dimension is greater than 1
        if ((X[dir] % 2 == 0) || (commDim(dir) == 1)) {
          even_dst = dst[dir];
          odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        } else {
          odd_dst = dst[dir];
          even_dst = dst[dir] + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }

        int even_dst_index = 0;
        int odd_dst_index = 0;

        int startd;
        int endd;
        if (ite == 0) { // back
          startd = 0;
          endd = nFace;
        } else { // fwd
          startd = X[dir] - nFace;
          endd = X[dir];
        }
        for (d = startd; d < endd; d++) {
          for (a = 0; a < A[dir]; a++) {
            for (b = 0; b < B[dir]; b++) {
              for (c = 0; c < C[dir]; c++) {
                int index = (a*f[dir][0] + b*f[dir][1] + c*f[dir][2] + d*f[dir][3]) >> 1;
                int oddness = (a+b+c+d) % 2;
                if (oddness == 0) { // even
                  for (int i = 0; i < 18; i++) {
                    even_dst[18*even_dst_index+i] = even_src[18*index + i];
                  }
                  even_dst_index++;
                } else { // odd
                  for (int i = 0; i < 18; i++) {
                    odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
                  }
                  odd_dst_index++;
                }
              } // c
            } // b
          } // a
        } // d
        assert(even_dst_index == nFace*faceVolumeCB[dir]);
        assert(odd_dst_index == nFace*faceVolumeCB[dir]);
      } // linkdir

    } // dir
  } // ite
}


void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void **cpuGhostStapleFwd,
                                int nFace, QudaPrecision precision, int *X) {

  if (precision == QUDA_DOUBLE_PRECISION) {
    packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**)cpuGhostStapleFwd, nFace, X);
  } else {
    packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
  }

}
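/*
  Editorial note: an illustrative sketch, not part of the original file.  The packing
  loops above rely on QUDA's even/odd (checkerboard) site ordering: a site's parity is
  the sum of its coordinates modulo 2, and the checkerboard index is half the linear
  index, which is exactly the ">> 1" and "% 2" arithmetic used in packGhostAllStaples.
  site_to_checkerboard is a hypothetical helper introduced only for illustration.
*/
static void site_to_checkerboard(const int x[4], const int X[4], int *cb_index, int *parity)
{
  int linear = ((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0];  // x[0] fastest, x[3] slowest
  *parity   = (x[0] + x[1] + x[2] + x[3]) % 2;                // 0 = even, 1 = odd
  *cb_index = linear / 2;                                     // index within the even or odd half
}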
void pack_gauge_diag(void* buf, int* X, void** sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
{
  /*
        nu |          |
           |__________|
                mu

   * nu and mu are the directions we are working on.
   * Since we are packing our own data, we need to go to the north-west corner in the diagram,
   * i.e. x[nu] = X[nu]-1, x[mu] = 0, and loop through x[dir1], x[dir2]
   * in the remaining two directions (dir1/dir2); dir2 is the slowest changing dimension when
   * computing the index.
   */


  int mul_factor[4] = {
    1, X[0], X[1]*X[0], X[2]*X[1]*X[0],
  };

  int even_dst_idx = 0;
  int odd_dst_idx = 0;
  char* dst_even = (char*)buf;
  char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*gaugeSiteSize*prec;
  char* src_even = (char*)sitelink[nu];
  char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*gaugeSiteSize*prec;

  if ((X[nu]+X[mu]) % 2 == 1) {
    // oddness will change between us and the diagonal neighbor,
    // so switch it now
    char* tmp = dst_odd;
    dst_odd = dst_even;
    dst_even = tmp;
  }

  for (int i = 0; i < X[dir2]; i++) {
    for (int j = 0; j < X[dir1]; j++) {
      int src_idx = ((X[nu]-1)*mul_factor[nu] + 0*mul_factor[mu] + i*mul_factor[dir2] + j*mul_factor[dir1]) >> 1;
      //int dst_idx = (i*X[dir1]+j) >> 1;
      int oddness = ((X[nu]-1) + 0 + i + j) % 2;

      if (oddness == 0) {
        for (int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++) {
          memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
        }
        even_dst_idx++;
      } else {
        for (int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++) {
          memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
        }
        odd_dst_idx++;
      } // if

    } // for j
  } // for i

  if ((even_dst_idx != X[dir1]*X[dir2]/2) || (odd_dst_idx != X[dir1]*X[dir2]/2)) {
    errorQuda("even_dst_idx/odd_dst_idx (%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
              even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
  }
  return;

}
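/*
  Editorial note: an illustrative sketch, not part of the original file.  pack_gauge_diag
  expects dir1 and dir2 to be the two lattice directions other than nu and mu.  The same
  "first direction not in {nu, mu}" search appears inline further down in do_loadLinkToGPU
  and loadLinkToGPU; remaining_directions is a hypothetical helper that mirrors those loops.
*/
static void remaining_directions(int nu, int mu, int *dir1, int *dir2)
{
  for (*dir1 = 0; *dir1 < 4; (*dir1)++)
    if (*dir1 != nu && *dir1 != mu) break;
  for (*dir2 = 0; *dir2 < 4; (*dir2)++)
    if (*dir2 != nu && *dir2 != mu && *dir2 != *dir1) break;
}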
void
packGhostStaple(int* X, void* even, void* odd, int volume, QudaPrecision prec,
                int stride,
                int dir, int whichway,
                void** fwd_nbr_buf_gpu, void** back_nbr_buf_gpu,
                void** fwd_nbr_buf, void** back_nbr_buf,
                cudaStream_t* stream)
{
  int Vs_x, Vs_y, Vs_z, Vs_t;

  Vs_x = X[1]*X[2]*X[3];
  Vs_y = X[0]*X[2]*X[3];
  Vs_z = X[0]*X[1]*X[3];
  Vs_t = X[0]*X[1]*X[2];
  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};

  if (dir != 3) { // the code would work for dir == 3 as well
    // the even/odd switch (if necessary) is taken care of in collectGhostStaple()
    void* gpu_buf;
    int i = dir;
    if (whichway == QUDA_BACKWARDS) {
      gpu_buf = back_nbr_buf_gpu[i];
      collectGhostStaple(X, even, odd, volume, prec, gpu_buf, i, whichway, stream);
      cudaMemcpyAsync(back_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
    } else { // whichway is QUDA_FORWARDS
      gpu_buf = fwd_nbr_buf_gpu[i];
      collectGhostStaple(X, even, odd, volume, prec, gpu_buf, i, whichway, stream);
      cudaMemcpyAsync(fwd_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
    }
  } else { // special case for dir == 3, since no gather kernel is required
    int Vh = volume;
    int Vsh = X[0]*X[1]*X[2]/2;
    int sizeOfFloatN = 2*prec;
    int len = Vsh*sizeOfFloatN;
    int i;
    if (X[3] % 2 == 0) {
      // back, even
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)back_nbr_buf[3]) + i*len;
        void* src = ((char*)even) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      // back, odd
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)odd) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      // fwd, even
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)fwd_nbr_buf[3]) + i*len;
        void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      // fwd, odd
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
    } else {
      // reverse the even and odd positions
      // back, odd
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)back_nbr_buf[3]) + i*len;
        void* src = ((char*)odd) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      // back, even
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)even) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      // fwd, odd
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)fwd_nbr_buf[3]) + i*len;
        void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      // fwd, even
      for (i = 0; i < 9; i++) {
        void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }

    }
  }

}
void
unpackGhostStaple(int* X, void* _even, void* _odd, int volume, QudaPrecision prec,
                  int stride,
                  int dir, int whichway, void** fwd_nbr_buf, void** back_nbr_buf,
                  cudaStream_t* stream)
{

  int Vsh_x, Vsh_y, Vsh_z, Vsh_t;

  Vsh_x = X[1]*X[2]*X[3]/2;
  Vsh_y = X[0]*X[2]*X[3]/2;
  Vsh_z = X[0]*X[1]*X[3]/2;
  Vsh_t = X[0]*X[1]*X[2]/2;
  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};

  int Vh = volume;
  int sizeOfFloatN = 2*prec;
  int len[4] = {
    Vsh_x*sizeOfFloatN,
    Vsh_y*sizeOfFloatN,
    Vsh_z*sizeOfFloatN,
    Vsh_t*sizeOfFloatN
  };

  int tmpint[4] = {
    0,
    Vsh_x,
    Vsh_x + Vsh_y,
    Vsh_x + Vsh_y + Vsh_z,
  };

  char* even = ((char*)_even) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;
  char* odd = ((char*)_odd) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;

  if (whichway == QUDA_BACKWARDS) {
    // back, even
    for (int i = 0; i < 9; i++) {
      void* dst = even + i*stride*sizeOfFloatN;
      void* src = ((char*)back_nbr_buf[dir]) + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
    // back, odd
    for (int i = 0; i < 9; i++) {
      void* dst = odd + i*stride*sizeOfFloatN;
      void* src = ((char*)back_nbr_buf[dir]) + 9*len[dir] + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
  } else { // QUDA_FORWARDS
    // fwd, even
    for (int i = 0; i < 9; i++) {
      void* dst = even + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
      void* src = ((char*)fwd_nbr_buf[dir]) + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
    // fwd, odd
    for (int i = 0; i < 9; i++) {
      void* dst = odd + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
      void* src = ((char*)fwd_nbr_buf[dir]) + 9*len[dir] + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
  }
}
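/*
  Editorial note: an illustrative sketch, not part of the original file.  packGhostStaple
  gathers boundary staples into host buffers and unpackGhostStaple scatters received data
  into the padded region of the device field; the host-to-host exchange itself is handled
  elsewhere in QUDA (MPI/QMP) and appears here only as a placeholder comment.  For
  simplicity this sketch also reuses the same host buffers for send and receive, which
  the real code does not do.
*/
static void exchange_staple_ghost_sketch(int *X, void *even, void *odd, int volume,
                                         QudaPrecision prec, int stride,
                                         void **fwd_nbr_buf_gpu, void **back_nbr_buf_gpu,
                                         void **fwd_nbr_buf, void **back_nbr_buf,
                                         cudaStream_t *stream)
{
  for (int dir = 0; dir < 4; dir++) {
    // 1) gather the boundary staples from the device into host buffers
    packGhostStaple(X, even, odd, volume, prec, stride, dir, QUDA_BACKWARDS,
                    fwd_nbr_buf_gpu, back_nbr_buf_gpu, fwd_nbr_buf, back_nbr_buf, stream);
    packGhostStaple(X, even, odd, volume, prec, stride, dir, QUDA_FORWARDS,
                    fwd_nbr_buf_gpu, back_nbr_buf_gpu, fwd_nbr_buf, back_nbr_buf, stream);
    cudaStreamSynchronize(*stream);

    // 2) exchange the host buffers with the neighboring processes
    //    (MPI/QMP communication handled elsewhere in QUDA; placeholder only)

    // 3) scatter the received data into the padded ghost region of the device field
    unpackGhostStaple(X, even, odd, volume, prec, stride, dir, QUDA_BACKWARDS,
                      fwd_nbr_buf, back_nbr_buf, stream);
    unpackGhostStaple(X, even, odd, volume, prec, stride, dir, QUDA_FORWARDS,
                      fwd_nbr_buf, back_nbr_buf, stream);
  }
}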
/*
  This is the packing kernel for the multi-dimensional ghost zone in
  the padded region.  It is called by cpuexchangesitelink in
  FaceBuffer (MPI only), which in turn is called by loadLinkToGPU (defined at
  the bottom).

  Not currently included since it will be replaced by Guochun's new
  routine, which uses an enlarged domain instead of a ghost zone.
*/
template <typename Float>
void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack, Float **cpuGhostFwd, int dir, int nFace, int* X) {
  int XY = X[0]*X[1];
  int XYZ = X[0]*X[1]*X[2];

  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
  int faceVolumeCB[4] = {
    X[1]*X[2]*X[3]/2,
    X[0]*X[2]*X[3]/2,
    X[0]*X[1]*X[3]/2,
    X[0]*X[1]*X[2]/2
  };

  // loop variables: a, b, c with a the most significant and c the least significant
  // A, B, C are the maximum values
  // we also need to loop in d, whose values are dims[dir]-3, dims[dir]-2, dims[dir]-1
  int A[4], B[4], C[4];

  // X dimension
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];

  // Y dimension
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];

  // Z dimension
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];

  // T dimension
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];


  // multiplication factors used to compute the index in the original cpu memory
  int f[4][4] = {
    {XYZ, XY, X[0],   1},
    {XYZ, XY,    1, X[0]},
    {XYZ, X[0],  1,   XY},
    { XY, X[0],  1,  XYZ}
  };


  for (int ite = 0; ite < 2; ite++) {
    // ite == 0: back
    // ite == 1: fwd
    Float** dst;
    if (ite == 0) {
      dst = cpuGhostBack;
    } else {
      dst = cpuGhostFwd;
    }

    // collect back ghost gauge field
    //for(int dir = 0; dir < 4; dir++){
    int d;
    int a, b, c;

    // we need to copy all 4 links in the same location
    for (int linkdir = 0; linkdir < 4; linkdir++) {
      Float* even_src = cpuLink[linkdir];
      Float* odd_src = cpuLink[linkdir] + volumeCB*gaugeSiteSize;

      Float* even_dst;
      Float* odd_dst;

      // switch the odd and even ghost cpuLink when that dimension size is odd;
      // only switch if X[dir] is odd and the grid size in that dimension is greater than 1
      if ((X[dir] % 2 == 0) || (commDim(dir) == 1)) {
        even_dst = dst[dir] + 2*linkdir*nFace*faceVolumeCB[dir]*gaugeSiteSize;
        odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
      } else {
        odd_dst = dst[dir] + 2*linkdir*nFace*faceVolumeCB[dir]*gaugeSiteSize;
        even_dst = odd_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
      }

      int even_dst_index = 0;
      int odd_dst_index = 0;

      int startd;
      int endd;
      if (ite == 0) { // back
        startd = 0;
        endd = nFace;
      } else { // fwd
        startd = X[dir] - nFace;
        endd = X[dir];
      }
      for (d = startd; d < endd; d++) {
        for (a = 0; a < A[dir]; a++) {
          for (b = 0; b < B[dir]; b++) {
            for (c = 0; c < C[dir]; c++) {
              int index = (a*f[dir][0] + b*f[dir][1] + c*f[dir][2] + d*f[dir][3]) >> 1;
              int oddness = (a+b+c+d) % 2;
              if (oddness == 0) { // even
                for (int i = 0; i < 18; i++) {
                  even_dst[18*even_dst_index+i] = even_src[18*index + i];
                }
                even_dst_index++;
              } else { // odd
                for (int i = 0; i < 18; i++) {
                  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
                }
                odd_dst_index++;
              }
            } // c
          } // b
        } // a
      } // d
      assert(even_dst_index == nFace*faceVolumeCB[dir]);
      assert(odd_dst_index == nFace*faceVolumeCB[dir]);
    } // linkdir

    //}//dir
  } // ite
}

void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void **cpuGhostFwd,
                          int dir, int nFace, QudaPrecision precision, int *X) {

  if (precision == QUDA_DOUBLE_PRECISION) {
    packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**)cpuGhostFwd, dir, nFace, X);
  } else {
    packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
  }

}

/*
  Copies the host gauge field to the device.
  - no reconstruction support
  - device data is always Float2 ordered
  - device data is a 1-dimensional array (MILC ordered)
  - no support for half precision
*/

static void
do_loadLinkToGPU(int* X, void *even, void *odd, void **cpuGauge, void **ghost_cpuGauge,
                 void **ghost_cpuGauge_diag,
                 QudaReconstructType reconstruct, int bytes, int Vh, int pad,
                 int Vsh_x, int Vsh_y, int Vsh_z, int Vsh_t,
                 QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
{

  int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);

  int i;
  char* tmp_even;
  char* tmp_odd;
  int len = Vh*gaugeSiteSize*prec;

#ifdef MULTI_GPU
  int glen[4] = {
    Vsh_x*gaugeSiteSize*prec,
    Vsh_y*gaugeSiteSize*prec,
    Vsh_z*gaugeSiteSize*prec,
    Vsh_t*gaugeSiteSize*prec
  };

  int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;
#else
  int ghostV = 0;
#endif

  int glen_sum = ghostV*gaugeSiteSize*prec;
  if (cudaMalloc(&tmp_even, 4*(len+glen_sum)) != cudaSuccess) {
    errorQuda("cudaMalloc() failed for tmp_even\n");
  }
  tmp_odd = tmp_even;

  // even links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (i = 0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER

#ifdef MULTI_GPU
    errorQuda("MULTI-GPU for milc format in this function is not supported yet\n");
#endif
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
#endif
  }


  for (i = 0; i < 4; i++) {
#ifdef MULTI_GPU
    // dir: the source direction
    char* dest = tmp_even + i*(len+glen_sum) + len;
    for (int dir = 0; dir < 4; dir++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir]) + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
      cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir]) + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
      cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
#endif
      dest += 2*glen[dir];
    }
    // fill in the diagonal ghost data
    // nu is i; mu iterates over 0..3 with mu != nu
    int nu = i;
    for (int mu = 0; mu < 4; mu++) {
      if (nu == mu) {
        continue;
      }
      int dir1, dir2;
      for (dir1 = 0; dir1 < 4; dir1++) {
        if (dir1 != nu && dir1 != mu) {
          break;
        }
      }
      for (dir2 = 0; dir2 < 4; dir2++) {
        if (dir2 != nu && dir2 != mu && dir2 != dir1) {
          break;
        }
      }
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                      X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                 X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
#endif

    }

#endif
  }

  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);

  // odd links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (i = 0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_odd, (char*)cpuGauge + 4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#endif
  }


  for (i = 0; i < 4; i++) {
#ifdef MULTI_GPU
    char* dest = tmp_odd + i*(len+glen_sum) + len;
    for (int dir = 0; dir < 4; dir++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir]) + glen[dir] + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
      cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + glen[dir] + i*2*glen[dir], glen[dir],
                      cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir]) + glen[dir] + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
      cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + glen[dir] + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);

#endif

      dest += 2*glen[dir];
    }
    // fill in the diagonal ghost data
    // nu is i; mu iterates over 0..3 with mu != nu
    int nu = i;
    for (int mu = 0; mu < 4; mu++) {
      if (nu == mu) {
        continue;
      }
      int dir1, dir2;
      for (dir1 = 0; dir1 < 4; dir1++) {
        if (dir1 != nu && dir1 != mu) {
          break;
        }
      }
      for (dir2 = 0; dir2 < 4; dir2++) {
        if (dir2 != nu && dir2 != mu && dir2 != dir1) {
          break;
        }
      }
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ((char*)ghost_cpuGauge_diag[nu*4+mu]) + X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                      X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ((char*)ghost_cpuGauge_diag[nu*4+mu]) + X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                 X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
#endif
    }


#endif
  }
  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);

  cudaStreamSynchronize(streams[0]);

  cudaFree(tmp_even);

}

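/*
  Editorial note: an illustrative sketch, not part of the original file.  For each of the
  four link directions, do_loadLinkToGPU above stages len + glen_sum bytes: the interior
  links, then a back and a forward ghost face for each dimension, then four fixed-size
  diagonal (corner) slots of Vh_2d_max sites each.  staging_block_bytes is a hypothetical
  helper that just restates that size bookkeeping.
*/
static size_t staging_block_bytes(int Vh, const int Vsh[4], int Vh_2d_max, QudaPrecision prec)
{
  size_t site  = gaugeSiteSize * prec;                                   // 18 reals per link
  size_t bulk  = (size_t)Vh * site;                                      // interior links
  size_t faces = 2 * (size_t)(Vsh[0] + Vsh[1] + Vsh[2] + Vsh[3]) * site; // back+fwd ghost per dim
  size_t diag  = 4 * (size_t)Vh_2d_max * site;                           // diagonal (corner) slots
  return bulk + faces + diag;   // equals len + glen_sum in do_loadLinkToGPU when MULTI_GPU is set
}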
void
loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
{

  if (cudaGauge->Precision() != cpuGauge->Precision()) {
    printf("ERROR: cpu precision and cuda precision must be the same in this function %s\n", __FUNCTION__);
    exit(1);
  }
  QudaPrecision prec = cudaGauge->Precision();

#ifdef MULTI_GPU
  const int* Z = cudaGauge->X();
#endif
  int pad = cudaGauge->Pad();
  int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
  int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
  int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
  int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;



  static void* ghost_cpuGauge[4];
  static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
  static int allocated = 0;
  int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};

  if (allocated == 0) {
    for (int i = 0; i < 4; i++) {

#ifdef GPU_DIRECT
      cudaMallocHost((void**)&ghost_cpuGauge[i], 8*Vs[i]*gaugeSiteSize*prec);
#else
      ghost_cpuGauge[i] = malloc(8*Vs[i]*gaugeSiteSize*prec);
#endif
      if (ghost_cpuGauge[i] == NULL) {
        errorQuda("ERROR: malloc failed for ghost_sitelink[%d]\n", i);
      }
    }


    /*
     *  nu |     |
     *     |_____|
     *        mu
     */

    int ghost_diag_len[16];
    for (int nu = 0; nu < 4; nu++) {
      for (int mu = 0; mu < 4; mu++) {
        if (nu == mu) {
          ghost_cpuGauge_diag[nu*4+mu] = NULL;
        } else {
          // the other directions
          int dir1, dir2;
          for (dir1 = 0; dir1 < 4; dir1++) {
            if (dir1 != nu && dir1 != mu) {
              break;
            }
          }
          for (dir2 = 0; dir2 < 4; dir2++) {
            if (dir2 != nu && dir2 != mu && dir2 != dir1) {
              break;
            }
          }
          //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
#ifdef GPU_DIRECT
          cudaMallocHost((void**)&ghost_cpuGauge_diag[nu*4+mu], Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
#else
          ghost_cpuGauge_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
#endif
          if (ghost_cpuGauge_diag[nu*4+mu] == NULL) {
            errorQuda("malloc failed for ghost_sitelink_diag\n");
          }

          memset(ghost_cpuGauge_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
          ghost_diag_len[nu*4+mu] = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
        }
      }
    }
    allocated = 1;
  }

  int optflag = 1;
  // driver for packalllink
  exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);

#endif

  do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                   ghost_cpuGauge, ghost_cpuGauge_diag,
                   cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
                   Vsh_x, Vsh_y, Vsh_z, Vsh_t,
                   prec, cpuGauge->Order());

#ifdef MULTI_GPU
  if (!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {

    for (int i = 0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaFreeHost(ghost_cpuGauge[i]);
#else
      free(ghost_cpuGauge[i]);
#endif
    }
    for (int i = 0; i < 4; i++) {
      for (int j = 0; j < 4; j++) {
        if (i == j) {
          continue;
        }
#ifdef GPU_DIRECT
        cudaFreeHost(ghost_cpuGauge_diag[i*4+j]);
#else
        free(ghost_cpuGauge_diag[i*4+j]);
#endif
      }
    }

    allocated = 0;
  }
#endif

}

static void
do_loadLinkToGPU_ex(const int* X, void *even, void *odd, void **cpuGauge,
                    QudaReconstructType reconstruct, int bytes, int Vh_ex, int pad,
                    QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
{

  cudaStream_t streams[2];
  for (int i = 0; i < 2; i++) {
    cudaStreamCreate(&streams[i]);
  }


  int i;
  char* tmp_even = NULL;
  char* tmp_odd = NULL;
  int len = Vh_ex*gaugeSiteSize*prec;

  if (cudaMalloc(&tmp_even, 8*len) != cudaSuccess) {
    errorQuda("Error: cudaMalloc failed for tmp_even\n");
  }
  tmp_odd = tmp_even + 4*len;

  // even links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (i = 0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif

    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#endif
  }

  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh_ex, pad, 0, prec, cpu_order, streams[0]);

  // odd links
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    for (i = 0; i < 4; i++) {
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[1]);
#else
      cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#endif
    }
  } else { // QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[1]);
#else
    cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#endif
  }
  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh_ex, pad, 0, prec, cpu_order, streams[1]);


  for (int i = 0; i < 2; i++) {
    cudaStreamSynchronize(streams[i]);
  }

  cudaFree(tmp_even);

  for (int i = 0; i < 2; i++) {
    cudaStreamDestroy(streams[i]);
  }

}




void
loadLinkToGPU_ex(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge)
{

  if (cudaGauge->Precision() != cpuGauge->Precision()) {
    printf("ERROR: cpu precision and cuda precision must be the same in this function %s\n", __FUNCTION__);
    exit(1);
  }
  QudaPrecision prec = cudaGauge->Precision();
  const int* E = cudaGauge->X();
  int pad = cudaGauge->Pad();

  do_loadLinkToGPU_ex(E, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                      cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
                      prec, cpuGauge->Order());


}


template<typename FloatN, typename Float>
static void
do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd,
                  int bytes, int Vh, int stride, QudaPrecision prec)
{

  double* unpackedDataEven;
  double* unpackedDataOdd;
  int datalen = 4*Vh*gaugeSiteSize*sizeof(Float);
  if (cudaMalloc(&unpackedDataEven, datalen) != cudaSuccess) {
    errorQuda("cudaMalloc failed for unpackedDataEven\n");
  }
  unpackedDataOdd = unpackedDataEven;
  // unpack even data kernel
  link_format_gpu_to_cpu((void*)unpackedDataEven, (void*)even, Vh, stride, prec, streams[0]);

#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
#else
  cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
#endif

  // unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpackedDataOdd, (void*)odd, Vh, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost, streams[0]);
#else
  cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
#endif

  cudaFree(unpackedDataEven);
}

void
storeLinkToCPU(cpuGaugeField* cpuGauge, cudaGaugeField *cudaGauge, QudaGaugeParam* param)
{

  QudaPrecision cpu_prec = param->cpu_prec;
  QudaPrecision cuda_prec = param->cuda_prec;

  if (cpu_prec != cuda_prec) {
    printf("ERROR: cpu precision and cuda precision must be the same in this function %s\n", __FUNCTION__);
    exit(1);
  }

  if (cudaGauge->Reconstruct() != QUDA_RECONSTRUCT_NO) {
    printf("ERROR: it makes no sense to get data back to cpu for 8/12 reconstruct, function %s\n", __FUNCTION__);
    exit(1);
  }

  int stride = cudaGauge->VolumeCB() + cudaGauge->Pad();

  if (cuda_prec == QUDA_DOUBLE_PRECISION) {
    do_storeLinkToCPU((double*)cpuGauge->Gauge_p(), (double2*)cudaGauge->Even_p(), (double2*)cudaGauge->Odd_p(),
                      cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
  } else if (cuda_prec == QUDA_SINGLE_PRECISION) {
    do_storeLinkToCPU((float*)cpuGauge->Gauge_p(), (float2*)cudaGauge->Even_p(), (float2*)cudaGauge->Odd_p(),
                      cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
  } else {
    printf("ERROR: half precision not supported in this function %s\n", __FUNCTION__);
    exit(1);
  }
}




#endif
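/*
  Editorial note: an illustrative sketch, not part of the original file.  It shows the
  assumed round trip through the two entry points defined above; construction of the
  gauge-field objects and of QudaGaugeParam is handled elsewhere in QUDA and is not shown.
  copy_links_round_trip_sketch is a hypothetical function introduced only for illustration.
*/
static void copy_links_round_trip_sketch(cpuGaugeField *cpu, cudaGaugeField *gpu, QudaGaugeParam *param)
{
  // host -> device: reorders the CPU links (QDP or MILC order) into the device layout,
  // including the ghost and diagonal regions when MULTI_GPU is enabled
  loadLinkToGPU(gpu, cpu, param);

  // ... fat-link / force kernels would operate on the device field here ...

  // device -> host: only valid for QUDA_RECONSTRUCT_NO and matching precisions
  storeLinkToCPU(cpu, gpu, param);
}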