QUDA v0.4.0
A library for QCD on GPUs
#include <quda_internal.h>
#include <face_quda.h>
#include <comm_quda.h>
#include <cstdio>
#include <cstdlib>
#include <quda.h>
#include <string.h>
#include <sys/time.h>
#include <mpi.h>
#include <cuda.h>

#include <fat_force_quda.h>

using namespace std;

#ifdef DSLASH_PROFILING
void printDslashProfile();
#define CUDA_EVENT_RECORD(a,b) cudaEventRecord(a,b)
#else
#define CUDA_EVENT_RECORD(a,b)
#define DSLASH_TIME_PROFILE()
#endif

cudaStream_t *stream;

bool globalReduce = true;

FaceBuffer::FaceBuffer(const int *X, const int nDim, const int Ninternal,
                       const int nFace, const QudaPrecision precision) :
  Ninternal(Ninternal), precision(precision), nDim(nDim), nFace(nFace)
{
  setupDims(X);

  // set both stream indices to 0 for no overlap of QMP and cudaMemcpyAsync;
  // sendBackStrmIdx = 0 and sendFwdStrmIdx = 1 gives overlap
  sendBackStrmIdx = 0;
  sendFwdStrmIdx = 1;
  recFwdStrmIdx = sendBackStrmIdx;
  recBackStrmIdx = sendFwdStrmIdx;

  for (int i=0; i < QUDA_MAX_DIM; i++) {
    recv_request1[i] = malloc(sizeof(MPI_Request));
    recv_request2[i] = malloc(sizeof(MPI_Request));
    send_request1[i] = malloc(sizeof(MPI_Request));
    send_request2[i] = malloc(sizeof(MPI_Request));
    if (recv_request1[i] == 0 || recv_request2[i] == 0 ||
        send_request1[i] == 0 || send_request2[i] == 0) {
      errorQuda("malloc failed for recv/send request handles");
    }
  }

  for (int dir=0; dir < 4; dir++) {
    nbytes[dir] = nFace*faceVolumeCB[dir]*Ninternal*precision;
    if (precision == QUDA_HALF_PRECISION) nbytes[dir] += nFace*faceVolumeCB[dir]*sizeof(float);

    cudaMallocHost((void**)&fwd_nbr_spinor_sendbuf[dir], nbytes[dir]);
    cudaMallocHost((void**)&back_nbr_spinor_sendbuf[dir], nbytes[dir]);

    if (fwd_nbr_spinor_sendbuf[dir] == NULL || back_nbr_spinor_sendbuf[dir] == NULL)
      errorQuda("dir = %d, malloc failed for fwd_nbr_spinor_sendbuf/back_nbr_spinor_sendbuf", dir);

    cudaMallocHost((void**)&fwd_nbr_spinor[dir], nbytes[dir]);
    cudaMallocHost((void**)&back_nbr_spinor[dir], nbytes[dir]);

    if (fwd_nbr_spinor[dir] == NULL || back_nbr_spinor[dir] == NULL)
      errorQuda("malloc failed for fwd_nbr_spinor/back_nbr_spinor");

#ifdef GPU_DIRECT
    pageable_fwd_nbr_spinor_sendbuf[dir] = fwd_nbr_spinor_sendbuf[dir];
    pageable_back_nbr_spinor_sendbuf[dir] = back_nbr_spinor_sendbuf[dir];
    pageable_fwd_nbr_spinor[dir] = fwd_nbr_spinor[dir];
    pageable_back_nbr_spinor[dir] = back_nbr_spinor[dir];
#else
    pageable_fwd_nbr_spinor_sendbuf[dir] = malloc(nbytes[dir]);
    pageable_back_nbr_spinor_sendbuf[dir] = malloc(nbytes[dir]);

    if (pageable_fwd_nbr_spinor_sendbuf[dir] == NULL || pageable_back_nbr_spinor_sendbuf[dir] == NULL)
      errorQuda("malloc failed for pageable_fwd_nbr_spinor_sendbuf/pageable_back_nbr_spinor_sendbuf");

    pageable_fwd_nbr_spinor[dir] = malloc(nbytes[dir]);
    pageable_back_nbr_spinor[dir] = malloc(nbytes[dir]);

    if (pageable_fwd_nbr_spinor[dir] == NULL || pageable_back_nbr_spinor[dir] == NULL)
      errorQuda("malloc failed for pageable_fwd_nbr_spinor/pageable_back_nbr_spinor");
#endif
  }

  return;
}
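// Example of the buffer sizing above (numbers purely illustrative): for a
// hypothetical 16^3 x 32 local lattice with Wilson spinors (nFace = 1,
// Ninternal = 4 spins x 3 colors x 2 reals = 24) in single precision,
// the T face has faceVolumeCB[3] = 16*16*16/2 = 2048 sites, so
// nbytes[3] = 1 * 2048 * 24 * 4 = 196608 bytes per ghost buffer; in half
// precision an extra 2048 * sizeof(float) bytes of norms is appended.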
errorQuda("FaceBuffer copy constructor not implemented"); 00095 } 00096 00097 // X here is a checkboarded volume 00098 void FaceBuffer::setupDims(const int* X) 00099 { 00100 Volume = 1; 00101 for (int d=0; d<nDim; d++) { 00102 this->X[d] = X[d]; 00103 Volume *= this->X[d]; 00104 } 00105 VolumeCB = Volume/2; 00106 00107 for (int i=0; i<nDim; i++) { 00108 faceVolume[i] = 1; 00109 for (int j=0; j<nDim; j++) { 00110 if (i==j) continue; 00111 faceVolume[i] *= this->X[j]; 00112 } 00113 faceVolumeCB[i] = faceVolume[i]/2; 00114 } 00115 } 00116 00117 FaceBuffer::~FaceBuffer() 00118 { 00119 00120 for(int i=0;i < QUDA_MAX_DIM; i++){ 00121 free((void*)recv_request1[i]); 00122 free((void*)recv_request2[i]); 00123 free((void*)send_request1[i]); 00124 free((void*)send_request2[i]); 00125 } 00126 00127 for(int dir =0; dir < 4; dir++){ 00128 if(fwd_nbr_spinor_sendbuf[dir]) { 00129 cudaFreeHost(fwd_nbr_spinor_sendbuf[dir]); 00130 fwd_nbr_spinor_sendbuf[dir] = NULL; 00131 } 00132 if(back_nbr_spinor_sendbuf[dir]) { 00133 cudaFreeHost(back_nbr_spinor_sendbuf[dir]); 00134 back_nbr_spinor_sendbuf[dir] = NULL; 00135 } 00136 if(fwd_nbr_spinor[dir]) { 00137 cudaFreeHost(fwd_nbr_spinor[dir]); 00138 fwd_nbr_spinor[dir] = NULL; 00139 } 00140 if(back_nbr_spinor[dir]) { 00141 cudaFreeHost(back_nbr_spinor[dir]); 00142 back_nbr_spinor[dir] = NULL; 00143 } 00144 00145 #ifdef GPU_DIRECT 00146 pageable_fwd_nbr_spinor_sendbuf[dir] = NULL; 00147 pageable_back_nbr_spinor_sendbuf[dir]=NULL; 00148 pageable_fwd_nbr_spinor[dir]=NULL; 00149 pageable_back_nbr_spinor[dir]=NULL; 00150 #else 00151 if(pageable_fwd_nbr_spinor_sendbuf[dir]){ 00152 free(pageable_fwd_nbr_spinor_sendbuf[dir]); 00153 pageable_fwd_nbr_spinor_sendbuf[dir] = NULL; 00154 } 00155 00156 if(pageable_back_nbr_spinor_sendbuf[dir]){ 00157 free(pageable_back_nbr_spinor_sendbuf[dir]); 00158 pageable_back_nbr_spinor_sendbuf[dir]=NULL; 00159 } 00160 00161 if(pageable_fwd_nbr_spinor[dir]){ 00162 free(pageable_fwd_nbr_spinor[dir]); 00163 pageable_fwd_nbr_spinor[dir]=NULL; 00164 } 00165 00166 if(pageable_back_nbr_spinor[dir]){ 00167 free(pageable_back_nbr_spinor[dir]); 00168 pageable_back_nbr_spinor[dir]=NULL; 00169 } 00170 #endif 00171 00172 00173 } 00174 } 00175 00176 void FaceBuffer::pack(cudaColorSpinorField &in, int parity, int dagger, int dim, cudaStream_t *stream_p) 00177 { 00178 if(!commDimPartitioned(dim)) return; 00179 00180 in.allocateGhostBuffer(); // allocate the ghost buffer if not yet allocated 00181 stream = stream_p; 00182 00183 in.packGhost(dim, (QudaParity)parity, dagger, &stream[Nstream-1]); 00184 } 00185 00186 void FaceBuffer::gather(cudaColorSpinorField &in, int dagger, int dir) 00187 { 00188 int dim = dir/2; 00189 if(!commDimPartitioned(dim)) return; 00190 00191 if (dir%2==0){ // backwards send 00192 in.sendGhost(back_nbr_spinor_sendbuf[dim], dim, QUDA_BACKWARDS, dagger, &stream[2*dim + sendBackStrmIdx]); 00193 } else { // forwards send 00194 in.sendGhost(fwd_nbr_spinor_sendbuf[dim], dim, QUDA_FORWARDS, dagger, &stream[2*dim + sendFwdStrmIdx]); 00195 } 00196 } 00197 00198 void FaceBuffer::commsStart(int dir) { 00199 int dim = dir / 2; 00200 if(!commDimPartitioned(dim)) return; 00201 00202 int back_nbr[4] = {X_BACK_NBR,Y_BACK_NBR,Z_BACK_NBR,T_BACK_NBR}; 00203 int fwd_nbr[4] = {X_FWD_NBR,Y_FWD_NBR,Z_FWD_NBR,T_FWD_NBR}; 00204 int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN}; 00205 int uptags[4] = {XUP, YUP, ZUP, TUP}; 00206 00207 if (dir %2 == 0) { 00208 // Prepost all receives 00209 00210 comm_recv_with_tag(pageable_fwd_nbr_spinor[dim], nbytes[dim], 
void FaceBuffer::commsStart(int dir) {
  int dim = dir / 2;
  if (!commDimPartitioned(dim)) return;

  int back_nbr[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int fwd_nbr[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};
  int uptags[4] = {XUP, YUP, ZUP, TUP};

  if (dir%2 == 0) {
    // prepost all receives
    comm_recv_with_tag(pageable_fwd_nbr_spinor[dim], nbytes[dim], fwd_nbr[dim], downtags[dim], recv_request1[dim]);
#ifndef GPU_DIRECT
    memcpy(pageable_back_nbr_spinor_sendbuf[dim], back_nbr_spinor_sendbuf[dim], nbytes[dim]);
#endif
    comm_send_with_tag(pageable_back_nbr_spinor_sendbuf[dim], nbytes[dim], back_nbr[dim], downtags[dim], send_request1[dim]);
  } else {
    comm_recv_with_tag(pageable_back_nbr_spinor[dim], nbytes[dim], back_nbr[dim], uptags[dim], recv_request2[dim]);
#ifndef GPU_DIRECT
    memcpy(pageable_fwd_nbr_spinor_sendbuf[dim], fwd_nbr_spinor_sendbuf[dim], nbytes[dim]);
#endif
    comm_send_with_tag(pageable_fwd_nbr_spinor_sendbuf[dim], nbytes[dim], fwd_nbr[dim], uptags[dim], send_request2[dim]);
  }
}

int FaceBuffer::commsQuery(int dir) {
  int dim = dir / 2;
  if (!commDimPartitioned(dim)) return 0;

  if (dir%2 == 0) {
    if (comm_query(recv_request1[dim]) && comm_query(send_request1[dim])) {
#ifndef GPU_DIRECT
      memcpy(fwd_nbr_spinor[dim], pageable_fwd_nbr_spinor[dim], nbytes[dim]);
#endif
      return 1;
    }
  } else {
    if (comm_query(recv_request2[dim]) && comm_query(send_request2[dim])) {
#ifndef GPU_DIRECT
      memcpy(back_nbr_spinor[dim], pageable_back_nbr_spinor[dim], nbytes[dim]);
#endif
      return 1;
    }
  }

  return 0;
}

void FaceBuffer::scatter(cudaColorSpinorField &out, int dagger, int dir) {
  int dim = dir / 2;
  if (!commDimPartitioned(dim)) return;

  if (dir%2 == 0) {
    out.unpackGhost(fwd_nbr_spinor[dim], dim, QUDA_FORWARDS, dagger, &stream[2*dim + recFwdStrmIdx]);
  } else {
    out.unpackGhost(back_nbr_spinor[dim], dim, QUDA_BACKWARDS, dagger, &stream[2*dim + recBackStrmIdx]);
  }
}
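// A minimal sketch of how the methods above might be driven for one
// partitioned dimension. This is illustrative only -- the real driver lives
// in the dslash code, and "face", "in", "parity", "dagger" and "streams" are
// hypothetical caller-side objects:
//
//   face.pack(in, parity, dagger, dim, streams);   // pack ghost zone on GPU
//   face.gather(in, dagger, 2*dim);                // device -> host, back face
//   face.gather(in, dagger, 2*dim+1);              // device -> host, fwd face
//   // ... synchronize the gather streams, then:
//   face.commsStart(2*dim);                        // post MPI recv/send pairs
//   face.commsStart(2*dim+1);
//   while (!face.commsQuery(2*dim)) { }            // poll; overlap interior work
//   face.scatter(in, dagger, 2*dim);               // host -> device ghost region
//   while (!face.commsQuery(2*dim+1)) { }
//   face.scatter(in, dagger, 2*dim+1);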
void FaceBuffer::exchangeCpuSpinor(cpuColorSpinorField &spinor, int oddBit, int dagger)
{
  // lengths for all dimensions
  int len[4] = {
    nFace*faceVolumeCB[0]*Ninternal*precision,
    nFace*faceVolumeCB[1]*Ninternal*precision,
    nFace*faceVolumeCB[2]*Ninternal*precision,
    nFace*faceVolumeCB[3]*Ninternal*precision
  };

  // allocate the ghost buffer if not yet allocated
  spinor.allocateGhostBuffer();

  for (int i=0; i < 4; i++) {
    spinor.packGhost(spinor.backGhostFaceSendBuffer[i], i, QUDA_BACKWARDS, (QudaParity)oddBit, dagger);
    spinor.packGhost(spinor.fwdGhostFaceSendBuffer[i], i, QUDA_FORWARDS, (QudaParity)oddBit, dagger);
  }

  int back_nbr[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int fwd_nbr[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int uptags[4] = {XUP, YUP, ZUP, TUP};
  int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};

  for (int i=0; i < 4; i++) {
    comm_recv_with_tag(spinor.backGhostFaceBuffer[i], len[i], back_nbr[i], uptags[i], recv_request1[i]);
    comm_recv_with_tag(spinor.fwdGhostFaceBuffer[i], len[i], fwd_nbr[i], downtags[i], recv_request2[i]);
    comm_send_with_tag(spinor.fwdGhostFaceSendBuffer[i], len[i], fwd_nbr[i], uptags[i], send_request1[i]);
    comm_send_with_tag(spinor.backGhostFaceSendBuffer[i], len[i], back_nbr[i], downtags[i], send_request2[i]);
  }

  for (int i=0; i < 4; i++) {
    comm_wait(recv_request1[i]);
    comm_wait(recv_request2[i]);
    comm_wait(send_request1[i]);
    comm_wait(send_request2[i]);
  }
}

void FaceBuffer::exchangeCpuLink(void** ghost_link, void** link_sendbuf) {
  int uptags[4] = {XUP, YUP, ZUP, TUP};
  int fwd_nbrs[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_nbrs[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};

  for (int dir=0; dir < 4; dir++) {
    int len = 2*nFace*faceVolumeCB[dir]*Ninternal;
    MPI_Request recv_request;
    comm_recv_with_tag(ghost_link[dir], len*precision, back_nbrs[dir], uptags[dir], &recv_request);
    MPI_Request send_request;
    comm_send_with_tag(link_sendbuf[dir], len*precision, fwd_nbrs[dir], uptags[dir], &send_request);
    comm_wait(&recv_request);
    comm_wait(&send_request);
  }
}

void reduceMaxDouble(double &max) {
#ifdef MPI_COMMS
  comm_allreduce_max(&max);
#endif
}

void reduceDouble(double &sum) {
#ifdef MPI_COMMS
  if (globalReduce) comm_allreduce(&sum);
#endif
}

void reduceDoubleArray(double *sum, const int len) {
#ifdef MPI_COMMS
  if (globalReduce) comm_allreduce_array(sum, len);
#endif
}

int commDim(int dir) { return comm_dim(dir); }

int commCoords(int dir) { return comm_coords(dir); }

int commDimPartitioned(int dir) { return comm_dim_partitioned(dir); }

void commDimPartitionedSet(int dir) { comm_dim_partitioned_set(dir); }

void commBarrier() { comm_barrier(); }


/**************************************************************
 * Staple exchange routines
 * used in the fat-link computation
 ***************************************************************/
#if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE)

#define gaugeSiteSize 18

#ifndef GPU_DIRECT
static void* fwd_nbr_staple_cpu[4];
static void* back_nbr_staple_cpu[4];
static void* fwd_nbr_staple_sendbuf_cpu[4];
static void* back_nbr_staple_sendbuf_cpu[4];
#endif

static void* fwd_nbr_staple_gpu[4];
static void* back_nbr_staple_gpu[4];

static void* fwd_nbr_staple[4];
static void* back_nbr_staple[4];
static void* fwd_nbr_staple_sendbuf[4];
static void* back_nbr_staple_sendbuf[4];

static int dims[4];
static int X1, X2, X3, X4;
static int V;
static int Vh;
static int Vs[4], Vsh[4];
static int Vs_x, Vs_y, Vs_z, Vs_t;
static int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
static MPI_Request llfat_send_request1[4];
static MPI_Request llfat_recv_request1[4];
static MPI_Request llfat_send_request2[4];
static MPI_Request llfat_recv_request2[4];

#include "gauge_field.h"
extern void setup_dims_in_gauge(int *XX);

static void setup_dims(int* X)
{
  V = 1;
  for (int d=0; d < 4; d++) {
    V *= X[d];
    dims[d] = X[d];
  }
  Vh = V/2;

  X1 = X[0];
  X2 = X[1];
  X3 = X[2];
  X4 = X[3];

  Vs[0] = Vs_x = X[1]*X[2]*X[3];
  Vs[1] = Vs_y = X[0]*X[2]*X[3];
  Vs[2] = Vs_z = X[0]*X[1]*X[3];
  Vs[3] = Vs_t = X[0]*X[1]*X[2];

  Vsh[0] = Vsh_x = Vs_x/2;
  Vsh[1] = Vsh_y = Vs_y/2;
  Vsh[2] = Vsh_z = Vs_z/2;
  Vsh[3] = Vsh_t = Vs_t/2;
}
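// For example (numbers purely illustrative), a 16^3 x 32 local lattice gives
// V = 131072, Vh = 65536, a T-direction face of Vs_t = 16*16*16 = 4096 sites,
// and a checkerboarded half-face of Vsh_t = 2048 sites; one face of staple
// data then occupies Vs_t * gaugeSiteSize * prec bytes.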
void exchange_llfat_init(QudaPrecision prec)
{
  static int initialized = 0;
  if (initialized) {
    return;
  }
  initialized = 1;

  for (int i=0; i < 4; i++) {
    if (cudaMalloc((void**)&fwd_nbr_staple_gpu[i], Vs[i]*gaugeSiteSize*prec) != cudaSuccess) {
      errorQuda("cudaMalloc() failed for fwd_nbr_staple_gpu");
    }
    if (cudaMalloc((void**)&back_nbr_staple_gpu[i], Vs[i]*gaugeSiteSize*prec) != cudaSuccess) {
      errorQuda("cudaMalloc() failed for back_nbr_staple_gpu");
    }

    cudaMallocHost((void**)&fwd_nbr_staple[i], Vs[i]*gaugeSiteSize*prec);
    cudaMallocHost((void**)&back_nbr_staple[i], Vs[i]*gaugeSiteSize*prec);

    cudaMallocHost((void**)&fwd_nbr_staple_sendbuf[i], Vs[i]*gaugeSiteSize*prec);
    cudaMallocHost((void**)&back_nbr_staple_sendbuf[i], Vs[i]*gaugeSiteSize*prec);
  }

#ifndef GPU_DIRECT
  for (int i=0; i < 4; i++) {
    fwd_nbr_staple_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    back_nbr_staple_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    if (fwd_nbr_staple_cpu[i] == NULL || back_nbr_staple_cpu[i] == NULL) {
      printf("ERROR: malloc failed for fwd_nbr_staple_cpu/back_nbr_staple_cpu\n");
      comm_exit(1);
    }
  }

  for (int i=0; i < 4; i++) {
    fwd_nbr_staple_sendbuf_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    back_nbr_staple_sendbuf_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    if (fwd_nbr_staple_sendbuf_cpu[i] == NULL || back_nbr_staple_sendbuf_cpu[i] == NULL) {
      printf("ERROR: malloc failed for fwd_nbr_staple_sendbuf_cpu/back_nbr_staple_sendbuf_cpu\n");
      comm_exit(1);
    }
  }
#endif

  return;
}
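// In the diagonal exchange below, the ghost slot for the ordered pair (nu,mu)
// is ghost_sitelink_diag[nu*4+mu], and each message covers the plane spanned
// by the two remaining dimensions dir1 and dir2. For example, for
// (nu,mu) = (X,Y) the loops pick dir1 = Z and dir2 = T, so
// len = X[2]*X[3]*18*sizeof(Float) bytes are exchanged.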
template<typename Float>
void exchange_sitelink_diag(int* X, Float** sitelink, Float** ghost_sitelink_diag, int optflag)
{
  /*
    nu |          |
       |__________|
            mu

   * There are in total 12 different combinations for (nu,mu),
   * since nu/mu = X,Y,Z,T and nu != mu.
   * For each combination, we need to communicate with the corresponding
   * neighbor to get the diagonal ghost data.
   * The neighbor we need to get data from has dx[nu]=-1, dx[mu]=+1,
   * and we need to send our data to the neighbor with dx[nu]=+1, dx[mu]=-1.
   */

  for (int nu = XUP; nu <= TUP; nu++) {
    for (int mu = XUP; mu <= TUP; mu++) {
      if (nu == mu) {
        continue;
      }
      if (optflag && (!commDimPartitioned(mu) || !commDimPartitioned(nu))) {
        continue;
      }

      int dir1, dir2; // the other two dimensions
      for (dir1=0; dir1 < 4; dir1++) {
        if (dir1 != nu && dir1 != mu) {
          break;
        }
      }
      for (dir2=0; dir2 < 4; dir2++) {
        if (dir2 != nu && dir2 != mu && dir2 != dir1) {
          break;
        }
      }

      if (dir1 == 4 || dir2 == 4) {
        errorQuda("Invalid dir1/dir2");
      }
      int len = X[dir1]*X[dir2]*gaugeSiteSize*sizeof(Float);
      void* sendbuf = malloc(len);
      if (sendbuf == NULL) {
        errorQuda("malloc failed for diag sendbuf");
      }

      pack_gauge_diag(sendbuf, X, (void**)sitelink, nu, mu, dir1, dir2, (QudaPrecision)sizeof(Float));

      // post the receive
      int dx[4] = {0, 0, 0, 0};
      dx[nu] = -1;
      dx[mu] = +1;
      int src_rank = comm_get_neighbor_rank(dx[0], dx[1], dx[2], dx[3]);
      MPI_Request recv_request;
      comm_recv_from_rank(ghost_sitelink_diag[nu*4+mu], len, src_rank, &recv_request);

      // do the send
      dx[nu] = +1;
      dx[mu] = -1;
      int dst_rank = comm_get_neighbor_rank(dx[0], dx[1], dx[2], dx[3]);
      MPI_Request send_request;
      comm_send_to_rank(sendbuf, len, dst_rank, &send_request);

      comm_wait(&recv_request);
      comm_wait(&send_request);

      free(sendbuf);
    } // mu
  } // nu
}


template<typename Float>
void exchange_sitelink(int* X, Float** sitelink, Float** ghost_sitelink, Float** ghost_sitelink_diag,
                       Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf, int optflag)
{
#if 0
  int i;
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);
  for (i=0; i < 4; i++) {
    Float* even_sitelink_back_src = sitelink[i];
    Float* odd_sitelink_back_src = sitelink[i] + Vh*gaugeSiteSize;
    Float* sitelink_back_dst = sitelink_back_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;

    if (dims[3] % 2 == 0) {
      memcpy(sitelink_back_dst, even_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, odd_sitelink_back_src, len);
    } else {
      // switch the odd and even ghost site links
      memcpy(sitelink_back_dst, odd_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, even_sitelink_back_src, len);
    }
  }

  for (i=0; i < 4; i++) {
    Float* even_sitelink_fwd_src = sitelink[i] + (Vh - Vsh_t)*gaugeSiteSize;
    Float* odd_sitelink_fwd_src = sitelink[i] + Vh*gaugeSiteSize + (Vh - Vsh_t)*gaugeSiteSize;
    Float* sitelink_fwd_dst = sitelink_fwd_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;
    if (dims[3] % 2 == 0) {
      memcpy(sitelink_fwd_dst, even_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, odd_sitelink_fwd_src, len);
    } else {
      // switch the odd and even ghost site links
      memcpy(sitelink_fwd_dst, odd_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, even_sitelink_fwd_src, len);
    }
  }
#else
  int nFace = 1;
  for (int dir=0; dir < 4; dir++) {
    if (optflag && !commDimPartitioned(dir)) continue;
    pack_ghost_all_links((void**)sitelink, (void**)sitelink_back_sendbuf, (void**)sitelink_fwd_sendbuf,
                         dir, nFace, (QudaPrecision)(sizeof(Float)), X);
  }
#endif

  int fwd_neighbors[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_neighbors[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int up_tags[4] = {XUP, YUP, ZUP, TUP};
  int down_tags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};

  for (int dir=0; dir < 4; dir++) {
    if (optflag && !commDimPartitioned(dir)) continue;
    int len = Vsh[dir]*gaugeSiteSize*sizeof(Float);
    Float* ghost_sitelink_back = ghost_sitelink[dir];
    Float* ghost_sitelink_fwd = ghost_sitelink[dir] + 8*Vsh[dir]*gaugeSiteSize;

    MPI_Request recv_request1;
    MPI_Request recv_request2;
    comm_recv_with_tag(ghost_sitelink_back, 8*len, back_neighbors[dir], up_tags[dir], &recv_request1);
    comm_recv_with_tag(ghost_sitelink_fwd, 8*len, fwd_neighbors[dir], down_tags[dir], &recv_request2);
    MPI_Request send_request1;
    MPI_Request send_request2;
    comm_send_with_tag(sitelink_fwd_sendbuf[dir], 8*len, fwd_neighbors[dir], up_tags[dir], &send_request1);
    comm_send_with_tag(sitelink_back_sendbuf[dir], 8*len, back_neighbors[dir], down_tags[dir], &send_request2);
    comm_wait(&recv_request1);
    comm_wait(&recv_request2);
    comm_wait(&send_request1);
    comm_wait(&send_request2);
  }

  exchange_sitelink_diag(X, sitelink, ghost_sitelink_diag, optflag);
}
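// Layout of ghost_sitelink[dir] as used above: the factor of 8 in the offsets
// is 4 link directions times 2 parities, i.e. each message of 8*len bytes
// carries one checkerboarded face (Vsh[dir] sites) of all four links for both
// parities; the back ghost occupies the first half of the buffer and the
// forward ghost starts at offset 8*Vsh[dir]*gaugeSiteSize.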
// This function is used in the link-fattening computation.
// @optflag: if this flag is set, we only communicate in the directions that
//           are partitioned; if it is not set, we communicate in all
//           directions regardless of the partitioning
void exchange_cpu_sitelink(int* X,
                           void** sitelink, void** ghost_sitelink,
                           void** ghost_sitelink_diag,
                           QudaPrecision gPrecision, QudaGaugeParam* param, int optflag)
{
  setup_dims(X);
  static void* sitelink_fwd_sendbuf[4];
  static void* sitelink_back_sendbuf[4];
  static int allocated = 0;

  if (!allocated) {
    for (int i=0; i < 4; i++) {
      sitelink_fwd_sendbuf[i] = malloc(4*Vs[i]*gaugeSiteSize*gPrecision);
      sitelink_back_sendbuf[i] = malloc(4*Vs[i]*gaugeSiteSize*gPrecision);
      if (sitelink_fwd_sendbuf[i] == NULL || sitelink_back_sendbuf[i] == NULL) {
        errorQuda("malloc failed for sitelink_fwd_sendbuf/sitelink_back_sendbuf");
      }
      memset(sitelink_fwd_sendbuf[i], 0, 4*Vs[i]*gaugeSiteSize*gPrecision);
      memset(sitelink_back_sendbuf[i], 0, 4*Vs[i]*gaugeSiteSize*gPrecision);
    }
    allocated = 1;
  }

  if (gPrecision == QUDA_DOUBLE_PRECISION) {
    exchange_sitelink(X, (double**)sitelink, (double**)(ghost_sitelink), (double**)ghost_sitelink_diag,
                      (double**)sitelink_fwd_sendbuf, (double**)sitelink_back_sendbuf, optflag);
  } else { // single precision
    exchange_sitelink(X, (float**)sitelink, (float**)(ghost_sitelink), (float**)ghost_sitelink_diag,
                      (float**)sitelink_fwd_sendbuf, (float**)sitelink_back_sendbuf, optflag);
  }

  if (!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
    for (int i=0; i < 4; i++) {
      free(sitelink_fwd_sendbuf[i]);
      free(sitelink_back_sendbuf[i]);
    }
    allocated = 0;
  }
}
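// The extended exchange below works on a halo-extended local lattice: each
// dimension grows by 4 (two ghost slices on each side), so E_i = X_i + 4 and
// a site (x,y,z,t) of the extended volume has lexicographic index
// t*E3*E2*E1 + z*E2*E1 + y*E1 + x, with even and odd sites stored in two
// halves of Vh_ex = E4*E3*E2*E1/2 entries each; the f_main/f_bound stride
// tables below are these strides permuted per exchange direction.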
/* This function exchanges the site links and stores them in the corresponding
 * portion of the extended site-link memory region.
 * @sitelink: stored according to the extended dimension sizes (X1+4)(X2+4)(X3+4)(X4+4)
 */
void exchange_cpu_sitelink_ex(int* X, void** sitelink, QudaGaugeFieldOrder cpu_order,
                              QudaPrecision gPrecision, int optflag)
{
  int X1, X2, X3, X4;
  int E1, E2, E3, E4;
  X1 = X[0]; X2 = X[1]; X3 = X[2]; X4 = X[3];
  E1 = X[0]+4; E2 = X[1]+4; E3 = X[2]+4; E4 = X[3]+4;
  int E3E2E1 = E3*E2*E1;
  int E2E1 = E2*E1;
  int E4E3E2 = E4*E3*E2;
  int E3E2 = E3*E2;
  int E4E3E1 = E4*E3*E1;
  int E3E1 = E3*E1;
  int E4E2E1 = E4*E2*E1;
  int Vh_ex = E4*E3*E2*E1/2;

  //                 x      y      z      t
  int starta[] = {2, 2, 2, 0};
  int enda[] = {X4+2, X4+2, X4+2, X3+4};

  int startb[] = {2, 2, 0, 0};
  int endb[] = {X3+2, X3+2, X2+4, X2+4};

  int startc[] = {2, 0, 0, 0};
  int endc[] = {X2+2, X1+4, X1+4, X1+4};

  int f_main[4][4] = {
    {E3E2E1, E2E1, E1, 1},
    {E3E2E1, E2E1, 1, E1},
    {E3E2E1, E1, 1, E2E1},
    {E2E1, E1, 1, E3E2E1}
  };

  int f_bound[4][4] = {
    {E3E2, E2, 1, E4E3E2},
    {E3E1, E1, 1, E4E3E1},
    {E2E1, E1, 1, E4E2E1},
    {E2E1, E1, 1, E3E2E1}
  };

  int nslices = 2;
  int slice_3d[] = {E4E3E2, E4E3E1, E4E2E1, E3E2E1};
  int len[4];
  for (int i=0; i < 4; i++) {
    len[i] = slice_3d[i] * nslices * 4*gaugeSiteSize*gPrecision; // 2 slices of 4 directions' links
  }
  void* ghost_sitelink_fwd_sendbuf[4];
  void* ghost_sitelink_back_sendbuf[4];
  void* ghost_sitelink_fwd[4];
  void* ghost_sitelink_back[4];
  for (int i=0; i < 4; i++) {
    if (!commDimPartitioned(i)) continue;
    ghost_sitelink_fwd_sendbuf[i] = malloc(len[i]);
    ghost_sitelink_back_sendbuf[i] = malloc(len[i]);
    if (ghost_sitelink_fwd_sendbuf[i] == NULL || ghost_sitelink_back_sendbuf[i] == NULL) {
      errorQuda("malloc for ghost sitelink send buffer failed");
    }

    ghost_sitelink_fwd[i] = malloc(len[i]);
    ghost_sitelink_back[i] = malloc(len[i]);
    if (ghost_sitelink_fwd[i] == NULL || ghost_sitelink_back[i] == NULL) {
      errorQuda("malloc for ghost sitelink failed");
    }
  }

  int back_nbr[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int fwd_nbr[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int uptags[4] = {XUP, YUP, ZUP, TUP};
  int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};
  MPI_Request recv_request1[4], recv_request2[4];
  MPI_Request send_request1[4], send_request2[4];

  int gaugebytes = gaugeSiteSize*gPrecision;
  int a, b, c, d;
  for (int dir=0; dir < 4; dir++) {
    if ((!commDimPartitioned(dir)) && optflag) continue;
    if (commDimPartitioned(dir)) {
      // fill the send buffers here
      // back
      for (d=2; d < 4; d++)
        for (a=starta[dir]; a < enda[dir]; a++)
          for (b=startb[dir]; b < endb[dir]; b++)

            if (f_main[dir][2] != 1 || f_bound[dir][2] != 1) {
              for (c=startc[dir]; c < endc[dir]; c++) {
                int oddness = (a+b+c+d)%2;
                int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-2)*f_bound[dir][3]) >> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if ((X[dir] % 2 == 1) && (commDim(dir) > 1)) { // switch even/odd position
                  dst_oddness = 1-oddness;
                }

#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir) \
  if (src_oddness) { \
    src_idx += Vh_ex; \
  } \
  if (dst_oddness) { \
    dst_idx += nslices*slice_3d[dir]/2; \
  } \
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) { \
    for (int linkdir=0; linkdir < 4; linkdir++) { \
      char* src = (char*)sitelink[linkdir] + (src_idx)*gaugebytes; \
      char* dst = ((char*)ghost_buf[dir]) + linkdir*nslices*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num)); \
    } \
  } else { /* QUDA_MILC_GAUGE_ORDER */ \
    char* src = ((char*)sitelink) + 4*(src_idx)*gaugebytes; \
    char* dst = ((char*)ghost_buf[dir]) + 4*(dst_idx)*gaugebytes; \
    memcpy(dst, src, 4*gaugebytes*(num)); \
  }

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, 1, dir);

              } // c
            } else {
              for (int loop=0; loop < 2; loop++) {
                c = startc[dir]+loop;
                if (c < endc[dir]) {
                  int oddness = (a+b+c+d)%2;
                  int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-2)*f_bound[dir][3]) >> 1;

                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if ((X[dir] % 2 == 1) && (commDim(dir) > 1)) { // switch even/odd position
                    dst_oddness = 1-oddness;
                  }
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir);

                } // if c
              } // for loop
            } // if

      // fwd
      for (d=X[dir]; d < X[dir]+2; d++)
        for (a=starta[dir]; a < enda[dir]; a++)
          for (b=startb[dir]; b < endb[dir]; b++)

            if (f_main[dir][2] != 1 || f_bound[dir][2] != 1) {
              for (c=startc[dir]; c < endc[dir]; c++) {
                int oddness = (a+b+c+d)%2;
                int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3]) >> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if ((X[dir] % 2 == 1) && (commDim(dir) > 1)) { // switch even/odd position
                  dst_oddness = 1-oddness;
                }

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, 1, dir);
              } // c
            } else {
              for (int loop=0; loop < 2; loop++) {
                c = startc[dir]+loop;
                if (c < endc[dir]) {
                  int oddness = (a+b+c+d)%2;
                  int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3]) >> 1;

                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if ((X[dir] % 2 == 1) && (commDim(dir) > 1)) { // switch even/odd position
                    dst_oddness = 1-oddness;
                  }

                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir);

                }
              } // for loop
            } // if

      comm_recv_with_tag(ghost_sitelink_back[dir], len[dir], back_nbr[dir], uptags[dir], &recv_request1[dir]);
      comm_recv_with_tag(ghost_sitelink_fwd[dir], len[dir], fwd_nbr[dir], downtags[dir], &recv_request2[dir]);
      comm_send_with_tag(ghost_sitelink_fwd_sendbuf[dir], len[dir], fwd_nbr[dir], uptags[dir], &send_request1[dir]);
      comm_send_with_tag(ghost_sitelink_back_sendbuf[dir], len[dir], back_nbr[dir], downtags[dir], &send_request2[dir]);

      // the messages must have arrived before we can send the next ones
      comm_wait(&recv_request1[dir]);
      comm_wait(&recv_request2[dir]);
      comm_wait(&send_request1[dir]);
      comm_wait(&send_request2[dir]);
    } // if
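    // Why the even/odd swap above: when the local extent X[dir] is odd and
    // the lattice is split across more than one rank in that direction, a
    // site that is even on this rank lands on an odd extended-lattice site of
    // the receiving neighbor, so the packed face is written into the opposite
    // parity half of the message buffer (dst_oddness = 1 - oddness).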
    // use the received messages to fill the sitelink data
    // back
    if (dir < 3) {
      for (d=0; d < 2; d++)
        for (a=starta[dir]; a < enda[dir]; a++)
          for (b=startb[dir]; b < endb[dir]; b++)
            if (f_main[dir][2] != 1 || f_bound[dir][2] != 1) {
              for (c=startc[dir]; c < endc[dir]; c++) {
                int oddness = (a+b+c+d)%2;
                int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int src_idx;
                if (commDimPartitioned(dir)) {
                  src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + d*f_bound[dir][3]) >> 1;
                } else {
                  src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d+X[dir])*f_main[dir][3]) >> 1;
                }

#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir) \
  if (oddness) { \
    if (commDimPartitioned(dir)) { \
      src_idx += nslices*slice_3d[dir]/2; \
    } else { \
      src_idx += Vh_ex; \
    } \
    dst_idx += Vh_ex; \
  } \
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) { \
    for (int linkdir=0; linkdir < 4; linkdir++) { \
      char* src; \
      if (commDimPartitioned(dir)) { \
        src = ((char*)ghost_buf[dir]) + linkdir*nslices*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
      } else { \
        src = ((char*)sitelink[linkdir]) + (src_idx)*gaugebytes; \
      } \
      char* dst = (char*)sitelink[linkdir] + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num)); \
    } \
  } else { /* QUDA_MILC_GAUGE_ORDER */ \
    char* src; \
    if (commDimPartitioned(dir)) { \
      src = ((char*)ghost_buf[dir]) + 4*(src_idx)*gaugebytes; \
    } else { \
      src = ((char*)sitelink) + 4*(src_idx)*gaugebytes; \
    } \
    char* dst = ((char*)sitelink) + 4*(dst_idx)*gaugebytes; \
    memcpy(dst, src, 4*gaugebytes*(num)); \
  }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, 1, dir);

              } // c
            } else {
              // optimized copy
              // first half: startc[dir] -> endc[dir] with step = 2
              for (int loop=0; loop < 2; loop++) {
                int c = startc[dir]+loop;
                if (c < endc[dir]) {
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int src_idx;
                  if (commDimPartitioned(dir)) {
                    src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + d*f_bound[dir][3]) >> 1;
                  } else {
                    src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d+X[dir])*f_main[dir][3]) >> 1;
                  }

                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, (endc[dir]-c+1)/2, dir);

                } // if c
              } // for loop
            } // if
    } else {
      // when dir == 3 (the T direction), the data layout of the site links and
      // of the message is the same, so we can do large copies

#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir) \
  /* even */ \
  int even_dst_idx = (dst_face*E3E2E1)/2; \
  int even_src_idx; \
  if (commDimPartitioned(dir)) { \
    even_src_idx = 0; \
  } else { \
    even_src_idx = (src_face*E3E2E1)/2; \
  } \
  /* odd */ \
  int odd_dst_idx = even_dst_idx + Vh_ex; \
  int odd_src_idx; \
  if (commDimPartitioned(dir)) { \
    odd_src_idx = nslices*slice_3d[dir]/2; \
  } else { \
    odd_src_idx = even_src_idx + Vh_ex; \
  } \
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) { \
    for (int linkdir=0; linkdir < 4; linkdir++) { \
      char* dst = (char*)sitelink[linkdir]; \
      char* src; \
      if (commDimPartitioned(dir)) { \
        src = ((char*)ghost_buf[dir]) + linkdir*nslices*slice_3d[dir]*gaugebytes; \
      } else { \
        src = (char*)sitelink[linkdir]; \
      } \
      memcpy(dst + even_dst_idx*gaugebytes, src + even_src_idx*gaugebytes, nslices*slice_3d[dir]*gaugebytes/2); \
      memcpy(dst + odd_dst_idx*gaugebytes, src + odd_src_idx*gaugebytes, nslices*slice_3d[dir]*gaugebytes/2); \
    } \
  } else { /* QUDA_MILC_GAUGE_ORDER */ \
    char* dst = (char*)sitelink; \
    char* src; \
    if (commDimPartitioned(dir)) { \
      src = (char*)ghost_buf[dir]; \
    } else { \
      src = (char*)sitelink; \
    } \
    memcpy(dst + 4*even_dst_idx*gaugebytes, src + 4*even_src_idx*gaugebytes, 4*nslices*slice_3d[dir]*gaugebytes/2); \
    memcpy(dst + 4*odd_dst_idx*gaugebytes, src + 4*odd_src_idx*gaugebytes, 4*nslices*slice_3d[dir]*gaugebytes/2); \
  }

      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0, X4, dir)
    } // if
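    // For dir == 3 the two ghost time-slices are already contiguous in both
    // the message and the extended field, which is what allows the single
    // large memcpy per parity above: each two-slice pair holds
    // nslices*slice_3d[3]/2 = E3*E2*E1 sites per parity.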
    // fwd
    if (dir < 3) {
      for (d=X[dir]+2; d < X[dir]+4; d++)
        for (a=starta[dir]; a < enda[dir]; a++)
          for (b=startb[dir]; b < endb[dir]; b++)

            if (f_main[dir][2] != 1 || f_bound[dir][2] != 1) {
              for (c=startc[dir]; c < endc[dir]; c++) {
                int oddness = (a+b+c+d)%2;
                int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int src_idx;
                if (commDimPartitioned(dir)) {
                  src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir]-2)*f_bound[dir][3]) >> 1;
                } else {
                  src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d-X[dir])*f_main[dir][3]) >> 1;
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, 1, dir);

              } // c
            } else {
              for (int loop=0; loop < 2; loop++) {
                c = startc[dir] + loop;
                if (c < endc[dir]) {
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int src_idx;
                  if (commDimPartitioned(dir)) {
                    src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir]-2)*f_bound[dir][3]) >> 1;
                  } else {
                    src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d-X[dir])*f_main[dir][3]) >> 1;
                  }

                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, (endc[dir]-c+1)/2, dir);
                } // if

              } // for loop
            } // if

    } else {
      // when dir == 3 (the T direction), the data layout of the site links and
      // of the message is the same, so we can do large copies
      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (X4+2), 2, dir)
    } // if

  } // dir for loop

  for (int dir=0; dir < 4; dir++) {
    if (!commDimPartitioned(dir)) continue;
    free(ghost_sitelink_fwd_sendbuf[dir]);
    free(ghost_sitelink_back_sendbuf[dir]);
    free(ghost_sitelink_fwd[dir]);
    free(ghost_sitelink_back[dir]);
  }
}
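// Note: the exchange above proceeds one dimension at a time, completing the
// sends and receives for each direction before starting the next one. Because
// the pack loops for later dimensions run over the full extended range of the
// earlier ones, ghost data received in an earlier dimension is re-sent in
// later dimensions, so the corner regions of the extended lattice get filled
// without a separate corner exchange.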
template<typename Float>
void do_exchange_cpu_staple(Float* staple, Float** ghost_staple, Float** staple_fwd_sendbuf,
                            Float** staple_back_sendbuf, int* X)
{
#if 0
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);
  Float* even_staple_back_src = staple;
  Float* odd_staple_back_src = staple + Vh*gaugeSiteSize;
  Float* staple_back_dst = staple_back_sendbuf[3];

  if (dims[3] % 2 == 0) {
    memcpy(staple_back_dst, even_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, odd_staple_back_src, len);
  } else {
    // switch the odd and even ghost staples
    memcpy(staple_back_dst, odd_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, even_staple_back_src, len);
  }

  Float* even_staple_fwd_src = staple + (Vh - Vsh_t)*gaugeSiteSize;
  Float* odd_staple_fwd_src = staple + Vh*gaugeSiteSize + (Vh - Vsh_t)*gaugeSiteSize;
  Float* staple_fwd_dst = staple_fwd_sendbuf[3];
  if (dims[3] % 2 == 0) {
    memcpy(staple_fwd_dst, even_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, odd_staple_fwd_src, len);
  } else {
    // switch the odd and even ghost staples
    memcpy(staple_fwd_dst, odd_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, even_staple_fwd_src, len);
  }
#else
  int nFace = 1;
  pack_ghost_all_staples_cpu(staple, (void**)staple_back_sendbuf,
                             (void**)staple_fwd_sendbuf, nFace,
                             (QudaPrecision)(sizeof(Float)), X);
#endif

  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};
  int len[4] = {
    Vsh_x*gaugeSiteSize*sizeof(Float),
    Vsh_y*gaugeSiteSize*sizeof(Float),
    Vsh_z*gaugeSiteSize*sizeof(Float),
    Vsh_t*gaugeSiteSize*sizeof(Float)
  };

  int fwd_neighbors[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_neighbors[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int up_tags[4] = {XUP, YUP, ZUP, TUP};
  int down_tags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};

  for (int dir=0; dir < 4; dir++) {
    Float* ghost_staple_back = ghost_staple[dir];
    Float* ghost_staple_fwd = ghost_staple[dir] + 2*Vsh[dir]*gaugeSiteSize;

    MPI_Request recv_request1;
    MPI_Request recv_request2;
    MPI_Request send_request1;
    MPI_Request send_request2;
    comm_recv_with_tag(ghost_staple_back, 2*len[dir], back_neighbors[dir], up_tags[dir], &recv_request1);
    comm_recv_with_tag(ghost_staple_fwd, 2*len[dir], fwd_neighbors[dir], down_tags[dir], &recv_request2);
    comm_send_with_tag(staple_fwd_sendbuf[dir], 2*len[dir], fwd_neighbors[dir], up_tags[dir], &send_request1);
    comm_send_with_tag(staple_back_sendbuf[dir], 2*len[dir], back_neighbors[dir], down_tags[dir], &send_request2);

    comm_wait(&recv_request1);
    comm_wait(&recv_request2);
    comm_wait(&send_request1);
    comm_wait(&send_request2);
  }
}

// This function is used in the link-fattening computation.
void exchange_cpu_staple(int* X, void* staple, void** ghost_staple, QudaPrecision gPrecision)
{
  setup_dims(X);

  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
  void* staple_fwd_sendbuf[4];
  void* staple_back_sendbuf[4];

  for (int i=0; i < 4; i++) {
    staple_fwd_sendbuf[i] = malloc(Vs[i]*gaugeSiteSize*gPrecision);
    staple_back_sendbuf[i] = malloc(Vs[i]*gaugeSiteSize*gPrecision);
    if (staple_fwd_sendbuf[i] == NULL || staple_back_sendbuf[i] == NULL) {
      printf("ERROR: malloc failed for staple_fwd_sendbuf/staple_back_sendbuf\n");
      exit(1);
    }
  }

  if (gPrecision == QUDA_DOUBLE_PRECISION) {
    do_exchange_cpu_staple((double*)staple, (double**)ghost_staple,
                           (double**)staple_fwd_sendbuf, (double**)staple_back_sendbuf, X);
  } else { // single precision
    do_exchange_cpu_staple((float*)staple, (float**)ghost_staple,
                           (float**)staple_fwd_sendbuf, (float**)staple_back_sendbuf, X);
  }

  for (int i=0; i < 4; i++) {
    free(staple_fwd_sendbuf[i]);
    free(staple_back_sendbuf[i]);
  }
}
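// Layout of ghost_staple[dir] as used in do_exchange_cpu_staple above: the
// back ghost face occupies the first Vs[dir] = 2*Vsh[dir] sites and the
// forward ghost face starts at offset 2*Vsh[dir]*gaugeSiteSize, with each
// message of 2*len[dir] bytes carrying both parities of one face.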
// @whichway indicates the send direction
void exchange_gpu_staple_start(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t* stream)
{
  setup_dims(X);

  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;
  exchange_llfat_init(cudaStaple->Precision());

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volume = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  packGhostStaple(X, even, odd, volume, prec, stride,
                  dir, whichway, fwd_nbr_staple_gpu, back_nbr_staple_gpu,
                  fwd_nbr_staple_sendbuf, back_nbr_staple_sendbuf, stream);
}
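// A sketch of how the three phases might be chained for one direction and one
// send orientation (illustrative only; "staple" is a hypothetical
// cudaGaugeField* and "stream" a hypothetical CUDA stream):
//
//   exchange_gpu_staple_start(X, staple, dir, QUDA_BACKWARDS, &stream); // pack + D2H copy
//   exchange_gpu_staple_comms(X, staple, dir, QUDA_BACKWARDS, &stream); // post MPI send/recv
//   // ... overlap other work here ...
//   exchange_gpu_staple_wait (X, staple, dir, QUDA_BACKWARDS, &stream); // wait + unpack ghost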
// @whichway indicates the send direction;
// we use recv_whichway to indicate the receive direction
void exchange_gpu_staple_comms(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t* stream)
{
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;
  QudaPrecision prec = cudaStaple->Precision();

  int fwd_neighbors[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_neighbors[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int up_tags[4] = {XUP, YUP, ZUP, TUP};
  int down_tags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};

  cudaStreamSynchronize(*stream);

  int recv_whichway;
  if (whichway == QUDA_BACKWARDS) {
    recv_whichway = QUDA_FORWARDS;
  } else {
    recv_whichway = QUDA_BACKWARDS;
  }

  int i = dir;
  int len = Vs[i]*gaugeSiteSize*prec;

  if (recv_whichway == QUDA_BACKWARDS) {
#ifdef GPU_DIRECT
    comm_recv_with_tag(back_nbr_staple[i], len, back_neighbors[i], up_tags[i], &llfat_recv_request1[i]);
    comm_send_with_tag(fwd_nbr_staple_sendbuf[i], len, fwd_neighbors[i], up_tags[i], &llfat_send_request1[i]);
#else
    comm_recv_with_tag(back_nbr_staple_cpu[i], len, back_neighbors[i], up_tags[i], &llfat_recv_request1[i]);
    memcpy(fwd_nbr_staple_sendbuf_cpu[i], fwd_nbr_staple_sendbuf[i], len);
    comm_send_with_tag(fwd_nbr_staple_sendbuf_cpu[i], len, fwd_neighbors[i], up_tags[i], &llfat_send_request1[i]);
#endif
  } else { // QUDA_FORWARDS
#ifdef GPU_DIRECT
    comm_recv_with_tag(fwd_nbr_staple[i], len, fwd_neighbors[i], down_tags[i], &llfat_recv_request2[i]);
    comm_send_with_tag(back_nbr_staple_sendbuf[i], len, back_neighbors[i], down_tags[i], &llfat_send_request2[i]);
#else
    comm_recv_with_tag(fwd_nbr_staple_cpu[i], len, fwd_neighbors[i], down_tags[i], &llfat_recv_request2[i]);
    memcpy(back_nbr_staple_sendbuf_cpu[i], back_nbr_staple_sendbuf[i], len);
    comm_send_with_tag(back_nbr_staple_sendbuf_cpu[i], len, back_neighbors[i], down_tags[i], &llfat_send_request2[i]);
#endif
  }
}


// @whichway indicates the send direction;
// we use recv_whichway to indicate the receive direction
void exchange_gpu_staple_wait(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t* stream)
{
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volume = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  int recv_whichway;
  if (whichway == QUDA_BACKWARDS) {
    recv_whichway = QUDA_FORWARDS;
  } else {
    recv_whichway = QUDA_BACKWARDS;
  }

  int i = dir;
  int len = Vs[i]*gaugeSiteSize*prec;

  if (recv_whichway == QUDA_BACKWARDS) {
    comm_wait(&llfat_recv_request1[i]);
    comm_wait(&llfat_send_request1[i]);

#ifdef GPU_DIRECT
    unpackGhostStaple(X, even, odd, volume, prec, stride,
                      i, QUDA_BACKWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#else
    memcpy(back_nbr_staple[i], back_nbr_staple_cpu[i], len);
    unpackGhostStaple(X, even, odd, volume, prec, stride,
                      i, QUDA_BACKWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#endif
  } else { // QUDA_FORWARDS
    comm_wait(&llfat_recv_request2[i]);
    comm_wait(&llfat_send_request2[i]);

#ifdef GPU_DIRECT
    unpackGhostStaple(X, even, odd, volume, prec, stride,
                      i, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#else
    memcpy(fwd_nbr_staple[i], fwd_nbr_staple_cpu[i], len);
    unpackGhostStaple(X, even, odd, volume, prec, stride,
                      i, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#endif
  }
}
void exchange_llfat_cleanup(void)
{
  for (int i=0; i < 4; i++) {
    if (fwd_nbr_staple_gpu[i]) {
      cudaFree(fwd_nbr_staple_gpu[i]); fwd_nbr_staple_gpu[i] = NULL;
    }
    if (back_nbr_staple_gpu[i]) {
      cudaFree(back_nbr_staple_gpu[i]); back_nbr_staple_gpu[i] = NULL;
    }
  }

#ifndef GPU_DIRECT
  for (int i=0; i < 4; i++) {
    if (fwd_nbr_staple_cpu[i]) {
      free(fwd_nbr_staple_cpu[i]); fwd_nbr_staple_cpu[i] = NULL;
    }
    if (back_nbr_staple_cpu[i]) {
      free(back_nbr_staple_cpu[i]); back_nbr_staple_cpu[i] = NULL;
    }
  }
  for (int i=0; i < 4; i++) {
    if (fwd_nbr_staple_sendbuf_cpu[i]) {
      free(fwd_nbr_staple_sendbuf_cpu[i]); fwd_nbr_staple_sendbuf_cpu[i] = NULL;
    }
    if (back_nbr_staple_sendbuf_cpu[i]) {
      free(back_nbr_staple_sendbuf_cpu[i]); back_nbr_staple_sendbuf_cpu[i] = NULL;
    }
  }
#endif

  for (int i=0; i < 4; i++) {
    if (fwd_nbr_staple[i]) {
      cudaFreeHost(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
    }
    if (back_nbr_staple[i]) {
      cudaFreeHost(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
    }
  }

  for (int i=0; i < 4; i++) {
    if (fwd_nbr_staple_sendbuf[i]) {
      cudaFreeHost(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
    }
    if (back_nbr_staple_sendbuf[i]) {
      cudaFreeHost(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
    }
  }
}

#endif