16 my_face(0), from_face(0), Ninternal(Ninternal), precision(precision), nDim(nDim),
17 nDimComms(nDim), nFace(nFace)
25 recFwdStrmIdx = sendBackStrmIdx;
26 recBackStrmIdx = sendFwdStrmIdx;
30 for (int i=0; i<nDimComms; i++) {
31 nbytes[i] = nFace*faceVolumeCB[i]*Ninternal*precision;
35 faceBytes += 2*nbytes[i];
39 my_face = allocatePinned(faceBytes);
40 from_face = allocatePinned(faceBytes);
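The loop at line 45 just below carves these two pinned allocations into per-dimension backward/forward slices; the offset updates themselves are not visible in this excerpt. A hypothetical reconstruction of the intended layout, assuming the offset advances by nbytes[i] after every slice (this is a sketch, not the verbatim source):

// Hypothetical reconstruction of the slice layout: per dimension, the backward
// slice is followed by the forward slice, packed back-to-back in each pinned block.
#include <cstddef>

void sliceFaces(void *my_face, void *from_face, const size_t *nbytes, int nDimComms,
                void **my_back_face, void **from_back_face,
                void **my_fwd_face, void **from_fwd_face) {
  size_t offset = 0;
  for (int i = 0; i < nDimComms; i++) {
    my_back_face[i]   = static_cast<char*>(my_face)   + offset;
    from_back_face[i] = static_cast<char*>(from_face) + offset;
    offset += nbytes[i];   // assumed increment; the update lines are elided in the listing

    my_fwd_face[i]    = static_cast<char*>(my_face)   + offset;
    from_fwd_face[i]  = static_cast<char*>(from_face) + offset;
    offset += nbytes[i];   // assumed increment
  }
}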
45 for (int i=0; i<nDimComms; i++) {
48 my_back_face[i] = (char*)my_face + offset;
49 from_back_face[i] = (char*)from_face + offset;
52 my_fwd_face[i] = (char*)my_face + offset;
53 from_fwd_face[i] = (char*)from_face + offset;
56 #ifdef GPU_DIRECT // just alias the pointer
57 ib_my_fwd_face[i] = my_fwd_face[i];
58 ib_my_back_face[i] = my_back_face[i];
59 ib_from_fwd_face[i] = from_fwd_face[i];
60 ib_from_back_face[i] = from_back_face[i];
61 #else // no GPUDirect, so we need separate IB and GPU host buffers
69 for (int i=0; i<nDimComms; i++) {
82 errorQuda("FaceBuffer copy constructor not implemented");
88 for (int i=0; i<nDimComms; i++) {
104 for (int i=0; i<nDimComms; i++) {
105 ib_my_fwd_face[i] = NULL;
106 ib_my_back_face[i] = NULL;
107 ib_from_fwd_face[i] = NULL;
108 ib_from_back_face[i] = NULL;
110 my_fwd_face[i] = NULL;
111 my_back_face[i] = NULL;
112 from_fwd_face[i] = NULL;
113 from_back_face[i] = NULL;
115 mh_recv_fwd[i] = NULL;
116 mh_recv_back[i] = NULL;
117 mh_send_fwd[i] = NULL;
118 mh_send_back[i] = NULL;
121 if (from_face) freePinned(from_face);
122 if (my_face) freePinned(my_face);
129 void FaceBuffer::setupDims(const int *X, int Ls)
132 for (int d=0; d<4; d++) this->X[d] = X[d];
134 this->X[nDim-1] = Ls;
139 for (int d=0; d<nDim; d++) Volume *= this->X[d];
142 for (int i=0; i<nDim; i++) {
144 for (int j=0; j<nDim; j++) {
146 faceVolume[i] *= this->X[j];
148 faceVolumeCB[i] = faceVolume[i]/2;
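As a concrete illustration of the bookkeeping in setupDims (the i==j skip in the inner loop is not visible in this excerpt), here is a standalone sketch with hypothetical lattice extents, assuming the usual convention that faceVolume[i] is the product of all extents except X[i]:

#include <cstdio>

int main() {
  const int nDim = 4;
  const int X[nDim] = {8, 8, 8, 16};        // example local lattice, hypothetical values

  int Volume = 1;
  for (int d = 0; d < nDim; d++) Volume *= X[d];

  int faceVolume[nDim], faceVolumeCB[nDim];
  for (int i = 0; i < nDim; i++) {
    faceVolume[i] = 1;
    for (int j = 0; j < nDim; j++) {
      if (i == j) continue;                 // assumed skip, elided in the listing above
      faceVolume[i] *= X[j];
    }
    faceVolumeCB[i] = faceVolume[i] / 2;    // checkerboarded (single-parity) face volume
  }

  printf("Volume = %d\n", Volume);          // 8192 for this example
  for (int i = 0; i < nDim; i++)
    printf("faceVolume[%d] = %d, faceVolumeCB[%d] = %d\n", i, faceVolume[i], i, faceVolumeCB[i]);
  return 0;
}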
154 std::multimap<size_t, void *> FaceBuffer::pinnedCache;
157 std::map<void *, size_t> FaceBuffer::pinnedSize;
160 void *FaceBuffer::allocatePinned(size_t nbytes)
162 std::multimap<size_t, void *>::iterator it;
165 if (pinnedCache.empty()) {
168 it = pinnedCache.lower_bound(nbytes);
169 if (it != pinnedCache.end()) {
172 pinnedCache.erase(it);
174 it = pinnedCache.begin();
176 pinnedCache.erase(it);
181 pinnedSize[ptr] = nbytes;
186 void FaceBuffer::freePinned(void *ptr)
188 if (!pinnedSize.count(ptr)) {
189 errorQuda("Attempt to free invalid pointer");
191 pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
192 pinnedSize.erase(ptr);
198 std::multimap<size_t, void *>::iterator it;
199 for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
200 void *ptr = it->second;
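The cache above keys spare pinned buffers by size, so repeated exchanges of the same face volume avoid reallocating pinned host memory, and flushPinnedCache releases whatever is parked. A minimal stand-alone sketch of the same pattern (not the QUDA implementation; std::malloc/std::free stand in for the pinned host allocator actually used):

#include <cstdlib>
#include <map>
#include <utility>

static std::multimap<size_t, void*> cache;  // spare buffers, keyed by their size
static std::map<void*, size_t> sizes;       // size of every buffer handed out

void *allocateCached(size_t nbytes) {
  void *ptr;
  auto it = cache.lower_bound(nbytes);      // smallest cached buffer that still fits
  if (it != cache.end()) {
    ptr = it->second;                       // reuse it
    cache.erase(it);
  } else {
    if (!cache.empty()) {                   // nothing fits: evict the smallest entry
      it = cache.begin();
      std::free(it->second);
      cache.erase(it);
    }
    ptr = std::malloc(nbytes);              // fresh allocation
  }
  sizes[ptr] = nbytes;
  return ptr;
}

void freeCached(void *ptr) {
  cache.insert(std::make_pair(sizes[ptr], ptr));  // park for reuse rather than freeing
  sizes.erase(ptr);
}

void flushCache() {                         // release everything that is parked
  for (auto &entry : cache) std::free(entry.second);
  cache.clear();
}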
208 cudaStream_t *stream_p, bool zeroCopyPack, double a, double b)
215 cudaHostGetDevicePointer(&my_face_d, my_face, 0);
236 cudaStream_t *stream_p, bool zeroCopyPack, double a, double b)
239 pack(in, dim, dir, parity, dagger, stream_p, zeroCopyPack, a, b);
243 cudaStream_t *stream_p, bool zeroCopyPack, double a, double b)
247 pack(in, dim, dir, parity, dagger, stream_p, zeroCopyPack, a, b);
255 const int stream_idx = (dir%2 == 0) ? dir+sendBackStrmIdx : dir-1+sendFwdStrmIdx;
256 gather(in, dagger, dir, stream_idx);
265 struct commCallback_t {
275 void CUDART_CB commCallback(cudaStream_t stream, cudaError_t status, void *data) {
276 const unsigned long long dir = (unsigned long long)data;
280 memcpy(commCB[dir].ib_buffer, commCB[dir].face_buffer, commCB[dir].bytes);
291 commCB[dir].mh_recv = mh_recv_fwd[dim];
292 commCB[dir].mh_send = mh_send_back[dim];
293 commCB[dir].ib_buffer = ib_my_back_face[dim];
294 commCB[dir].face_buffer = my_back_face[dim];
295 commCB[dir].bytes = nbytes[dim];
297 commCB[dir].mh_recv = mh_recv_back[dim];
298 commCB[dir].mh_send = mh_send_fwd[dim];
299 commCB[dir].ib_buffer = ib_my_fwd_face[dim];
300 commCB[dir].face_buffer = my_fwd_face[dim];
301 commCB[dir].bytes = nbytes[dim];
304 cudaStreamAddCallback(stream[dir], commCallback, (void*)dir, 0);
307 #else // !defined(QUDA_CALLBACK)
317 memcpy(ib_my_back_face[dim], my_back_face[dim], nbytes[dim]);
325 memcpy(ib_my_fwd_face[dim], my_fwd_face[dim], nbytes[dim]);
349 memcpy(ib_my_back_face[dim], my_back_face[dim], nbytes[dim]);
355 memcpy(ib_my_fwd_face[dim], my_fwd_face[dim], nbytes[dim]);
361 #endif // QUDA_CALLBACK
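The QUDA_CALLBACK branch above defers the host-side staging copy into the IB buffer (and, presumably, the subsequent send) to a stream callback, so the CPU thread does not have to block waiting for the gather. A self-contained sketch of that pattern, assuming only the stock CUDA runtime API; a ready flag stands in for the actual non-blocking send:

// Minimal sketch: once the async D2H copy of a face has finished on its stream, a
// host callback copies the pinned buffer into a separate "IB" buffer and marks the
// message ready (real code would start the non-blocking send here instead).
#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

struct Staging {
  char *pinned;         // device-visible pinned host buffer (gather target)
  char *ib;             // plain host buffer handed to the network layer
  size_t bytes;
  volatile bool ready;  // stands in for posting the send
};

void CUDART_CB stageAndSend(cudaStream_t /*stream*/, cudaError_t status, void *data) {
  Staging *s = static_cast<Staging*>(data);
  if (status != cudaSuccess) return;
  memcpy(s->ib, s->pinned, s->bytes);   // host-side staging copy
  s->ready = true;                      // the send would be started here
}

int main() {
  const size_t bytes = 1 << 20;
  Staging s; s.bytes = bytes; s.ready = false;

  char *d_face;
  cudaMalloc(&d_face, bytes);
  cudaMemset(d_face, 0x5a, bytes);
  cudaHostAlloc((void**)&s.pinned, bytes, cudaHostAllocDefault);
  s.ib = new char[bytes];

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(s.pinned, d_face, bytes, cudaMemcpyDeviceToHost, stream); // "gather"
  cudaStreamAddCallback(stream, stageAndSend, &s, 0);                       // runs after the copy
  cudaStreamSynchronize(stream);

  printf("ready=%d first byte=0x%x\n", s.ready ? 1 : 0, (unsigned char)s.ib[0]);
  cudaFreeHost(s.pinned); cudaFree(d_face); delete[] s.ib;
  cudaStreamDestroy(stream);
  return 0;
}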
372 memcpy(from_fwd_face[dim], ib_from_fwd_face[dim], nbytes[dim]);
379 memcpy(from_back_face[dim], ib_from_back_face[dim], nbytes[dim]);
402 scatter(out, dagger, dir, (dir/2)*2);
412 for (int i=0; i<4; i++) {
424 for (int i=0; i<nDimComms; i++) {
432 for (int i=0; i<nDimComms; i++) {
444 for (int i=0; i<nDimComms; i++) {
452 for (int i=0; i<nDimComms; i++) {
468 for (int i=0; i<nDimComms; i++) bytes[i] = 2*nFace*faceVolumeCB[i]*Ninternal*precision;
473 for (int i=0; i<nDimComms; i++) {
475 send[i] = link_sendbuf[i];
476 receive[i] = ghost_link[i];
478 memcpy(ghost_link[i], link_sendbuf[i], bytes[i]);
482 for (int i=0; i<nDimComms; i++) {
484 send[i] = allocatePinned(bytes[i]);
485 receive[i] = allocatePinned(bytes[i]);
486 cudaMemcpy(send[i], link_sendbuf[i], bytes[i], cudaMemcpyDeviceToHost);
488 cudaMemcpy(ghost_link[i], link_sendbuf[i], bytes[i], cudaMemcpyDeviceToDevice);
493 for (int i=0; i<nDimComms; i++) {
499 for (int i=0; i<nDimComms; i++) {
505 for (int i=0; i<nDimComms; i++) {
512 for (int i=0; i<nDimComms; i++) {
514 cudaMemcpy(ghost_link[i], receive[i], bytes[i], cudaMemcpyHostToDevice);
516 freePinned(receive[i]);
520 for (int i=0; i<nDimComms; i++) {
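Most of the message-handle plumbing inside the exchangeLink loops above is elided. A hedged sketch of one round of that exchange using QUDA's comm_declare_send_relative / comm_declare_receive_relative handles; the direction signs are chosen for illustration, and send, receive, bytes and nDimComms are the names from the listing above (this is not the verbatim source):

// Illustrative only: exchange one boundary slab with both neighbours of each
// partitioned dimension, using the staged (or aliased) host buffers set up above.
for (int i = 0; i < nDimComms; i++) {
  if (!commDimPartitioned(i)) continue;
  MsgHandle *mh_recv = comm_declare_receive_relative(receive[i], i, -1, bytes[i]);
  MsgHandle *mh_send = comm_declare_send_relative(send[i], i, +1, bytes[i]);
  comm_start(mh_recv);
  comm_start(mh_send);
  comm_wait(mh_send);
  comm_wait(mh_recv);
  comm_free(mh_send);
  comm_free(mh_recv);
}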