18 Ninternal(Ninternal), precision(precision), nDim(nDim), nDimComms(nDim), nFace(nFace)
26 recFwdStrmIdx = sendBackStrmIdx;
27 recBackStrmIdx = sendFwdStrmIdx;
30 for (
int i=0; i<nDimComms; i++) {
31 nbytes[i] = nFace*faceVolumeCB[i]*Ninternal*precision;
35 my_back_face[i] = allocatePinned(2*nbytes[i]);
36 my_fwd_face[i] = (
char*)my_back_face[i] + nbytes[i];
37 from_back_face[i] = allocatePinned(2*nbytes[i]);
38 from_fwd_face[i] = (
char*)from_back_face[i] + nbytes[i];
40 #ifdef GPU_DIRECT // just alias the pointer
41 ib_my_fwd_face[i] = my_fwd_face[i];
42 ib_my_back_face[i] = my_back_face[i];
43 ib_from_fwd_face[i] = from_fwd_face[i];
44 ib_from_back_face[i] = from_back_face[i];
45 #else // if no GPUDirect so need separate IB and GPU host buffers
54 for (
int i=0; i<nDimComms; i++) {
66 errorQuda(
"FaceBuffer copy constructor not implemented");
72 for (
int i=0; i<nDimComms; i++) {
86 freePinned(from_back_face[i]);
87 freePinned(my_back_face[i]);
90 for (
int i=0; i<nDimComms; i++) {
91 ib_my_fwd_face[i] = NULL;
92 ib_my_back_face[i] = NULL;
93 ib_from_fwd_face[i] = NULL;
94 ib_from_back_face[i] = NULL;
96 my_fwd_face[i] = NULL;
97 my_back_face[i] = NULL;
98 from_fwd_face[i] = NULL;
99 from_back_face[i] = NULL;
101 mh_recv_fwd[i] = NULL;
102 mh_recv_back[i] = NULL;
103 mh_send_fwd[i] = NULL;
104 mh_send_back[i] = NULL;
112 void FaceBuffer::setupDims(
const int*
X,
int Ls)
115 for (
int d=0; d<4; d++) this->X[d] = X[d];
117 this->X[nDim-1] =
Ls;
122 for (
int d=0; d<nDim; d++) Volume *= this->X[d];
125 for (
int i=0; i<nDim; i++) {
127 for (
int j=0; j<nDim; j++) {
129 faceVolume[i] *= this->X[j];
131 faceVolumeCB[i] = faceVolume[i]/2;
137 std::multimap<size_t, void *> FaceBuffer::pinnedCache;
140 std::map<void *, size_t> FaceBuffer::pinnedSize;
143 void *FaceBuffer::allocatePinned(
size_t nbytes)
145 std::multimap<size_t, void *>::iterator it;
148 if (pinnedCache.empty()) {
151 it = pinnedCache.lower_bound(nbytes);
152 if (it != pinnedCache.end()) {
155 pinnedCache.erase(it);
157 it = pinnedCache.begin();
159 pinnedCache.erase(it);
164 pinnedSize[ptr] = nbytes;
169 void FaceBuffer::freePinned(
void *ptr)
171 if (!pinnedSize.count(ptr)) {
172 errorQuda(
"Attempt to free invalid pointer");
174 pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
175 pinnedSize.erase(ptr);
181 std::multimap<size_t, void *>::iterator it;
182 for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
183 void *ptr = it->second;
219 struct commCallback_t {
229 void CUDART_CB commCallback(cudaStream_t
stream, cudaError_t status,
void *data) {
230 const unsigned long long dir = (
unsigned long long)data;
234 memcpy(commCB[dir].ib_buffer, commCB[dir].face_buffer, commCB[dir].bytes);
245 commCB[
dir].mh_recv = mh_recv_fwd[dim];
246 commCB[
dir].mh_send = mh_send_back[dim];
247 commCB[
dir].ib_buffer = ib_my_back_face[dim];
248 commCB[
dir].face_buffer = my_back_face[dim];
249 commCB[
dir].bytes = nbytes[dim];
251 commCB[
dir].mh_recv = mh_recv_back[dim];
252 commCB[
dir].mh_send = mh_send_fwd[dim];
253 commCB[
dir].ib_buffer = ib_my_fwd_face[dim];
254 commCB[
dir].face_buffer = my_fwd_face[dim];
255 commCB[
dir].bytes = nbytes[dim];
258 cudaStreamAddCallback(
stream[dir], commCallback, (
void*)dir, 0);
261 #else // !defined(QUDA_CALLBACK)
271 memcpy(ib_my_back_face[dim], my_back_face[dim], nbytes[dim]);
279 memcpy(ib_my_fwd_face[dim], my_fwd_face[dim], nbytes[dim]);
285 #endif // QUDA_CALLBACK
296 memcpy(from_fwd_face[dim], ib_from_fwd_face[dim], nbytes[dim]);
303 memcpy(from_back_face[dim], ib_from_back_face[dim], nbytes[dim]);
333 for(
int i=0;i < 4; i++){
343 for (
int i=0; i<nDimComms; i++) {
350 for (
int i=0; i<nDimComms; i++) {
357 for (
int i=0; i<nDimComms; i++) {
364 for (
int i=0; i<nDimComms; i++) {
378 for (
int i=0; i<nDimComms; i++) {
379 int len = 2*nFace*faceVolumeCB[i]*Ninternal;
384 for (
int i=0; i<nDimComms; i++) {
389 for (
int i=0; i<nDimComms; i++) {
394 for (
int i=0; i<nDimComms; i++) {