QUDA  v0.7.0
A library for QCD on GPUs
staggered_oprod.cu
1 #include <cstdio>
2 #include <cstdlib>
3 #include <staggered_oprod.h>
4 
5 #include <tune_quda.h>
6 #include <quda_internal.h>
7 #include <gauge_field_order.h>
8 #include <quda_matrix.h>
9 
10 namespace quda {
11 
12 #ifdef GPU_STAGGERED_OPROD
13 
14  namespace { // anonymous
15 #include <texture.h>
16  }
17 
18  template<int N>
19  void createEventArray(cudaEvent_t (&event)[N], unsigned int flags=cudaEventDefault)
20  {
21  for(int i=0; i<N; ++i)
22  cudaEventCreate(&event[i],flags);
23  return;
24  }
25 
26  template<int N>
27  void destroyEventArray(cudaEvent_t (&event)[N])
28  {
29  for(int i=0; i<N; ++i)
30  cudaEventDestroy(event[i]);
31  }
32 
33 
34  static cudaEvent_t packEnd;
35  static cudaEvent_t gatherEnd[4];
36  static cudaEvent_t scatterEnd[4];
37  static cudaEvent_t oprodStart;
38  static cudaEvent_t oprodEnd;
39 
40 
41  void createStaggeredOprodEvents(){
42 #ifdef MULTI_GPU
43  cudaEventCreate(&packEnd, cudaEventDisableTiming);
44  createEventArray(gatherEnd, cudaEventDisableTiming);
45  createEventArray(scatterEnd, cudaEventDisableTiming);
46 #endif
47  cudaEventCreate(&oprodStart, cudaEventDisableTiming);
48  cudaEventCreate(&oprodEnd, cudaEventDisableTiming);
49  return;
50  }
51 
52  void destroyStaggeredOprodEvents(){
53 #ifdef MULTI_GPU
54  destroyEventArray(gatherEnd);
55  destroyEventArray(scatterEnd);
56  cudaEventDestroy(packEnd);
57 #endif
58  cudaEventDestroy(oprodStart);
59  cudaEventDestroy(oprodEnd);
60  return;
61  }
62 
63 
64  enum KernelType {OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL};
65 
66  template<typename Complex, typename Output, typename InputA, typename InputB>
67  struct StaggeredOprodArg {
68  unsigned int length;
69  int X[4];
70  unsigned int parity;
71  unsigned int dir;
72  unsigned int ghostOffset;
73  unsigned int displacement;
74  KernelType kernelType;
75  bool partitioned[4];
76  InputA inA;
77  InputB inB;
78  Output outA;
79  Output outB;
80  cudaGaugeField& outFieldA;
81  cudaGaugeField& outFieldB;
82  typename RealTypeId<Complex>::Type coeff[2];
83 
84  StaggeredOprodArg(const unsigned int length,
85  const int X[4],
86  const unsigned int parity,
87  const unsigned int dir,
88  const unsigned int ghostOffset,
89  const unsigned int displacement,
90  const KernelType& kernelType,
91  const double coeff[2],
92  InputA& inA,
93  InputB& inB,
94  Output& outA,
95  Output& outB,
96  cudaGaugeField& outFieldA,
97  cudaGaugeField& outFieldB) : length(length), parity(parity), ghostOffset(ghostOffset),
98  displacement(displacement), kernelType(kernelType), inA(inA), inB(inB), outA(outA), outB(outB),
99  outFieldA(outFieldA), outFieldB(outFieldB)
100  {
101  this->coeff[0] = coeff[0];
102  this->coeff[1] = coeff[1];
103  for(int i=0; i<4; ++i) this->X[i] = X[i];
104  for(int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
105  }
106  };
107 
108  enum IndexType {
109  EVEN_X = 0,
110  EVEN_Y = 1,
111  EVEN_Z = 2,
112  EVEN_T = 3
113  };
114 
115  template <IndexType idxType>
116  static __device__ __forceinline__ void coordsFromIndex(int& idx, int c[4],
117  const unsigned int cb_idx, const unsigned int parity, const int X[4])
118  {
119  const int &LX = X[0];
120  const int &LY = X[1];
121  const int &LZ = X[2];
122  const int XYZ = X[2]*X[1]*X[0];
123  const int XY = X[1]*X[0];
124 
125  idx = 2*cb_idx;
126 
127  int x, y, z, t;
128 
129  if (idxType == EVEN_X ) { // X even
130  // t = idx / XYZ;
131  // z = (idx / XY) % Z;
132  // y = (idx / X) % Y;
133  // idx += (parity + t + z + y) & 1;
134  // x = idx % X;
135  // equivalent to the above, but with fewer divisions/mods:
136  int aux1 = idx / LX;
137  x = idx - aux1 * LX;
138  int aux2 = aux1 / LY;
139  y = aux1 - aux2 * LY;
140  t = aux2 / LZ;
141  z = aux2 - t * LZ;
142  aux1 = (parity + t + z + y) & 1;
143  x += aux1;
144  idx += aux1;
145  } else if (idxType == EVEN_Y ) { // Y even
146  t = idx / XYZ;
147  z = (idx / XY) % LZ;
148  idx += (parity + t + z) & 1;
149  y = (idx / LX) % LY;
150  x = idx % LX;
151  } else if (idxType == EVEN_Z ) { // Z even
152  t = idx / XYZ;
153  idx += (parity + t) & 1;
154  z = (idx / XY) % LZ;
155  y = (idx / LX) % LY;
156  x = idx % LX;
157  } else {
158  idx += parity;
159  t = idx / XYZ;
160  z = (idx / XY) % LZ;
161  y = (idx / LX) % LY;
162  x = idx % LX;
163  }
164 
165  c[0] = x;
166  c[1] = y;
167  c[2] = z;
168  c[3] = t;
169  }
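 // Annotation (not part of the original source): a worked example of the EVEN_X branch above,
 // under the assumed inputs X = {4,4,4,4}, parity = 1, cb_idx = 5:
 //   idx = 2*cb_idx = 10  ->  x = 2, y = 2, z = 0, t = 0
 //   (parity + t + z + y) & 1 = 1, so x becomes 3 and idx becomes 11.
 // The site (3,2,0,0) has full lexicographic index ((t*X[2]+z)*X[1]+y)*X[0]+x = 11 and odd
 // parity, and 11 >> 1 = 5 recovers the checkerboard index we started from.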
170 
171 
172 
173 
174  // Get the coordinates for the exterior kernels
175  template<int Nspin>
176  __device__ void coordsFromIndex(int x[4], const unsigned int cb_idx, const int X[4], const unsigned int dir, const int displacement, const unsigned int parity)
177  {
178 
179  if(Nspin == 1){
180  unsigned int Xh[2] = {X[0]/2, X[1]/2};
181  switch(dir){
182  case 0:
183  x[2] = cb_idx/Xh[1] % X[2];
184  x[3] = cb_idx/(Xh[1]*X[2]) % X[3];
185  x[0] = cb_idx/(Xh[1]*X[2]*X[3]);
186  x[0] += (X[0] - displacement);
187  x[1] = 2*(cb_idx % Xh[1]) + ((x[0]+x[2]+x[3]+parity)&1);
188  break;
189 
190  case 1:
191  x[2] = cb_idx/Xh[0] % X[2];
192  x[3] = cb_idx/(Xh[0]*X[2]) % X[3];
193  x[1] = cb_idx/(Xh[0]*X[2]*X[3]);
194  x[1] += (X[1] - displacement);
195  x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
196 
197  break;
198 
199  case 2:
200  x[1] = cb_idx/Xh[0] % X[1];
201  x[3] = cb_idx/(Xh[0]*X[1]) % X[3];
202  x[2] = cb_idx/(Xh[0]*X[1]*X[3]);
203  x[2] += (X[2] - displacement);
204  x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
205 
206  break;
207 
208  case 3:
209  x[1] = cb_idx/Xh[0] % X[1];
210  x[2] = cb_idx/(Xh[0]*X[1]) % X[2];
211  x[3] = cb_idx/(Xh[0]*X[1]*X[2]);
212  x[3] += (X[3] - displacement);
213  x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
214 
215  break;
216  }
217  }else if(Nspin == 3){
218  // currently unsupported
219  }
220  return;
221  }
222 
223 
224  template<int Nspin, int Nface>
225  __device__ int ghostIndexFromCoords(const int x[4], const int X[4], const unsigned int dir, const int shift){
226  return 0;
227  }
228 
229 
230 
231  template<>
232  __device__ int ghostIndexFromCoords<1,3>(
233  const int x[4],
234  const int X[4],
235  unsigned int dir,
236  const int shift)
237  {
238  int ghost_idx;
239  if(shift > 0){
240  if((x[dir] + shift) >= X[dir]){
241  switch(dir){
242  case 0:
243  ghost_idx = (3*3 + (x[0]-X[0]+shift))*(X[3]*X[2]*X[1])/2 + ((x[3]*X[2] + x[2])*X[1] + x[1])/2;
244  break;
245  case 1:
246  ghost_idx = (3*3 + (x[1]-X[1]+shift))*(X[3]*X[2]*X[0])/2 + (x[3]*X[2]*X[0] + x[2]*X[0] + x[0])/2;
247  break;
248  case 2:
249  ghost_idx = (3*3 + (x[2]-X[2]+shift))*(X[3]*X[1]*X[0])/2 + (x[3]*X[1]*X[0] + x[1]*X[0] + x[0])/2;
250  break;
251  case 3:
252  ghost_idx = (3*3 + (x[3]-X[3]+shift))*(X[2]*X[1]*X[0])/2 + (x[2]*X[1]*X[0] + x[1]*X[0] + x[0])/2;
253  break;
254  default:
255  break;
256  } // switch
257  } // x[dir] + shift >= X[dir]
258  }else{ // shift < 0
259  if(static_cast<int>(x[dir]) + shift < 0){
260  switch(dir){
261  case 0:
262  ghost_idx = (3 + shift)*(X[3]*X[2]*X[1])/2 + ((x[3]*X[2] + x[2])*X[1] + x[1])/2;
263  break;
264  case 1:
265  ghost_idx = (3 + shift)*(X[3]*X[2]*X[0])/2 + ((x[3]*X[2] + x[2])*X[0] + x[0])/2;
266  break;
267  case 2:
268  ghost_idx = (3 + shift)*(X[3]*X[1]*X[0])/2 + ((x[3]*X[1] + x[1])*X[0] + x[0])/2;
269  break;
270  case 3:
271  ghost_idx = (3 + shift)*(X[2]*X[1]*X[0])/2 + ((x[2]*X[1] + x[1])*X[0] + x[0])/2;
272  break;
273  } // switch(dir)
274  }
275  } // shift < 0
276 
277  return ghost_idx;
278  }
279 
280 
281 
282 
283  __device__ __forceinline__
284  int neighborIndex(const unsigned int& cb_idx, const int shift[4], const bool partitioned[4], const unsigned int& parity,
285  const int X[4]){
286 
287  int full_idx;
288  int x[4];
289 
290 
291  coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);
292 
293 #ifdef MULTI_GPU
294  for(int dim = 0; dim<4; ++dim){
295  if(partitioned[dim])
296  if( (x[dim]+shift[dim])<0 || (x[dim]+shift[dim])>=X[dim]) return -1;
297  }
298 #endif
299 
300  for(int dim=0; dim<4; ++dim){
301  x[dim] = shift[dim] ? (x[dim]+shift[dim] + X[dim]) % X[dim] : x[dim];
302  }
303  return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
304  }
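 // Annotation (not part of the original source): neighborIndex returns -1 when the shifted
 // site would leave the local sub-lattice in a partitioned dimension, so the interior kernel
 // skips that contribution and the exterior kernel supplies it from the ghost zone instead.
 // Illustrative example, assuming X = {4,4,4,4}, parity = 1, cb_idx = 5 (site (3,2,0,0)) and
 // shift = {1,0,0,0}:
 //   - if dimension 0 is partitioned, x[0]+1 = 4 >= X[0] and the function returns -1;
 //   - if it is not partitioned, the site wraps to (0,2,0,0) with full lexicographic index 8,
 //     and the returned checkerboard index is 8 >> 1 = 4.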
305 
306 
307 
308  template<typename Complex, typename Output, typename InputA, typename InputB>
309  __global__ void interiorOprodKernel(StaggeredOprodArg<Complex, Output, InputA, InputB> arg)
310  {
311  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
312  const unsigned int gridSize = gridDim.x*blockDim.x;
313 
314  typedef typename RealTypeId<Complex>::Type real;
315  Complex x[3];
316  Complex y[3];
317  Complex z[3];
318  Matrix<Complex,3> result;
319  Matrix<Complex,3> tempA, tempB; // input
320 
321 
322  while(idx<arg.length){
323  arg.inA.load(x, idx);
324  for(int dir=0; dir<4; ++dir){
325  int shift[4] = {0,0,0,0};
326  shift[dir] = 1;
327  const int first_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
328  if(first_nbr_idx >= 0){
329  arg.inB.load(y, first_nbr_idx);
330  outerProd(y,x,&result);
331  arg.outA.load(reinterpret_cast<real*>(tempA.data), idx, dir, arg.parity);
332  result = tempA + result*arg.coeff[0];
333  arg.outA.save(reinterpret_cast<real*>(result.data), idx, dir, arg.parity);
334 
335  shift[dir] = 3;
336  const int third_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
337  if(third_nbr_idx >= 0){
338  arg.inB.load(z, third_nbr_idx);
339  outerProd(z, x, &result);
340  arg.outB.load(reinterpret_cast<real*>(tempB.data), idx, dir, arg.parity);
341  result = tempB + result*arg.coeff[1];
342  arg.outB.save(reinterpret_cast<real*>(result.data), idx, dir, arg.parity);
343  }
344  }
345  } // dir
346  idx += gridSize;
347  }
348  return;
349  } // interiorOprodKernel
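 // Annotation (not part of the original source): for each site of the given parity and each
 // direction dir, the kernel above accumulates coeff[0] times the outer product of inB at the
 // one-hop neighbor with inA at the site into outA, and coeff[1] times the outer product of
 // inB at the three-hop neighbor into outB (the one-hop and Naik-type three-hop terms).
 // Neighbors that fall off-node (neighborIndex returning -1) are left to exteriorOprodKernel
 // below.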
350 
351 
352 
353  template<typename Complex, typename Output, typename InputA, typename InputB>
354  __global__ void exteriorOprodKernel(StaggeredOprodArg<Complex, Output, InputA, InputB> arg)
355  {
356  unsigned int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
357  const unsigned int gridSize = gridDim.x*blockDim.x;
358 
359  Complex a[3];
360  Complex b[3];
361  Matrix<Complex,3> result;
362  Matrix<Complex,3> inmatrix; // input
363  typedef typename RealTypeId<Complex>::Type real;
364 
365 
366  Output& out = (arg.displacement == 1) ? arg.outA : arg.outB;
367  real coeff = (arg.displacement == 1) ? arg.coeff[0] : arg.coeff[1];
368 
369  int x[4];
370  while(cb_idx<arg.length){
371  coordsFromIndex<1>(x, cb_idx, arg.X, arg.dir, arg.displacement, arg.parity);
372  const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);
373 
374  out.load(reinterpret_cast<real*>(inmatrix.data), bulk_cb_idx, arg.dir, arg.parity);
375  arg.inA.load(a, bulk_cb_idx);
376 
377  const unsigned int ghost_idx = arg.ghostOffset + ghostIndexFromCoords<1,3>(x, arg.X, arg.dir, arg.displacement);
378  arg.inB.load(b, ghost_idx);
379 
380  outerProd(b,a,&result);
381  result = inmatrix + result*coeff;
382  out.save(reinterpret_cast<real*>(result.data), bulk_cb_idx, arg.dir, arg.parity);
383 
384  cb_idx += gridSize;
385  }
386  return;
387  }
388 
389 
390 
391  template<typename Complex, typename Output, typename InputA, typename InputB>
392  class StaggeredOprodField : public Tunable {
393 
394  private:
395  StaggeredOprodArg<Complex,Output,InputA,InputB> arg;
396  const GaugeField &meta;
397  QudaFieldLocation location; // location of the lattice fields
398 
399  unsigned int sharedBytesPerThread() const { return 0; }
400  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
401 
402  unsigned int minThreads() const { return arg.outA.volumeCB; }
403  bool tunedGridDim() const { return false; }
404 
405  public:
406  StaggeredOprodField(const StaggeredOprodArg<Complex,Output,InputA,InputB> &arg,
407  const GaugeField &meta, QudaFieldLocation location)
408  : arg(arg), meta(meta), location(location) {
409  writeAuxString("threads=%d,prec=%lu,stride=%d",arg.length,sizeof(Complex)/2,arg.inA.Stride());
410  // this sets the communications pattern for the packing kernel
411  int comms[QUDA_MAX_DIM] = { commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3) };
412  setPackComms(comms);
413  }
414 
415  virtual ~StaggeredOprodField() {}
416 
417  void set(const StaggeredOprodArg<Complex,Output,InputA,InputB> &arg, QudaFieldLocation location){
418  // This is a hack. Need to change this!
419  this->arg.dir = arg.dir;
420  this->arg.length = arg.length;
421  this->arg.ghostOffset = arg.ghostOffset;
422  this->arg.kernelType = arg.kernelType;
423  this->location = location;
424  } // set
425 
426  void apply(const cudaStream_t &stream){
427  if(location == QUDA_CUDA_FIELD_LOCATION){
428  // Disable tuning for the time being
429  TuneParam tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity());
430  //if(arg.kernelType == OPROD_INTERIOR_KERNEL){
431  interiorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
432  // dim3 blockDim(128, 1, 1);
433  // const int gridSize = (arg.length + (blockDim.x-1))/blockDim.x;
434  // dim3 gridDim(gridSize, 1, 1);
435  // interiorOprodKernel<<<gridDim,blockDim,0, stream>>>(arg);
436  // }else if(arg.kernelType == OPROD_EXTERIOR_KERNEL){
437  // const unsigned int volume = arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3];
438  // arg.inB.setStride(3*volume/(2*arg.X[arg.dir]));
439  // exteriorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
440  // arg.inB.setStride(arg.inA.Stride());
441  // }else{
442  // errorQuda("Kernel type not supported\n");
443  // }
444  }else{ // run the CPU code
445  errorQuda("No CPU support for staggered outer-product calculation\n");
446  }
447  } // apply
448 
449  void preTune(){
450  this->arg.outFieldA.backup();
451  this->arg.outFieldB.backup();
452  }
453  void postTune(){
454  this->arg.outFieldA.restore();
455  this->arg.outFieldB.restore();
456  }
457 
458  long long flops() const {
459  return 0; // fix this
460  }
461 
462  long long bytes() const {
463  return 0; // fix this
464  }
465 
466  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux);}
467  }; // StaggeredOprodField
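 // Annotation (not part of the original source): StaggeredOprodField wraps the kernel launch
 // in QUDA's Tunable interface. preTune()/postTune() back up and restore the two output gauge
 // fields so that repeated trial launches during autotuning (currently disabled above via
 // QUDA_TUNE_NO) would not corrupt the accumulated outer product.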
468 
469  template<typename Complex, typename Output, typename InputA, typename InputB>
470  void computeStaggeredOprodCuda(Output outA, Output outB, cudaGaugeField& outFieldA, cudaGaugeField& outFieldB, InputA& inA, InputB& inB, cudaColorSpinorField& src,
471  FaceBuffer& faceBuffer, const unsigned int parity, const int faceVolumeCB[4],
472  const unsigned int ghostOffset[4], const double coeff[2])
473  {
474 
475  cudaEventRecord(oprodStart, streams[Nstream-1]);
476 
477 
478  const int dim[4] = {src.X(0)*2, src.X(1), src.X(2), src.X(3)};
479  // Create the arguments for the interior kernel
480  StaggeredOprodArg<Complex,Output,InputA,InputB> arg(outA.volumeCB, dim, parity, 0, 0, 1, OPROD_INTERIOR_KERNEL, coeff, inA, inB, outA, outB, outFieldA,
481  outFieldB);
482 
483 
484  StaggeredOprodField<Complex,Output,InputA,InputB> oprod(arg, outFieldA, QUDA_CUDA_FIELD_LOCATION);
485 
486 #ifdef MULTI_GPU
487  bool pack=false;
488  for(int i=3; i>=0; i--){
489  if(commDimPartitioned(i) && (i!=3 || getKernelPackT())){
490  pack = true;
491  break;
492  }
493  } // i=3,..,0
494 
495  // source, dir(+/-1), parity, dagger, stream_ptr
496  // packing is all done in streams[Nstream-1]
497  // always call pack since this also sets the stream pointer even if not packing
498  //faceBuffer.pack(src, -1, 1-parity, 0, streams);
499  faceBuffer.pack(src, 1-parity, 0, streams); // FIXME work around since uni-direction packing is broken
500  if(pack){
501  cudaEventRecord(packEnd, streams[Nstream-1]);
502  }
503 
504  for(int i=3; i>=0; i--){
505  if(commDimPartitioned(i)){
506 
507  cudaEvent_t &event = (i!=3 || getKernelPackT()) ? packEnd : oprodStart;
508  cudaStreamWaitEvent(streams[2*i], event, 0); // wait in stream 2*i for event to complete
509 
510 
511  // Initialize the host transfer from the source spinor
512  faceBuffer.gather(src, false, 2*i);
513  // record the end of the gathering
514  cudaEventRecord(gatherEnd[i], streams[2*i]);
515  } // comDim(i)
516  } // i=3,..,0
517 #endif
518  oprod.apply(streams[Nstream-1]);
519 
520 /*
521  dim3 blockDim(128, 1, 1);
522  const int gridSize = (arg.length + (blockDim.x-1))/blockDim.x;
523  dim3 gridDim(gridSize, 1, 1);
524  interiorOprodKernel<<<gridDim,blockDim,0,streams[Nstream-1]>>>(arg);
525 
526 */
527 
528 
529 
530 #ifdef MULTI_GPU
531  // compute gather completed
532  int gatherCompleted[5];
533  int commsCompleted[5];
534  int oprodCompleted[4];
535 
536  for(int i=0; i<4; ++i){
537  gatherCompleted[i] = commsCompleted[i] = oprodCompleted[i] = 0;
538  }
539  gatherCompleted[4] = commsCompleted[4] = 1;
540 
541  // initialize commDimTotal
542  int commDimTotal = 0;
543  for(int i=0; i<4; ++i){
544  commDimTotal += commDimPartitioned(i);
545  }
546  commDimTotal *= 2;
547 
548  // initialize previousDir
549  int previousDir[4];
550  for(int i=3; i>=0; i--){
551  if(commDimPartitioned(i)){
552  int prev = 4;
553  for(int j=3; j>i; j--){
554  if(commDimPartitioned(j)){
555  prev = j;
556  }
557  }
558  previousDir[i] = prev;
559  }
560  } // set previous directions
561 
562 
563  if(commDimTotal){
564  arg.kernelType = OPROD_EXTERIOR_KERNEL;
565  unsigned int completeSum=0;
566  while(completeSum < commDimTotal){
567 
568  for(int i=3; i>=0; i--){
569  if(!commDimPartitioned(i)) continue;
570 
571  if(!gatherCompleted[i] && gatherCompleted[previousDir[i]]){
572  cudaError_t event_test = cudaEventQuery(gatherEnd[i]);
573 
574  if(event_test == cudaSuccess){
575  gatherCompleted[i] = 1;
576  completeSum++;
577  faceBuffer.commsStart(2*i);
578  }
579  }
580 
581  // Query if comms has finished
582  if(!commsCompleted[i] && commsCompleted[previousDir[i]] && gatherCompleted[i]){
583  int comms_test = faceBuffer.commsQuery(2*i);
584  if(comms_test){
585  commsCompleted[i] = 1;
586  completeSum++;
587  faceBuffer.scatter(src, false, 2*i);
588  }
589  }
590 
591  // enqueue the boundary oprod kernel as soon as the scatters have been enqueued
592  if(!oprodCompleted[i] && commsCompleted[i]){
593  cudaEventRecord(scatterEnd[i], streams[2*i]);
594  cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[i],0);
595 
596  arg.dir = i;
597  arg.ghostOffset = ghostOffset[i];
598  const unsigned int volume = arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3];
599  arg.inB.setStride(3*volume/(2*arg.X[arg.dir]));
600  // First, do the one hop term
601  {
602 
603  arg.length = faceVolumeCB[i];
604  arg.displacement = 1;
605  dim3 blockDim(128, 1, 1);
606  const int gridSize = (arg.length + (blockDim.x-1))/blockDim.x;
607  dim3 gridDim(gridSize, 1, 1);
608  exteriorOprodKernel<<<gridDim, blockDim, 0, streams[Nstream-1]>>>(arg);
609  }
610  // Now do the 3 hop term - Try putting this in a separate stream
611  {
612 
613  arg.displacement = 3;
614  arg.length = arg.displacement*faceVolumeCB[i];
615  dim3 blockDim(128, 1, 1);
616  const int gridSize = (arg.length + (blockDim.x-1))/blockDim.x;
617  dim3 gridDim(gridSize, 1, 1);
618  exteriorOprodKernel<<<gridDim, blockDim, 0, streams[Nstream-1]>>>(arg);
619  }
620  arg.inB.setStride(arg.inA.Stride());
621 
622  oprodCompleted[i] = 1;
623  }
624 
625  } // i=3,..,0
626  } // completeSum < commDimTotal
627  } // if commDimTotal
628 #endif
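 // Annotation (not part of the original source): the MULTI_GPU block above overlaps
 // communication with the interior kernel. For each partitioned dimension i it polls
 // gatherEnd[i]; when the gather has finished it starts the host exchange, polls
 // commsQuery(2*i), scatters the received ghost spinor, and makes streams[Nstream-1] wait on
 // scatterEnd[i] before launching the one-hop and three-hop exterior kernels for that face.
 // completeSum counts one gather and one comms completion per partitioned dimension, so the
 // loop terminates when completeSum == commDimTotal.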
629  } // computeStaggeredOprodCuda
630 
631 #endif // GPU_STAGGERED_OPROD
632 
633  // At the moment, I pass an instance of FaceBuffer in.
634  // Soon, faceBuffer will be subsumed into cudaColorSpinorField.
635  void computeStaggeredOprod(cudaGaugeField& outA, cudaGaugeField& outB,
636  cudaColorSpinorField& inEven,
637  cudaColorSpinorField& inOdd,
638  FaceBuffer& faceBuffer,
639  const unsigned int parity, const double coeff[2])
640  {
641 
642 #ifdef GPU_STAGGERED_OPROD
643 
644  if(outA.Order() != QUDA_FLOAT2_GAUGE_ORDER)
645  errorQuda("Unsupported output ordering: %d\n", outA.Order());
646 
647  if(outB.Order() != QUDA_FLOAT2_GAUGE_ORDER)
648  errorQuda("Unsupported output ordering: %d\n", outB.Order());
649 
650  unsigned int ghostOffset[4] = {0,0,0,0};
651 #ifdef MULTI_GPU
652  const unsigned int Npad = inEven.Ncolor()*inEven.Nspin()*2/inEven.FieldOrder();
653  for(int dir=0; dir<4; ++dir){
654  ghostOffset[dir] = Npad*(inEven.GhostOffset(dir) + inEven.Stride());
655  }
656 #endif
657 
658  if(inEven.Precision() != outA.Precision()) errorQuda("Mixed precision not supported: %d %d\n", inEven.Precision(), outA.Precision());
659 
660  cudaColorSpinorField& inA = (parity&1) ? inOdd : inEven;
661  cudaColorSpinorField& inB = (parity&1) ? inEven : inOdd;
662 
663 #if (__COMPUTE_CAPABILITY__ >= 200)
664  if(inEven.Precision() == QUDA_DOUBLE_PRECISION){
665 
668  computeStaggeredOprodCuda<double2>(FloatNOrder<double, 18, 2, 18>(outA), FloatNOrder<double, 18, 2, 18>(outB),
669  outA, outB,
670  spinorA, spinorB, inB, faceBuffer, parity, inB.GhostFace(), ghostOffset, coeff);
671  }else if(inEven.Precision() == QUDA_SINGLE_PRECISION){
672 
675  computeStaggeredOprodCuda<float2>(FloatNOrder<float, 18, 2, 18>(outA), FloatNOrder<float, 18, 2, 18>(outB),
676  outA, outB,
677  spinorA, spinorB, inB, faceBuffer, parity, inB.GhostFace(), ghostOffset, coeff);
678  } else {
679  errorQuda("Unsupported precision: %d\n", inEven.Precision());
680  }
681 #else
682  errorQuda("Staggered Outer Product not supported on pre-Fermi architecture");
683 #endif
684 
685 
686 #else // GPU_STAGGERED_OPROD not defined
687  errorQuda("Staggered Outer Product has not been built!");
688 #endif
689 
690  return;
691  } // computeStaggeredOprod
692 
693 
694 
695 } // namespace quda
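A usage sketch (not part of the file above): the fragment below shows how the entry points defined here might be driven from host code, assuming QUDA was built with GPU_STAGGERED_OPROD. The wrapper accumulateOprod, the field and FaceBuffer arguments, the unit coefficients, and the exact include list are illustrative assumptions rather than QUDA API; only computeStaggeredOprod, createStaggeredOprodEvents and destroyStaggeredOprodEvents come from this file. The two output gauge fields must use QUDA_FLOAT2_GAUGE_ORDER and match the source precision, as checked at the top of computeStaggeredOprod.

#include <quda_internal.h>
#include <gauge_field.h>
#include <color_spinor_field.h>
#include <face_quda.h>
#include <staggered_oprod.h>

using namespace quda;

// Hypothetical driver (illustrative names): accumulate the one-hop and three-hop staggered
// outer products of an even/odd source pair into two FLOAT2-ordered gauge fields.
void accumulateOprod(cudaGaugeField &oneHop, cudaGaugeField &threeHop,
                     cudaColorSpinorField &evenSrc, cudaColorSpinorField &oddSrc,
                     FaceBuffer &face)
{
  createStaggeredOprodEvents();        // create the CUDA events used to overlap comms/compute
  const double coeff[2] = {1.0, 1.0};  // weights of the one-hop and three-hop terms
  for (unsigned int parity = 0; parity < 2; ++parity)
    computeStaggeredOprod(oneHop, threeHop, evenSrc, oddSrc, face, parity, coeff);
  destroyStaggeredOprodEvents();       // release the events when done
}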