QUDA 1.0.0: clover_outer_product.cu
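// Outer-product contribution to the clover force: for each direction mu the
// kernels below accumulate force_mu += coeff * U_mu * (spin-traced outer
// product), pairing the spin-projected forward-neighbor spinors (B, D) with
// the local spinors (A, C).  Interior kernels handle the local volume;
// exterior kernels handle boundary faces using ghost-zone data exchanged
// between ranks.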
#include <cstdio>
#include <cstdlib>

#include <tune_quda.h>
#include <quda_internal.h>
#include <gauge_field_order.h>
#include <quda_matrix.h>
#include <color_spinor.h>
#include <dslash_quda.h>

namespace quda {

#ifdef GPU_CLOVER_DIRAC

  namespace { // anonymous
#include <texture.h>
  }

  template<int N>
  void createEventArray(cudaEvent_t (&event)[N], unsigned int flags=cudaEventDefault)
  {
    for (int i=0; i<N; ++i)
      cudaEventCreate(&event[i], flags);
    return;
  }

  template<int N>
  void destroyEventArray(cudaEvent_t (&event)[N])
  {
    for (int i=0; i<N; ++i)
      cudaEventDestroy(event[i]);
  }

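  // NB: of the events below only oprodStart is ever recorded (in
  // computeCloverForceCuda); the others are created and destroyed but never
  // used, consistent with the FIXME in computeCloverForce.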
  static cudaEvent_t packEnd;
  static cudaEvent_t gatherEnd[4];
  static cudaEvent_t scatterEnd[4];
  static cudaEvent_t oprodStart;
  static cudaEvent_t oprodEnd;


  void createCloverForceEvents(){
    cudaEventCreate(&packEnd, cudaEventDisableTiming);
    createEventArray(gatherEnd, cudaEventDisableTiming);
    createEventArray(scatterEnd, cudaEventDisableTiming);
    cudaEventCreate(&oprodStart, cudaEventDisableTiming);
    cudaEventCreate(&oprodEnd, cudaEventDisableTiming);
    return;
  }

  void destroyCloverForceEvents(){
    destroyEventArray(gatherEnd);
    destroyEventArray(scatterEnd);
    cudaEventDestroy(packEnd);
    cudaEventDestroy(oprodStart);
    cudaEventDestroy(oprodEnd);
    return;
  }

  enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };

  template<typename Float, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
  struct CloverForceArg {
    unsigned int length;
    int X[4];
    unsigned int parity;
    unsigned int dir;
    unsigned int ghostOffset[4];
    unsigned int displacement;
    OprodKernelType kernelType;
    bool partitioned[4];
    InputA inA;
    InputB inB_shift;
    InputC inC;
    InputD inD_shift;
    Gauge gauge;
    Output force;
    Float coeff;

    CloverForceArg(const unsigned int parity, const unsigned int dir, const unsigned int *ghostOffset,
                   const unsigned int displacement, const OprodKernelType kernelType, const double coeff, InputA &inA,
                   InputB &inB_shift, InputC &inC, InputD &inD_shift, Gauge &gauge, Output &force, GaugeField &meta) :
      length(meta.VolumeCB()),
      parity(parity),
      dir(5), // sentinel: overwritten (arg.dir = i) before each exterior-kernel launch
      displacement(displacement),
      kernelType(kernelType),
      coeff(coeff),
      inA(inA),
      inB_shift(inB_shift),
      inC(inC),
      inD_shift(inD_shift),
      gauge(gauge),
      force(force)
    {
      for (int i=0; i<4; ++i) this->X[i] = meta.X()[i];
      for (int i=0; i<4; ++i) this->ghostOffset[i] = ghostOffset[i];
      for (int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };

  enum IndexType {
    EVEN_X = 0,
    EVEN_Y = 1,
    EVEN_Z = 2,
    EVEN_T = 3
  };

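  // Reconstruct the full linear index and 4-d coordinates from a checkerboard
  // index.  The IndexType template parameter encodes which lattice extent is
  // known to be even, which determines how cheaply the parity offset can be
  // folded into the division chain (EVEN_X needs the fewest divisions).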
  template <IndexType idxType>
  __device__ inline void coordsFromIndex(
      int &idx, int c[4], const unsigned int cb_idx, const unsigned int parity, const int X[4])
  {
    const int &LX = X[0];
    const int &LY = X[1];
    const int &LZ = X[2];
    const int XYZ = X[2] * X[1] * X[0];
    const int XY = X[1] * X[0];

    idx = 2 * cb_idx;

    int x, y, z, t;

    if (idxType == EVEN_X) { // X even
      // t = idx / XYZ;
      // z = (idx / XY) % LZ;
      // y = (idx / LX) % LY;
      // idx += (parity + t + z + y) & 1;
      // x = idx % LX;
      // equivalent to the above, but with fewer divisions/mods:
      int aux1 = idx / LX;
      x = idx - aux1 * LX;
      int aux2 = aux1 / LY;
      y = aux1 - aux2 * LY;
      t = aux2 / LZ;
      z = aux2 - t * LZ;
      aux1 = (parity + t + z + y) & 1;
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y) { // Y even
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      idx += (parity + t + z) & 1;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else if (idxType == EVEN_Z) { // Z even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else {
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    }

    c[0] = x;
    c[1] = y;
    c[2] = z;
    c[3] = t;
  }

  // Get the coordinates for the exterior kernels
  __device__ inline void coordsFromIndexExterior(int x[4], const unsigned int cb_idx, const int X[4],
      const unsigned int dir, const int displacement, const unsigned int parity)
  {
    int Xh[2] = {X[0] / 2, X[1] / 2};
    switch (dir) {
    case 0:
      x[2] = cb_idx / Xh[1] % X[2];
      x[3] = cb_idx / (Xh[1] * X[2]) % X[3];
      x[0] = cb_idx / (Xh[1] * X[2] * X[3]);
      x[0] += (X[0] - displacement);
      x[1] = 2 * (cb_idx % Xh[1]) + ((x[0] + x[2] + x[3] + parity) & 1);
      break;

    case 1:
      x[2] = cb_idx / Xh[0] % X[2];
      x[3] = cb_idx / (Xh[0] * X[2]) % X[3];
      x[1] = cb_idx / (Xh[0] * X[2] * X[3]);
      x[1] += (X[1] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;

    case 2:
      x[1] = cb_idx / Xh[0] % X[1];
      x[3] = cb_idx / (Xh[0] * X[1]) % X[3];
      x[2] = cb_idx / (Xh[0] * X[1] * X[3]);
      x[2] += (X[2] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;

    case 3:
      x[1] = cb_idx / Xh[0] % X[1];
      x[2] = cb_idx / (Xh[0] * X[1]) % X[2];
      x[3] = cb_idx / (Xh[0] * X[1] * X[2]);
      x[3] += (X[3] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    }
    return;
  }

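  // Checkerboard index of the site displaced by shift[]; returns -1 when the
  // neighbor falls off this rank's local volume in a partitioned dimension
  // (such sites are handled by the corresponding exterior kernel instead).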
  __device__ __forceinline__
  int neighborIndex(const unsigned int cb_idx, const int shift[4], const bool partitioned[4], const unsigned int parity, const int X[4]){
    int full_idx;
    int x[4];
    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    for (int dim=0; dim<4; ++dim) {
      if (partitioned[dim])
        if ((x[dim]+shift[dim]) < 0 || (x[dim]+shift[dim]) >= X[dim]) return -1;
    }

    for (int dim=0; dim<4; ++dim) {
      x[dim] = shift[dim] ? (x[dim]+shift[dim] + X[dim]) % X[dim] : x[dim];
    }
    return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
  }


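  // Interior kernel: for each site and each direction dim, load A and C at the
  // site and B and D at the forward neighbor, spin project/reconstruct the
  // shifted spinors, and accumulate the spin-traced color outer products into
  // the force: force_dim += coeff * U_dim * (oprod(B_shift, A) + oprod(D_shift, C)).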
  template<typename real, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
  __global__ void interiorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD> arg) {
    typedef complex<real> Complex;
    int idx = blockIdx.x*blockDim.x + threadIdx.x;

    ColorSpinor<real,3,4> A, B_shift, C, D_shift;
    Matrix<Complex,3> U, result, temp;

    while (idx < arg.length) {
      arg.inA.load(static_cast<Complex*>(A.data), idx);
      arg.inC.load(static_cast<Complex*>(C.data), idx);

#pragma unroll
      for (int dim=0; dim<4; ++dim) {
        int shift[4] = {0,0,0,0};
        shift[dim] = 1;
        const int nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);

        if (nbr_idx >= 0) {
          arg.inB_shift.load(static_cast<Complex*>(B_shift.data), nbr_idx);
          arg.inD_shift.load(static_cast<Complex*>(D_shift.data), nbr_idx);

          B_shift = (B_shift.project(dim,1)).reconstruct(dim,1);
          result = outerProdSpinTrace(B_shift,A);

          D_shift = (D_shift.project(dim,-1)).reconstruct(dim,-1);
          result += outerProdSpinTrace(D_shift,C);

          temp = arg.force(dim, idx, arg.parity);
          U = arg.gauge(dim, idx, arg.parity);
          result = temp + U*result*arg.coeff;
          arg.force(dim, idx, arg.parity) = result;
        }
      } // dim

      idx += gridDim.x*blockDim.x;
    }
    return;
  } // interiorOprodKernel

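  // Exterior kernel: the same update as the interior kernel, restricted to the
  // boundary slab of dimension dim; the shifted spinors are read (already
  // spin-projected) from the ghost buffers filled by exchangeGhost() rather
  // than from the local volume.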
  template<int dim, typename real, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
  __global__ void exteriorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD> arg) {
    typedef complex<real> Complex;
    int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;

    ColorSpinor<real,3,4> A, B_shift, C, D_shift;
    ColorSpinor<real,3,2> projected_tmp;
    Matrix<Complex,3> U, result, temp;

    int x[4];
    while (cb_idx < arg.length) {
      coordsFromIndexExterior(x, cb_idx, arg.X, dim, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);
      arg.inA.load(static_cast<Complex*>(A.data), bulk_cb_idx);
      arg.inC.load(static_cast<Complex*>(C.data), bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;

      arg.inB_shift.loadGhost(static_cast<Complex*>(projected_tmp.data), ghost_idx, dim);
      B_shift = projected_tmp.reconstruct(dim, 1);
      result = outerProdSpinTrace(B_shift,A);

      arg.inD_shift.loadGhost(static_cast<Complex*>(projected_tmp.data), ghost_idx, dim);
      D_shift = projected_tmp.reconstruct(dim,-1);
      result += outerProdSpinTrace(D_shift,C);

      temp = arg.force(dim, bulk_cb_idx, arg.parity);
      U = arg.gauge(dim, bulk_cb_idx, arg.parity);
      result = temp + U*result*arg.coeff;
      arg.force(dim, bulk_cb_idx, arg.parity) = result;

      cb_idx += gridDim.x*blockDim.x;
    }
    return;
  } // exteriorOprodKernel

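  // Tunable launcher: autotunes the block size for the interior/exterior
  // kernels, saving and restoring the force field around tuning so repeated
  // trial launches do not corrupt the accumulated result.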
  template<typename Float, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
  class CloverForce : public Tunable {

  private:
    CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> &arg;
    const GaugeField &meta;
    QudaFieldLocation location; // location of the lattice fields

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.length; }
    bool tuneGridDim() const { return false; }

  public:
    CloverForce(CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> &arg,
                const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("prec=%lu,stride=%d", sizeof(Float), arg.inA.Stride());
      // this sets the communications pattern for the packing kernel
      int comms[QUDA_MAX_DIM] = { commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3) };
      setPackComms(comms);
    }

    virtual ~CloverForce() {}

    void apply(const cudaStream_t &stream){
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        // tune (if enabled) and launch the appropriate kernel
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

        if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
          interiorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
        } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
          if (arg.dir == 0) exteriorOprodKernel<0><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
        } else {
          errorQuda("Kernel type not supported");
        }
      } else { // run the CPU code
        errorQuda("No CPU support for the clover outer-product calculation");
      }
    } // apply

348 
349  void preTune(){
350  this->arg.force.save();
351  }
352  void postTune(){
353  this->arg.force.load();
354  }
355 
356  long long flops() const {
357  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
358  return ((long long)arg.length)*4*(24 + 144 + 234); // spin project + spin trace + multiply-add
359  } else {
360  return ((long long)arg.length)*(144 + 234); // spin trace + multiply-add
361  }
362  }
363  long long bytes() const {
364  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
365  return ((long long)arg.length)*(arg.inA.Bytes() + arg.inC.Bytes() + 4*(arg.inB_shift.Bytes() + arg.inD_shift.Bytes() + 2*arg.force.Bytes() + arg.gauge.Bytes()));
366  } else {
367  return ((long long)arg.length)*(arg.inA.Bytes() + arg.inB_shift.Bytes()/2 + arg.inC.Bytes() + arg.inD_shift.Bytes()/2 + 2*arg.force.Bytes() + arg.gauge.Bytes());
368  }
369  }
370 
371  TuneKey tuneKey() const {
372  char new_aux[TuneKey::aux_n];
373  strcpy(new_aux, aux);
374  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
375  strcat(new_aux, ",interior");
376  } else {
377  strcat(new_aux, ",exterior");
378  if (arg.dir==0) strcat(new_aux, ",dir=0");
379  else if (arg.dir==1) strcat(new_aux, ",dir=1");
380  else if (arg.dir==2) strcat(new_aux, ",dir=2");
381  else if (arg.dir==3) strcat(new_aux, ",dir=3");
382  }
383  return TuneKey(meta.VolString(), "CloverForce", new_aux);
384  }
385  }; // CloverForce
386 
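  // Halo exchange for one parity of a spinor field: pack the face buffers on
  // the device, gather them to the communication buffers, start and complete
  // the inter-rank transfers, then scatter the received faces into the ghost
  // zone.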
  void exchangeGhost(cudaColorSpinorField &a, int parity, int dag) {
    // need to enable packing in temporal direction to get spin-projector correct
    pushKernelPackT(true);

    // first transfer src1
    qudaDeviceSynchronize();

    MemoryLocation location[2*QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
    a.pack(1, 1-parity, dag, Nstream-1, location, Device);

    qudaDeviceSynchronize();

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        // Initialize the host transfer from the source spinor
        a.gather(1, dag, 2*i);
      } // commDim(i)
    } // i=3,..,0

    qudaDeviceSynchronize(); comm_barrier();

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        a.commsStart(1, 2*i, dag);
      }
    }

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        a.commsWait(1, 2*i, dag);
        a.scatter(1, dag, 2*i);
      }
    }

    qudaDeviceSynchronize();
    popKernelPackT(); // restore packing state

    a.bufferIndex = (1 - a.bufferIndex);
    comm_barrier();
  }

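  // Launch driver: records the oprodStart event, computes the forward ghost
  // offsets, runs the interior kernel over the full checkerboarded volume, and
  // then one exterior kernel per partitioned dimension over that dimension's
  // face volume.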
  template<typename Float, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
  void computeCloverForceCuda(Output force, Gauge gauge, GaugeField& out,
                              InputA& inA, InputB& inB, InputC &inC, InputD &inD,
                              cudaColorSpinorField& src1, cudaColorSpinorField& src2,
                              const unsigned int parity, const int faceVolumeCB[4], const double coeff)
  {
    qudaEventRecord(oprodStart, streams[Nstream-1]);

    unsigned int ghostOffset[4] = {0,0,0,0};
    for (int dir=0; dir<4; ++dir) {
      if (src1.GhostOffset(dir) != src2.GhostOffset(dir))
        errorQuda("Mismatched ghost offset[%d] %d != %d", dir, src1.GhostOffset(dir), src2.GhostOffset(dir));
      ghostOffset[dir] = (src1.GhostOffset(dir,1))/src1.FieldOrder(); // offset we want is the forwards one
    }

    // Create the arguments for the interior kernel
    CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, coeff, inA, inB, inC, inD, gauge, force, out);
    CloverForce<Float,Output,Gauge,InputA,InputB,InputC,InputD> oprod(arg, out, QUDA_CUDA_FIELD_LOCATION);

    arg.kernelType = OPROD_INTERIOR_KERNEL;
    arg.length = src1.VolumeCB();
    oprod.apply(0);

    for (int i=3; i>=0; i--) {
      if (commDimPartitioned(i)) {
        // update parameters for this exterior kernel
        arg.kernelType = OPROD_EXTERIOR_KERNEL;
        arg.dir = i;
        arg.length = faceVolumeCB[i];
        arg.displacement = 1; // forwards displacement
        oprod.apply(0);
      }
    } // i=3,..,0
  } // computeCloverForceCuda

#endif // GPU_CLOVER_DIRAC

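  // Usage sketch (hypothetical caller, not taken from this file): the force
  // computation pairs each solution vector x[i] with its conjugate vector p[i]
  // and a per-field weight coeff[i]:
  //
  //   std::vector<ColorSpinorField*> x = {...}, p = {...};
  //   std::vector<double> coeff(x.size(), 1.0);
  //   computeCloverForce(force, U, x, p, coeff);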
  void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField *> &x,
                          std::vector<ColorSpinorField *> &p, std::vector<double> &coeff)
  {

#ifdef GPU_CLOVER_DIRAC
    if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d", force.Order());

    if (x[0]->Precision() != force.Precision())
      errorQuda("Mixed precision not supported: %d %d", x[0]->Precision(), force.Precision());

    createCloverForceEvents(); // FIXME not actually used

    int dag = 1;

    for (unsigned int i=0; i<x.size(); i++) {
      static_cast<cudaColorSpinorField&>(x[i]->Even()).allocateGhostBuffer(1);
      static_cast<cudaColorSpinorField&>(x[i]->Odd()).allocateGhostBuffer(1);
      static_cast<cudaColorSpinorField&>(p[i]->Even()).allocateGhostBuffer(1);
      static_cast<cudaColorSpinorField&>(p[i]->Odd()).allocateGhostBuffer(1);

      for (int parity=0; parity<2; parity++) {

        ColorSpinorField& inA = (parity&1) ? p[i]->Odd() : p[i]->Even();
        ColorSpinorField& inB = (parity&1) ? x[i]->Even(): x[i]->Odd();
        ColorSpinorField& inC = (parity&1) ? x[i]->Odd() : x[i]->Even();
        ColorSpinorField& inD = (parity&1) ? p[i]->Even(): p[i]->Odd();

        if (x[0]->Precision() == QUDA_DOUBLE_PRECISION) {
          Spinor<double2, double2, 12, 0> spinorA(inA);

          Spinor<double2, double2, 12, 0> spinorB(inB);
          exchangeGhost(static_cast<cudaColorSpinorField&>(inB), parity, dag);

          Spinor<double2, double2, 12, 0> spinorC(inC);

          Spinor<double2, double2, 12, 0> spinorD(inD);
          exchangeGhost(static_cast<cudaColorSpinorField&>(inD), parity, 1-dag);

          if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
            computeCloverForceCuda<double>(gauge::FloatNOrder<double, 18, 2, 18>(force),
                                           gauge::FloatNOrder<double, 18, 2, 18>(U),
                                           force, spinorA, spinorB, spinorC, spinorD,
                                           static_cast<cudaColorSpinorField&>(inB),
                                           static_cast<cudaColorSpinorField&>(inD),
                                           parity, inB.GhostFace(), coeff[i]);
          } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
            computeCloverForceCuda<double>(gauge::FloatNOrder<double, 18, 2, 18>(force),
                                           gauge::FloatNOrder<double, 18, 2, 12>(U),
                                           force, spinorA, spinorB, spinorC, spinorD,
                                           static_cast<cudaColorSpinorField&>(inB),
                                           static_cast<cudaColorSpinorField&>(inD),
                                           parity, inB.GhostFace(), coeff[i]);
          } else {
            errorQuda("Unsupported reconstruction type");
          }
#if 0 // FIXME - Spinor class expects a float4 pointer, not a Complex pointer
        } else if (x[0]->Precision() == QUDA_SINGLE_PRECISION) {
#endif
        } else {
          errorQuda("Unsupported precision: %d", x[0]->Precision());
        }
      }
    }

    destroyCloverForceEvents(); // not actually used

#else // GPU_CLOVER_DIRAC not defined
    errorQuda("Clover Dirac operator has not been built!");
#endif

    checkCudaError();
    return;
  } // computeCloverForce



} // namespace quda