QUDA  0.9.0
clover_outer_product.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 
4 #include <tune_quda.h>
5 #include <quda_internal.h>
6 #include <gauge_field_order.h>
7 #include <quda_matrix.h>
8 #include <color_spinor.h>
9 #include <dslash_quda.h>
10 
11 namespace quda {
12 
13 #ifdef GPU_CLOVER_DIRAC
14 
15  namespace { // anonymous
16 #include <texture.h>
17  }
18 
/**
 * Create an array of N CUDA events with the given creation flags.
 *
 * @param event  Fixed-size array of events to initialize.
 * @param flags  Event creation flags (default cudaEventDefault); the callers
 *               in this file pass cudaEventDisableTiming.
 *
 * NOTE: return codes are not checked, matching the file's existing style;
 * failures surface via the next checkCudaError().
 */
template<int N>
void createEventArray(cudaEvent_t (&event)[N], unsigned int flags=cudaEventDefault)
{
  // cudaEventCreateWithFlags is the documented API for flagged creation; the
  // two-argument cudaEventCreate overload is a deprecated alias for it.
  for (int i=0; i<N; ++i)
    cudaEventCreateWithFlags(&event[i], flags);
}
26 
/**
 * Destroy every CUDA event in a fixed-size array previously initialized by
 * createEventArray().  Return codes are not checked (file convention).
 */
template<int N>
void destroyEventArray(cudaEvent_t (&event)[N])
{
  for (auto &e : event) cudaEventDestroy(e);
}
33 
34 
 // File-scope CUDA events for ordering the pack/gather/scatter communication
 // pipeline and the outer-product kernels.
 // NOTE(review): in the visible code only oprodStart is ever recorded (in
 // computeCloverForceCuda); packEnd/gatherEnd/scatterEnd/oprodEnd are only
 // created and destroyed here, which matches the "FIXME not actually used"
 // remark at the createCloverForceEvents() call site — confirm before removing.
 35  static cudaEvent_t packEnd;
 36  static cudaEvent_t gatherEnd[4];
 37  static cudaEvent_t scatterEnd[4];
 38  static cudaEvent_t oprodStart;
 39  static cudaEvent_t oprodEnd;
40 
41 
/**
 * Allocate the file-scope CUDA events used by the clover-force code path.
 * All events are created with timing disabled, since they are used only for
 * ordering (no cudaEventElapsedTime is taken on them in this file).
 */
void createCloverForceEvents(){
  // Use the documented cudaEventCreateWithFlags entry point; the two-argument
  // cudaEventCreate overload is a deprecated alias.
  cudaEventCreateWithFlags(&packEnd, cudaEventDisableTiming);
  createEventArray(gatherEnd, cudaEventDisableTiming);
  createEventArray(scatterEnd, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&oprodStart, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&oprodEnd, cudaEventDisableTiming);
}
50 
/**
 * Release every CUDA event allocated by createCloverForceEvents().
 */
void destroyCloverForceEvents(){
  destroyEventArray(gatherEnd);
  destroyEventArray(scatterEnd);
  cudaEvent_t singles[] = {packEnd, oprodStart, oprodEnd};
  for (auto &e : singles) cudaEventDestroy(e);
}
59 
 // Distinguishes the bulk kernel (interior sites, neighbors resolved locally)
 // from the boundary kernels (sites whose forward neighbor lives in a ghost
 // zone received from another partition).
 60  enum KernelType {OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL};
61 
 62  template<typename Float, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
 // Kernel-argument bundle for the clover-force outer-product kernels.
 // Carries the four input spinor accessors (two of them read shifted/ghost
 // data), the gauge accessor, the force (output) accessor, lattice geometry,
 // and the scalar coefficient.  length/dir/kernelType/displacement are
 // mutated by the host between kernel launches (see computeCloverForceCuda).
 63  struct CloverForceArg {
 // number of checkerboard sites the current launch iterates over
 64  unsigned int length;
 // local lattice dimensions
 65  int X[4];
 66  unsigned int parity;
 // face direction for exterior kernels; reassigned per launch
 67  unsigned int dir;
 // per-dimension offset into the ghost buffer (forwards face)
 68  unsigned int ghostOffset[4];
 69  unsigned int displacement;
 70  KernelType kernelType;
 // true where the communication grid is partitioned in that dimension
 71  bool partitioned[4];
 72  InputA inA;
 73  InputB inB_shift;
 74  InputC inC;
 75  InputD inD_shift;
 76  Gauge gauge;
 77  Output force;
 78  Float coeff;
 79 
 80  CloverForceArg(const unsigned int parity,
 81  const unsigned int dir,
 82  const unsigned int *ghostOffset,
 83  const unsigned int displacement,
 84  const KernelType kernelType,
 85  const double coeff,
 86  InputA& inA,
 87  InputB& inB_shift,
 88  InputC& inC,
 89  InputD& inD_shift,
 90  Gauge& gauge,
 91  Output& force,
 // NOTE(review): the 'dir' constructor argument is ignored and the member is
 // hard-coded to 5 — presumably a "no direction" sentinel, since dir is
 // reassigned before every exterior-kernel launch.  Confirm intent.
 92  GaugeField &meta) : length(meta.VolumeCB()), parity(parity), dir(5),
 93  displacement(displacement), kernelType(kernelType),
 94  coeff(coeff), inA(inA), inB_shift(inB_shift),
 95  inC(inC), inD_shift(inD_shift),
 96  gauge(gauge), force(force)
 97  {
 98  for(int i=0; i<4; ++i) this->X[i] = meta.X()[i];
 99  for(int i=0; i<4; ++i) this->ghostOffset[i] = ghostOffset[i];
 100  for(int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
 101  }
 102  };
103 
 // Selects which coordinate absorbs the checkerboard parity offset when
 // reconstructing full lattice coordinates in coordsFromIndex<> below.
 104  enum IndexType {
 105  EVEN_X = 0,
 106  EVEN_Y = 1,
 107  EVEN_Z = 2,
 108  EVEN_T = 3
 109  };
110 
 // Reconstruct the full lattice coordinates c[0..3] = (x,y,z,t) and the full
 // linear index idx from a checkerboard index cb_idx and parity.  The template
 // parameter chooses which coordinate the parity offset is folded into; only
 // the EVEN_X path uses the division-reduced formulation.
 111  template <IndexType idxType>
 112  static __device__ __forceinline__ void coordsFromIndex(int& idx, int c[4],
 113  const unsigned int cb_idx, const unsigned int parity, const int X[4])
 114  {
 115  const int &LX = X[0];
 116  const int &LY = X[1];
 117  const int &LZ = X[2];
 118  const int XYZ = X[2]*X[1]*X[0];
 119  const int XY = X[1]*X[0];
 120 
 // cb_idx indexes half the sites; start from the even-site linear index
 121  idx = 2*cb_idx;
 122 
 123  int x, y, z, t;
 124 
 125  if (idxType == EVEN_X ) { // X even
 126  // t = idx / XYZ;
 127  // z = (idx / XY) % Z;
 128  // y = (idx / X) % Y;
 129  // idx += (parity + t + z + y) & 1;
 130  // x = idx % X;
 131  // equivalent to the above, but with fewer divisions/mods:
 132  int aux1 = idx / LX;
 133  x = idx - aux1 * LX;
 134  int aux2 = aux1 / LY;
 135  y = aux1 - aux2 * LY;
 136  t = aux2 / LZ;
 137  z = aux2 - t * LZ;
 138  aux1 = (parity + t + z + y) & 1;
 139  x += aux1;
 140  idx += aux1;
 141  } else if (idxType == EVEN_Y ) { // Y even
 142  t = idx / XYZ;
 143  z = (idx / XY) % LZ;
 144  idx += (parity + t + z) & 1;
 145  y = (idx / LX) % LY;
 146  x = idx % LX;
 147  } else if (idxType == EVEN_Z ) { // Z even
 148  t = idx / XYZ;
 149  idx += (parity + t) & 1;
 150  z = (idx / XY) % LZ;
 151  y = (idx / LX) % LY;
 152  x = idx % LX;
 153  } else {
 // EVEN_T: parity offset applied directly to the linear index
 154  idx += parity;
 155  t = idx / XYZ;
 156  z = (idx / XY) % LZ;
 157  y = (idx / LX) % LY;
 158  x = idx % LX;
 159  }
 160 
 161  c[0] = x;
 162  c[1] = y;
 163  c[2] = z;
 164  c[3] = t;
 165  }
166 
167 
168 
 169  // Get the coordinates for the exterior kernels
 // Maps an exterior-kernel thread index cb_idx onto the full coordinates x[]
 // of a site on the forwards face in direction 'dir'; the face coordinate is
 // offset into the last 'displacement' slices.  The first coordinate listed in
 // each case is reconstructed from the checkerboard parity.
 // NOTE(review): case 0 divides by Xh[1] (the Y half-extent) while the other
 // cases divide by Xh[0] — consistent with x[1] being the parity-reconstructed
 // coordinate on the X face, but confirm against the packing convention.
 170  __device__ static void coordsFromIndex(int x[4], const unsigned int cb_idx, const int X[4], const unsigned int dir, const int displacement, const unsigned int parity)
 171  {
 172  int Xh[2] = {X[0]/2, X[1]/2};
 173  switch(dir){
 174  case 0:
 175  x[2] = cb_idx/Xh[1] % X[2];
 176  x[3] = cb_idx/(Xh[1]*X[2]) % X[3];
 177  x[0] = cb_idx/(Xh[1]*X[2]*X[3]);
 178  x[0] += (X[0] - displacement);
 179  x[1] = 2*(cb_idx % Xh[1]) + ((x[0]+x[2]+x[3]+parity)&1);
 180  break;
 181 
 182  case 1:
 183  x[2] = cb_idx/Xh[0] % X[2];
 184  x[3] = cb_idx/(Xh[0]*X[2]) % X[3];
 185  x[1] = cb_idx/(Xh[0]*X[2]*X[3]);
 186  x[1] += (X[1] - displacement);
 187  x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
 188  break;
 189 
 190  case 2:
 191  x[1] = cb_idx/Xh[0] % X[1];
 192  x[3] = cb_idx/(Xh[0]*X[1]) % X[3];
 193  x[2] = cb_idx/(Xh[0]*X[1]*X[3]);
 194  x[2] += (X[2] - displacement);
 195  x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
 196  break;
 197 
 198  case 3:
 199  x[1] = cb_idx/Xh[0] % X[1];
 200  x[2] = cb_idx/(Xh[0]*X[1]) % X[2];
 201  x[3] = cb_idx/(Xh[0]*X[1]*X[2]);
 202  x[3] += (X[3] - displacement);
 203  x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
 204  break;
 205  }
 206  return;
 207  }
208 
209 
/**
 * Checkerboard index of the site displaced from cb_idx by shift[].
 *
 * Returns -1 when the displaced site would cross a boundary in a partitioned
 * dimension (that neighbor lives in a ghost zone and is handled by the
 * exterior kernels); otherwise the shift wraps periodically and the
 * half-lattice index of the neighbor is returned.
 */
__device__ __forceinline__
int neighborIndex(const unsigned int cb_idx, const int shift[4], const bool partitioned[4], const unsigned int parity, const int X[4]){
  int full_idx;
  int coord[4];
  coordsFromIndex<EVEN_X>(full_idx, coord, cb_idx, parity, X);

  for (int d = 0; d < 4; ++d) {
    const int pos = coord[d] + shift[d];
    // a shifted site outside a partitioned dimension is not ours
    if (partitioned[d] && (pos < 0 || pos >= X[d])) return -1;
    if (shift[d]) coord[d] = (pos + X[d]) % X[d];  // periodic wrap
  }

  // fold full coordinates back to a checkerboard (half-lattice) index
  return (((coord[3]*X[2] + coord[2])*X[1] + coord[1])*X[0] + coord[0]) >> 1;
}
226 
227 
228 
 // Bulk kernel: for every interior checkerboard site, accumulate into the
 // force field  F_mu += coeff * U_mu * Tr_spin( B(x+mu) A^+ + D(x+mu) C^+ )
 // using a grid-stride loop over arg.length sites.  Sites whose forward
 // neighbor falls outside a partitioned dimension are skipped here and
 // handled by exteriorOprodKernel.
 229  template<typename real, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
 230  __global__ void interiorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD> arg) {
 231  typedef complex<real> Complex;
 232  int idx = blockIdx.x*blockDim.x + threadIdx.x;
 233 
 234  ColorSpinor<real,3,4> A, B_shift, C, D_shift;
 235  Matrix<Complex,3> U, result, temp;
 236 
 // grid-stride loop: correctness is independent of the launch configuration
 237  while (idx<arg.length){
 238  arg.inA.load(static_cast<Complex*>(A.data), idx);
 239  arg.inC.load(static_cast<Complex*>(C.data), idx);
 240 
 241 #pragma unroll
 242  for(int dim=0; dim<4; ++dim){
 243  int shift[4] = {0,0,0,0};
 244  shift[dim] = 1;
 245  const int nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
 246 
 // nbr_idx < 0 means the neighbor is in a ghost zone; exterior kernel's job
 247  if(nbr_idx >= 0){
 248  arg.inB_shift.load(static_cast<Complex*>(B_shift.data), nbr_idx);
 249  arg.inD_shift.load(static_cast<Complex*>(D_shift.data), nbr_idx);
 250 
 // project to two spins then reconstruct: keeps only the +dim projection of B
 251  B_shift = (B_shift.project(dim,1)).reconstruct(dim,1);
 252  result = outerProdSpinTrace(B_shift,A);
 253 
 254  D_shift = (D_shift.project(dim,-1)).reconstruct(dim,-1);
 255  result += outerProdSpinTrace(D_shift,C);
 256 
 // read-modify-write accumulation into the force field
 257  arg.force.load(reinterpret_cast<real*>(temp.data), idx, dim, arg.parity);
 258  arg.gauge.load(reinterpret_cast<real*>(U.data), idx, dim, arg.parity);
 259  result = temp + U*result*arg.coeff;
 260  arg.force.save(reinterpret_cast<real*>(result.data), idx, dim, arg.parity);
 261  }
 262  } // dim
 263 
 264  idx += gridDim.x*blockDim.x;
 265  }
 266  return;
 267  } // interiorOprodKernel
268 
 // Boundary kernel for face 'dim' (compile-time): completes the accumulation
 // skipped by interiorOprodKernel, reading the projected neighbor spinors from
 // the ghost buffers received over the network.  Threads iterate over
 // arg.length face sites with a grid-stride loop.
 269  template<int dim, typename real, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
 270  __global__ void exteriorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD> arg) {
 271  typedef complex<real> Complex;
 272  int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
 273 
 274  ColorSpinor<real,3,4> A, B_shift, C, D_shift;
 // ghost data is stored spin-projected (2 spins), hence the separate temp
 275  ColorSpinor<real,3,2> projected_tmp;
 276  Matrix<Complex,3> U, result, temp;
 277 
 278  int x[4];
 279  while (cb_idx<arg.length){
 // map face-thread index -> full coordinates -> bulk checkerboard index
 280  coordsFromIndex(x, cb_idx, arg.X, dim, arg.displacement, arg.parity);
 281  const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);
 282  arg.inA.load(static_cast<Complex*>(A.data), bulk_cb_idx);
 283  arg.inC.load(static_cast<Complex*>(C.data), bulk_cb_idx);
 284 
 285  const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;
 286 
 287  arg.inB_shift.loadGhost(static_cast<Complex*>(projected_tmp.data), ghost_idx, dim);
 288  B_shift = projected_tmp.reconstruct(dim, 1);
 289  result = outerProdSpinTrace(B_shift,A);
 290 
 291  arg.inD_shift.loadGhost(static_cast<Complex*>(projected_tmp.data), ghost_idx, dim);
 292  D_shift = projected_tmp.reconstruct(dim,-1);
 293  result += outerProdSpinTrace(D_shift,C);
 294 
 // same read-modify-write accumulation as the interior kernel
 295  arg.force.load(reinterpret_cast<real*>(temp.data), bulk_cb_idx, dim, arg.parity);
 296  arg.gauge.load(reinterpret_cast<real*>(U.data), bulk_cb_idx, dim, arg.parity);
 297  result = temp + U*result*arg.coeff;
 298  arg.force.save(reinterpret_cast<real*>(result.data), bulk_cb_idx, dim, arg.parity);
 299 
 300  cb_idx += gridDim.x*blockDim.x;
 301  }
 302  return;
 303  } // exteriorOprodKernel
304 
 // Tunable launcher wrapping the interior/exterior outer-product kernels.
 // The host mutates the shared CloverForceArg (kernelType, dir, length)
 // between apply() calls; tuneKey() folds those settings into the tuning key
 // so each configuration is tuned separately.
 305  template<typename Float, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
 306  class CloverForce : public Tunable {
 307 
 308  private:
 309  CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> &arg;
 310  const GaugeField &meta;
 311  QudaFieldLocation location; // location of the lattice fields
 312 
 // no shared memory is used by either kernel
 313  unsigned int sharedBytesPerThread() const { return 0; }
 314  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
 315 
 316  unsigned int minThreads() const { return arg.length; }
 317  bool tuneGridDim() const { return false; }
 318 
 319  public:
 320  CloverForce(CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> &arg,
 321  const GaugeField &meta, QudaFieldLocation location)
 322  : arg(arg), meta(meta), location(location) {
 323  writeAuxString("prec=%lu,stride=%d", sizeof(Float), arg.inA.Stride());
 324  // this sets the communications pattern for the packing kernel
 // NOTE(review): 'comms' is not declared anywhere in the visible source — a
 // line (doxygen line 325) is missing from this extraction; it presumably
 // built an int array from commDimPartitioned().  Recover it before editing.
 326  setPackComms(comms);
 327  }
 328 
 329  virtual ~CloverForce() {}
 330 
 // Launch the kernel selected by arg.kernelType / arg.dir on 'stream'.
 331  void apply(const cudaStream_t &stream){
 332  if(location == QUDA_CUDA_FIELD_LOCATION){
 333  // Disable tuning for the time being
 334  TuneParam tp = tuneLaunch(*this,getTuning(),getVerbosity());
 335 
 336  if(arg.kernelType == OPROD_INTERIOR_KERNEL){
 337  interiorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
 338  } else if(arg.kernelType == OPROD_EXTERIOR_KERNEL) {
 // dir is a runtime value but the kernel template wants it at compile time
 339  if (arg.dir == 0) exteriorOprodKernel<0><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
 340  else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
 341  else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
 342  else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid,tp.block,tp.shared_bytes, stream>>>(arg);
 343  } else {
 344  errorQuda("Kernel type not supported\n");
 345  }
 346  }else{ // run the CPU code
 // NOTE(review): this message says "staggered" but this file computes the
 // clover force — looks copy-pasted from the staggered oprod; confirm and fix.
 347  errorQuda("No CPU support for staggered outer-product calculation\n");
 348  }
 349  } // apply
 350 
 // save/restore the force field around autotuning, since the kernels
 // accumulate into it (re-running a trial launch would double-count)
 351  void preTune(){
 352  this->arg.force.save();
 353  }
 354  void postTune(){
 355  this->arg.force.load();
 356  }
 357 
 358  long long flops() const {
 359  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
 360  return ((long long)arg.length)*4*(24 + 144 + 234); // spin project + spin trace + multiply-add
 361  } else {
 362  return ((long long)arg.length)*(144 + 234); // spin trace + multiply-add
 363  }
 364  }
 365  long long bytes() const {
 366  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
 367  return ((long long)arg.length)*(arg.inA.Bytes() + arg.inC.Bytes() + 4*(arg.inB_shift.Bytes() + arg.inD_shift.Bytes() + 2*arg.force.Bytes() + arg.gauge.Bytes()));
 368  } else {
 369  return ((long long)arg.length)*(arg.inA.Bytes() + arg.inB_shift.Bytes()/2 + arg.inC.Bytes() + arg.inD_shift.Bytes()/2 + 2*arg.force.Bytes() + arg.gauge.Bytes());
 370  }
 371  }
 372 
 373  TuneKey tuneKey() const {
 374  char new_aux[TuneKey::aux_n];
 375  strcpy(new_aux, aux);
 376  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
 377  strcat(new_aux, ",interior");
 378  } else {
 379  strcat(new_aux, ",exterior");
 380  if (arg.dir==0) strcat(new_aux, ",dir=0");
 381  else if (arg.dir==1) strcat(new_aux, ",dir=1");
 382  else if (arg.dir==2) strcat(new_aux, ",dir=2");
 383  else if (arg.dir==3) strcat(new_aux, ",dir=3");
 384  }
 385  return TuneKey(meta.VolString(), "CloverForce", new_aux);
 386  }
 387  }; // CloverForce
388 
 // Perform a full halo exchange for spinor 'a' on the given parity: pack the
 // faces, gather to host, start/finish MPI comms, then scatter the received
 // ghosts back to the device.  Forces temporal packing on (and restores the
 // previous setting) so the spin projector is applied consistently.
 // NOTE(review): several lines are missing from this extraction (doxygen
 // lines 395-400, 409, 424) — they presumably declared the 'location'
 // argument used by a.pack() and inserted synchronization between the
 // pack/gather/comms phases.  Recover them before modifying this function.
 389  void exchangeGhost(cudaColorSpinorField &a, int parity, int dag) {
 390  // need to enable packing in temporal direction to get spin-projector correct
 391  bool pack_old = getKernelPackT();
 392  setKernelPackT(true);
 393 
 394  // first transfer src1
 396 
 398  a.pack(1, 1-parity, dag, Nstream-1, location, Device);
 399 
 401 
 402  for(int i=3; i>=0; i--){
 403  if(commDimPartitioned(i)){
 404  // Initialize the host transfer from the source spinor
 405  a.gather(1, dag, 2*i);
 406  } // commDim(i)
 407  } // i=3,..,0
 408 
 410 
 411  for (int i=3; i>=0; i--) {
 412  if(commDimPartitioned(i)) {
 413  a.commsStart(1, 2*i, dag);
 414  }
 415  }
 416 
 417  for (int i=3; i>=0; i--) {
 418  if(commDimPartitioned(i)) {
 419  a.commsWait(1, 2*i, dag);
 420  a.scatter(1, dag, 2*i);
 421  }
 422  }
 423 
 425  setKernelPackT(pack_old); // restore packing state
 426 
 // flip the double buffer and keep all ranks in step
 427  a.bufferIndex = (1 - a.bufferIndex);
 428  comm_barrier();
 429  }
430 
 // Launch the interior kernel over the full checkerboard volume, then one
 // exterior kernel per partitioned dimension over that face's volume, reusing
 // a single CloverForceArg/CloverForce pair and mutating it between launches.
 // All launches go to the default stream (apply(0)).
 // NOTE(review): oprodStart is recorded here but never waited on in the
 // visible code — confirm whether a cudaStreamWaitEvent was lost in
 // extraction or the record is vestigial.
 431  template<typename Float, typename Output, typename Gauge, typename InputA, typename InputB, typename InputC, typename InputD>
 432  void computeCloverForceCuda(Output force, Gauge gauge, GaugeField& out,
 433  InputA& inA, InputB& inB, InputC &inC, InputD &inD,
 434  cudaColorSpinorField& src1, cudaColorSpinorField& src2,
 435  const unsigned int parity, const int faceVolumeCB[4], const double coeff)
 436  {
 437  qudaEventRecord(oprodStart, streams[Nstream-1]);
 438 
 // both sources must share a ghost layout since one arg struct serves both
 439  unsigned int ghostOffset[4] = {0,0,0,0};
 440  for (int dir=0; dir<4; ++dir) {
 441  if (src1.GhostOffset(dir) != src2.GhostOffset(dir))
 442  errorQuda("Mismatched ghost offset[%d] %d != %d", dir, src1.GhostOffset(dir), src2.GhostOffset(dir));
 443  ghostOffset[dir] = (src1.GhostOffset(dir,1))/src1.FieldOrder(); // offset we want is the forwards one
 444  }
 445 
 446  // Create the arguments for the interior kernel
 447  CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, coeff, inA, inB, inC, inD, gauge, force, out);
 448  CloverForce<Float,Output,Gauge,InputA,InputB,InputC,InputD> oprod(arg, out, QUDA_CUDA_FIELD_LOCATION);
 449 
 450  arg.kernelType = OPROD_INTERIOR_KERNEL;
 451  arg.length = src1.VolumeCB();
 452  oprod.apply(0);
 453 
 454  for (int i=3; i>=0; i--) {
 455  if (commDimPartitioned(i)) {
 456  // update parameters for this exterior kernel
 457  arg.kernelType = OPROD_EXTERIOR_KERNEL;
 458  arg.dir = i;
 459  arg.length = faceVolumeCB[i];
 460  arg.displacement = 1; // forwards displacement
 461  oprod.apply(0);
 462  }
 463  } // i=3,..,0
 464  } // computeCloverForceCuda
465 
 466 #endif // GPU_CLOVER_DIRAC
467 
 // Public entry point: accumulates the clover-force contribution from each
 // solution/force spinor pair (x[i], p[i]) into 'force', for both parities.
 // Ghost zones of inB/inD are exchanged before each kernel sweep.
 // NOTE(review): this definition is incomplete in the extraction — the first
 // line of the signature (doxygen line 468) and several interior lines
 // (500-512, 519) are missing; the latter presumably declared the Spinor
 // accessors spinorA..spinorD and the reconstruct-12 gauge accessor used
 // below.  Recover the original before editing this function.
 469  const GaugeField& U,
 470  std::vector<ColorSpinorField*> &x,
 471  std::vector<ColorSpinorField*> &p,
 472  std::vector<double> &coeff)
 473  {
 474 
 475 #ifdef GPU_CLOVER_DIRAC
 476  if(force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
 477  errorQuda("Unsupported output ordering: %d\n", force.Order());
 478 
 479  if(x[0]->Precision() != force.Precision())
 480  errorQuda("Mixed precision not supported: %d %d\n", x[0]->Precision(), force.Precision());
 481 
 482  createCloverForceEvents(); // FIXME not actually used
 483 
 484  int dag = 1;
 485 
 486  for (unsigned int i=0; i<x.size(); i++) {
 // make sure ghost buffers exist before the halo exchanges below
 487  static_cast<cudaColorSpinorField&>(x[i]->Even()).allocateGhostBuffer(1);
 488  static_cast<cudaColorSpinorField&>(x[i]->Odd()).allocateGhostBuffer(1);
 489  static_cast<cudaColorSpinorField&>(p[i]->Even()).allocateGhostBuffer(1);
 490  static_cast<cudaColorSpinorField&>(p[i]->Odd()).allocateGhostBuffer(1);
 491 
 492  for (int parity=0; parity<2; parity++) {
 493 
 // parity-dependent selection of the four kernel inputs
 494  ColorSpinorField& inA = (parity&1) ? p[i]->Odd() : p[i]->Even();
 495  ColorSpinorField& inB = (parity&1) ? x[i]->Even(): x[i]->Odd();
 496  ColorSpinorField& inC = (parity&1) ? x[i]->Odd() : x[i]->Even();
 497  ColorSpinorField& inD = (parity&1) ? p[i]->Even(): p[i]->Odd();
 498 
 499  if (x[0]->Precision() == QUDA_DOUBLE_PRECISION) {
 501 
 503  exchangeGhost(static_cast<cudaColorSpinorField&>(inB), parity, dag);
 504 
 506 
 508  exchangeGhost(static_cast<cudaColorSpinorField&>(inD), parity, 1-dag);
 509 
 510  if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
 511  computeCloverForceCuda<double>(gauge::FloatNOrder<double, 18, 2, 18>(force),
 513  force, spinorA, spinorB, spinorC, spinorD,
 514  static_cast<cudaColorSpinorField&>(inB),
 515  static_cast<cudaColorSpinorField&>(inD),
 516  parity, inB.GhostFace(), coeff[i]);
 517  } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
 518  computeCloverForceCuda<double>(gauge::FloatNOrder<double, 18, 2, 18>(force),
 520  force, spinorA, spinorB, spinorC, spinorD,
 521  static_cast<cudaColorSpinorField&>(inB),
 522  static_cast<cudaColorSpinorField&>(inD),
 523  parity, inB.GhostFace(), coeff[i]);
 524  } else {
 525  errorQuda("Unsupported recontruction type");
 526  }
 527 #if 0 // FIXME - Spinor class expect float4 pointer not Complex pointer
 528  } else if (x[0]->Precision() == QUDA_SINGLE_PRECISION) {
 529 #endif
 530  } else {
 531  errorQuda("Unsupported precision: %d\n", x[0]->Precision());
 532  }
 533  }
 534  }
 535 
 536  destroyCloverForceEvents(); // not actually used
 537 
 538 #else // GPU_CLOVER_DIRAC not defined
 539  errorQuda("Clover Dirac operator has not been built!");
 540 #endif
 541 
 542  checkCudaError();
 543  return;
 544  } // computeCloverForce
545 
546 
547 
548 } // namespace quda
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
cudaEvent_t event
dim3 dim3 blockDim
int commDimPartitioned(int dir)
bool getKernelPackT()
Definition: dslash_quda.cu:61
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
cudaEvent_t scatterEnd[Nstream]
Definition: dslash_quda.cu:73
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
#define errorQuda(...)
Definition: util_quda.h:90
void setPackComms(const int *commDim)
Definition: dslash_pack.cu:41
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * streams
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
const int Nstream
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
__device__ __host__ Matrix< complex< Float >, Nc > outerProdSpinTrace(const ColorSpinor< Float, Nc, Ns > &a, const ColorSpinor< Float, Nc, Ns > &b)
Definition: color_spinor.h:849
KernelType
IndexType
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __inline__ size_t p
const auto * Xh
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:71
static unsigned int unsigned int shift
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
static const int aux_n
Definition: tune_key.h:12
unsigned long long flops
Definition: blas_quda.cu:42
int full_idx
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
const int * GhostFace() const
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
const void * c
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
cudaEvent_t packEnd[2]
Definition: dslash_quda.cu:69
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
const void int size_t unsigned int flags
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
void comm_barrier(void)
Definition: comm_mpi.cpp:328