QUDA  v0.7.0
A library for QCD on GPUs
covDev.cu
1 #ifdef GPU_CONTRACT
2 
3 #include <cstdlib>
4 #include <cstdio>
5 #include <string>
6 #include <iostream>
7 
8 #include <color_spinor_field.h>
9 #include <clover_field.h>
10 
11 // these control the Wilson-type actions
12 
13 #include <quda_internal.h>
14 #include <dslash_quda.h>
15 #include <sys/time.h>
16 #include <blas_quda.h>
17 #include <face_quda.h>
18 
19 #include <inline_ptx.h>
20 
21 
22 namespace quda
23 {
24  namespace covdev
25  {
26  #include <dslash_constants.h>
27  #include <dslash_textures.h>
28  #include <dslash_index.cuh>
29 
30  // Enable shared memory dslash for Fermi architecture
31  //#define SHARED_WILSON_DSLASH
32  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
33 
34  #include <dslash_quda.cuh>
35 
36  #include "covDev.h" //Covariant derivative definitions
37  #include <dslash_init.cuh>
 38  } // end namespace covdev
39 
40  // declare the dslash events
41  #include <dslash_events.cuh>
42 
43  using namespace covdev;
44 
49  #define MORE_GENERIC_COVDEV(FUNC, dir, DAG, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
50  if (reconstruct == QUDA_RECONSTRUCT_NO) { \
51  switch (dir) { \
52  case 0: \
53  FUNC ## 018 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
54  break; \
55  case 1: \
56  FUNC ## 118 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
57  break; \
58  case 2: \
59  FUNC ## 218 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
60  break; \
61  case 3: \
62  FUNC ## 318 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
63  break; \
64  } \
65  } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
66  switch (dir) { \
67  case 0: \
68  FUNC ## 012 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
69  break; \
70  case 1: \
71  FUNC ## 112 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
72  break; \
73  case 2: \
74  FUNC ## 212 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
75  break; \
76  case 3: \
77  FUNC ## 312 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
78  break; \
79  } \
80  } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
81  switch (dir) { \
82  case 0: \
83  FUNC ## 08 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
84  break; \
85  case 1: \
86  FUNC ## 18 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
87  break; \
88  case 2: \
89  FUNC ## 28 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
90  break; \
91  case 3: \
92  FUNC ## 38 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
93  break; \
94  } \
95  }
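 // Kernel-name convention used by the token pasting above: FUNC ## <dir><rec> ## DAG ## Kernel,
 // where <dir> is the lattice direction 0-3, <rec> is the gauge reconstruction
 // (18 = no reconstruction, 12 or 8 real numbers per link), and DAG is empty or "Dagger".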
96 
97  #define GENERIC_COVDEV(FUNC, dir, DAG, gridDim, blockDim, shared, stream, param, ...) \
98  switch(param.kernel_type) { \
99  case INTERIOR_KERNEL: \
100  MORE_GENERIC_COVDEV(FUNC, dir, DAG, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
101  break; \
102  case EXTERIOR_KERNEL_X: \
103  MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
104  break; \
105  case EXTERIOR_KERNEL_Y: \
106  MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
107  break; \
108  case EXTERIOR_KERNEL_Z: \
109  MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
110  break; \
111  case EXTERIOR_KERNEL_T: \
112  MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
113  break; \
114  }
115 
116  #define COVDEV(FUNC, mu, gridDim, blockDim, shared, stream, param, ...) \
117  if (mu < 4) { \
118  GENERIC_COVDEV(FUNC, mu, , gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
119  } else { \
120  int nMu = mu - 4; \
121  GENERIC_COVDEV(FUNC, nMu, Dagger, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
122  }
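 // Example expansion: COVDEV(covDevM, 5, grid, block, shared, stream, param, ...) has mu >= 4,
 // so the dagger variant is selected with nMu = 1; if param.kernel_type == INTERIOR_KERNEL and
 // reconstruct == QUDA_RECONSTRUCT_12, the launch resolves to
 //   covDevM112DaggerKernel<INTERIOR_KERNEL><<<grid, block, shared, stream>>>(..., param);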
123 
124  #define PROFILE(f, profile, idx) \
125  profile.Start(idx); \
126  f; \
127  profile.Stop(idx);
128 
133  void covDevCuda(DslashCuda &dslash, const size_t regSize, const int mu, TimeProfile &profile)
134  {
135  profile.Start(QUDA_PROFILE_TOTAL);
136 
137  const int dir = mu%4;
138 
139  dslashParam.kernel_type = INTERIOR_KERNEL;
140 
141  PROFILE(dslash.apply(streams[Nstream-1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
142 
143  checkCudaError();
144 
145  #ifdef MULTI_GPU
146  if(comm_dim(dir) > 1)
147  {
148  dslashParam.kernel_type = static_cast<KernelType>(dir);
149  dslashParam.ghostDim[dir] = commDimPartitioned(dir); // determines whether to use regular or ghost indexing at boundary
150  dslashParam.commDim[dir] = commDimPartitioned(dir); // switch off comms if override = 0
151 
152  PROFILE(dslash.apply(streams[Nstream-1]), profile, QUDA_PROFILE_DSLASH_KERNEL);
153 
154  checkCudaError();
155 
 156  dslashParam.ghostDim[dir] = 0; // not sure whether this is necessary
157  dslashParam.commDim[dir] = 0;
158  }
159  #endif // MULTI_GPU
160 
161  profile.Stop(QUDA_PROFILE_TOTAL);
162  }
163 
168  template <typename Float, typename Float2>
169  class CovDevCuda : public SharedDslashCuda
170  {
171  private:
172  const cudaGaugeField *gauge;
173  const int mu; // Direction to apply the covariant derivative. Anything beyond 3 is understood as dagger and moving backwards (from x to x-(mu%4))
174  const int dir; // This is the real direction xyzt, that is, mu%4
175  const int parity; // The current implementation works on parity spinors
176 
177  void *gauge0, *gauge1;
178 
179  bool binded;
180 
181  #ifdef MULTI_GPU
 182  Float *ghostBuffer; // For the ghosts, I did my own implementation, just because the number of functions to overload was absurd
183  int ghostVolume;
184  int ghostBytes;
185  int offset;
186  int Npad;
187  int Nvec;
188  int Nint;
189  #endif
190 
191  #ifdef USE_TEXTURE_OBJECTS
192  cudaTextureObject_t tex;
193  #endif
194 
195  protected:
196  unsigned int minThreads() const
197  {
198  #ifdef MULTI_GPU
199  if(dslashParam.kernel_type == INTERIOR_KERNEL) { return in->Volume(); } else { return ghostVolume; }
200  #else
201  return in->Volume();
202  #endif
203  }
204 
205  unsigned int sharedBytesPerThread() const { return 0; }
206 
 207  dim3 createGrid(const dim3 &block) const // Tuning is restricted to a one-dimensional grid, because other grid shapes gave wrong results
208  {
209  unsigned int gx = (dslashParam.threads + block.x - 1) / block.x;
210  unsigned int gy = 1;
211  unsigned int gz = 1;
212 
213  return dim3(gx, gy, gz);
214  }
215 
220  bool advanceBlockDim(TuneParam &param) const
221  {
222  //if(dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceBlockDim(param);
223  const unsigned int min_threads = 2;
224  const unsigned int max_threads = 512; // FIXME: use deviceProp.maxThreadsDim[0];
225 
226  param.block.x += 2;
227  param.block.y = 1;
228  param.block.z = 1;
229  param.grid = createGrid(param.block);
230 
231  if((param.block.x > min_threads) && (param.block.x < max_threads))
232  return true;
233  else
234  return false;
235  }
236 
237  #ifdef MULTI_GPU
238 
243  void allocateGhosts()
244  {
245  if(cudaMalloc(&ghostBuffer, ghostBytes) != cudaSuccess) {
246  printf("Error in rank %d: Unable to allocate %d bytes for GPU ghosts\n", comm_rank(), ghostBytes);
247  exit(-1);
248  }
249  }
250 
257  void exchangeGhosts()
258  {
259  const int rel = (mu < 4) ? 1 : -1;
260 
261  void *send = 0;
262  void *recv = 0;
263 
264  // Send buffers:
265  if(cudaHostAlloc(&send, ghostBytes, 0) != cudaSuccess) {
266  printf("Error in rank %d: Unable to allocate %d bytes for MPI requests (send)\n", comm_rank(), ghostBytes);
267  exit(-1);
268  }
269 
270  // Receive buffers:
271  if(cudaHostAlloc(&recv, ghostBytes, 0) != cudaSuccess) {
272  printf("Error in rank %d: Unable to allocate %d bytes for MPI requests (recv)\n", comm_rank(), ghostBytes);
273  exit(-1);
274  }
275 
276  switch(mu) {
277  default:
278  break;
279 
280  case 0: { // x from point p to p + x
281  void *sendFacePtr = (char*) in->V();
282  size_t len = Nvec*sizeof(Float);
283  size_t skip = len*in->X(0);
284  size_t dpitch = ghostVolume*Nvec*sizeof(Float);
285  size_t spitch = in->Stride()*Nvec*sizeof(Float);
286 
287  for(int t=0;t<ghostVolume;t++) { // I know this is a crime...
288  cudaMemcpy2DAsync((void*) (((char*)send)+len*t), dpitch, (void*) (((char*)sendFacePtr)+skip*t),
289  spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
290  cudaStreamSynchronize(streams[0]);
291  }
292  }
293  break;
294 
295  case 1: { // y from point p to p + y
296  void *sendFacePtr = (char*)in->V();
297  size_t len = in->X(0)*Nvec*sizeof(Float);
298  size_t skip = len*in->X(1);
299  size_t dpitch = ghostVolume*Nvec*sizeof(Float);
300  size_t spitch = in->Stride()*Nvec*sizeof(Float);
301 
 302  for(int tz=0;tz<(in->X(2)*in->X(3));tz++) { // Although it is terribly inefficient, it should work. The problem is that it doesn't
303  cudaMemcpy2DAsync((void*) (((char*)send)+len*tz), dpitch, (void*) (((char*)sendFacePtr)+skip*tz),
304  spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
305  cudaStreamSynchronize(streams[0]);
306  }
307  }
308  break;
309 
310  case 2: { // z from point p to p + z
311  void *sendFacePtr = (char*) in->V();
312  size_t len = ghostVolume*Nvec*sizeof(Float)/in->X(3);
313  size_t skip = len*in->X(2);
314  size_t dpitch = ghostVolume*Nvec*sizeof(Float);
315  size_t spitch = in->Stride()*Nvec*sizeof(Float);
316 
317  for(int t=0;t<in->X(3);t++) { // I'm sure this can be improved
318  cudaMemcpy2DAsync((void*) (((char*)send)+len*t), dpitch, (void*) (((char*)sendFacePtr)+skip*t),
319  spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
320  cudaStreamSynchronize(streams[0]);
321  }
322  }
323  break;
324 
325  case 3: { // t from point p to p + t
326  void *sendFacePtr = (char*)in->V();
327  size_t len = ghostVolume*Nvec*sizeof(Float);
328  size_t spitch = in->Stride()*Nvec*sizeof(Float);
329  cudaMemcpy2DAsync(send, len, sendFacePtr, spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
330  cudaStreamSynchronize(streams[0]);
331  }
332  break;
333 
334  // Dagger versions (mu >= 4) follow
335 
336  case 4: { // x dagger from point p to p - x
337  void *sendFacePtr = (char*) in->V() + offset*Nvec*sizeof(Float);
338  size_t len = Nvec*sizeof(Float);
339  size_t skip = len*in->X(0);
340  size_t dpitch = ghostVolume*Nvec*sizeof(Float);
341  size_t spitch = in->Stride()*Nvec*sizeof(Float);
342 
343  for(int t=0;t<ghostVolume;t++) {
344  cudaMemcpy2DAsync((void*) (((char*)send)+len*t), dpitch, (void*) (((char*)sendFacePtr)+skip*t),
345  spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
346  cudaStreamSynchronize(streams[0]);
347  }
348  }
349  break;
350 
351  case 5: { // y dagger from point p to p - y
352  void *sendFacePtr = ((char*) in->V()) + offset*Nvec*sizeof(Float);
353  size_t len = in->X(0)*Nvec*sizeof(Float);
354  size_t skip = len*in->X(1);
355  size_t dpitch = ghostVolume*Nvec*sizeof(Float);
356  size_t spitch = in->Stride()*Nvec*sizeof(Float);
357 
358  for(int tz=0;tz<(in->X(2)*in->X(3));tz++) {
359  cudaMemcpy2DAsync((void*) (((char*)send)+len*tz), dpitch, (void*) (((char*)sendFacePtr)+skip*tz),
360  spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
361  cudaStreamSynchronize(streams[0]);
362  }
363  }
364  break;
365 
366  case 6: { // z dagger from point p to p - z
367  void *sendFacePtr = (((char*)in->V()) + offset*Nvec*sizeof(Float));
368  size_t len = ghostVolume*Nvec*sizeof(Float)/in->X(3);
369  size_t skip = len*in->X(2);
370  size_t dpitch = ghostVolume*Nvec*sizeof(Float);
371  size_t spitch = in->Stride()*Nvec*sizeof(Float);
372 
373  for(int t=0;t<in->X(3);t++) {
374  cudaMemcpy2DAsync((void*) (((char*)send)+len*t), dpitch, (void*) (((char*)sendFacePtr)+skip*t),
375  spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
376  cudaStreamSynchronize(streams[0]);
377  }
378  }
379  break;
380 
381  case 7: { // t dagger from point p to p - t
382  void *sendFacePtr = (char*)in->V() + offset*Nvec*sizeof(Float);
383  size_t len = ghostVolume*Nvec*sizeof(Float);
384  size_t spitch = in->Stride()*Nvec*sizeof(Float);
385 
386  cudaMemcpy2DAsync(send, len, sendFacePtr, spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
387  cudaStreamSynchronize(streams[0]);
388  }
389  break;
390  }
391 
392  // Send buffers to neighbors:
393 
394  MsgHandle *mh_send;
395  MsgHandle *mh_from;
396 
397  mh_send = comm_declare_send_relative (send, dir, rel, ghostBytes);
398  mh_from = comm_declare_receive_relative (recv, dir, rel*(-1), ghostBytes);
399  comm_start (mh_send);
400  comm_start (mh_from);
401  comm_wait (mh_send);
402  comm_wait (mh_from);
403  comm_free (mh_send);
404  comm_free (mh_from);
405 
406  // Send buffers to GPU:
407  cudaMemcpy(ghostBuffer, recv, ghostBytes, cudaMemcpyHostToDevice);
408  cudaDeviceSynchronize();
409 
410  cudaFreeHost(send);
411  cudaFreeHost(recv);
412  }
413 
414 
415  void freeGhosts() { cudaFree(ghostBuffer); }
416 
417  void bindGhosts()
418  {
419  if(binded == false) { // bind only once
420  #ifdef USE_TEXTURE_OBJECTS
421  cudaChannelFormatDesc desc;
422  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
423  if (in->Precision() == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
424  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
425 
426  // staggered fields in half and single are always two component
427  if (in->Nspin() == 1 && (in->Precision() == QUDA_SINGLE_PRECISION)) {
428  desc.x = 8*in->Precision();
429  desc.y = 8*in->Precision();
430  desc.z = 0;
431  desc.w = 0;
432  } else { // all others are four component
433  desc.x = (in->Precision() == QUDA_DOUBLE_PRECISION) ? 32 : 8*in->Precision();
434  desc.y = (in->Precision() == QUDA_DOUBLE_PRECISION) ? 32 : 8*in->Precision();
435  desc.z = (in->Precision() == QUDA_DOUBLE_PRECISION) ? 32 : 8*in->Precision();
436  desc.w = (in->Precision() == QUDA_DOUBLE_PRECISION) ? 32 : 8*in->Precision();
437  }
438 
439  cudaResourceDesc resDesc;
440  memset(&resDesc, 0, sizeof(resDesc));
441  resDesc.resType = cudaResourceTypeLinear;
442  resDesc.res.linear.devPtr = ghostBuffer;
443  resDesc.res.linear.desc = desc;
444  resDesc.res.linear.sizeInBytes = Nint * ghostVolume * sizeof(Float);
445 
446  cudaTextureDesc texDesc;
447  memset(&texDesc, 0, sizeof(texDesc));
448  texDesc.readMode = cudaReadModeElementType;
449 
450  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
451 
452  dslashParam.inTex = tex;
453  #else
454  if(in->Precision() == QUDA_DOUBLE_PRECISION)
455  cudaBindTexture(0, spinorTexDouble, (Float2*)ghostBuffer, ghostBytes);
456  else if(in->Precision() == QUDA_SINGLE_PRECISION)
457  cudaBindTexture(0, spinorTexSingle, (Float2*)ghostBuffer, ghostBytes);
458  else
459  errorQuda("Half precision for covariant derivative not supported.");
460  #endif
461  checkCudaError();
462  binded = true;
463  }
464  }
465 
466  void unbindGhosts() {
467  if(binded == true) {
468  #ifdef USE_TEXTURE_OBJECTS
469  cudaDestroyTextureObject(tex);
470  #else
471  if(in->Precision() == QUDA_DOUBLE_PRECISION)
472  cudaUnbindTexture(spinorTexDouble);
473  else
474  cudaUnbindTexture(spinorTexSingle);
475  #endif
476  checkCudaError();
477  binded = false;
478  }
479  }
480  #endif /* MULTI_GPU */
481 
 482  void unbindGauge() {
 483  unbindGaugeTex(*gauge);
 484  checkCudaError();
 485  }
486 
487 
488  public:
489  CovDevCuda(cudaColorSpinorField *out, const cudaGaugeField *gauge, const cudaColorSpinorField *in, const int parity, const int mu)
490  : SharedDslashCuda(out, in, 0, gauge->Reconstruct(), mu<4 ? 0 : 1), gauge(gauge), parity(parity), mu(mu), dir(mu%4), binded(false)
491  {
492  bindSpinorTex<Float2>(in, out);
493  bindGaugeTex(*gauge, parity, &gauge0, &gauge1);
494 
495  #ifdef MULTI_GPU
496  if(comm_dim(dir) > 1) {
497  Nvec = sizeof(Float2)/sizeof(Float);
498  Nint = in->Ncolor()*in->Nspin()*Nvec;
499  Npad = Nint/Nvec;
500 
501  switch(dir) {
502  case 0:
503  ghostVolume = in->X(1)*in->X(2)*in->X(3)/2;
504  offset = in->X(0) - 1;
505  break;
506 
507  case 1:
508  ghostVolume = in->X(0)*in->X(2)*in->X(3);
509  offset = in->X(0)*(in->X(1) - 1);
510  break;
511 
512  case 2:
513  ghostVolume = in->X(0)*in->X(1)*in->X(3);
514  offset = in->X(0)*in->X(1)*(in->X(2) - 1);
515  break;
516 
517  case 3:
518  ghostVolume = in->X(0)*in->X(1)*in->X(2);
519  offset = in->Volume() - ghostVolume;
520  break;
521  }
522 
523  ghostBytes = ghostVolume*Nint*sizeof(Float);
524  allocateGhosts();
525  }
526  #endif
527  }
528 
529  virtual ~CovDevCuda() {
530  #ifdef MULTI_GPU
531  if(comm_dim(dir) > 1) {
532  unbindGhosts();
533  freeGhosts();
534  }
535  #endif
536  unbindGauge();
537  }
538 
539  void apply(const cudaStream_t &stream) {
540  #ifdef SHARED_WILSON_DSLASH
541  if(dslashParam.kernel_type == EXTERIOR_KERNEL_X)
542  errorQuda("Shared dslash (covariant derivative) does not yet support X-dimension partitioning");
543  #endif
544  if((dslashParam.kernel_type == EXTERIOR_KERNEL_X) || (dslashParam.kernel_type == EXTERIOR_KERNEL_Y))
545  errorQuda("Covariant derivative does not yet support X or Y-dimension partitioning");
546 
547  dslashParam.parity = parity;
548 
549  for(int i=0; i<4; i++) {
550  dslashParam.ghostDim[i] = 0;
551  dslashParam.ghostOffset[i] = 0;
552  dslashParam.ghostNormOffset[i] = 0;
553  dslashParam.commDim[i] = 0;
554  }
555 
556  if(dslashParam.kernel_type != INTERIOR_KERNEL) {
557  #ifdef MULTI_GPU
558  dslashParam.threads = ghostVolume;
 559  exchangeGhosts(); // We must exchange the ghosts here because covDevCuda only receives a DslashCuda object,
 560  bindGhosts(); // and that class does not expose these ghost-exchange routines
561 
562  // Maybe I should rebind the spinors for the INTERIOR kernels after this???
563 
564  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
565  COVDEV(covDevM, mu, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam, (Float2*)out->V(), (Float2*)gauge0, (Float2*)gauge1, (Float2*)in->V());
566  #endif
567  } else {
568  dslashParam.threads = in->Volume();
569  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
570  COVDEV(covDevM, mu, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam, (Float2*)out->V(), (Float2*)gauge0, (Float2*)gauge1, (Float2*)in->V());
571  }
572  }
573 
574  long long flops() const { return 144 * in->Volume(); } // Correct me if I'm wrong
575  };
576 
577 
587  void covDev(cudaColorSpinorField *out, cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int mu, TimeProfile &profile) {
588  DslashCuda *covdev = 0;
589  size_t regSize = sizeof(float);
590 
591  #ifdef GPU_CONTRACT
592  if(in->Precision() == QUDA_HALF_PRECISION)
593  errorQuda("Error: Half precision not supported");
594 
595  if(in->Precision() != gauge.Precision())
596  errorQuda("Mixing gauge %d and spinor %d precision not supported", gauge.Precision(), in->Precision());
597 
 598  initConstants(gauge, profile); // The covariant derivative has no associated Dirac operator, so the dslash constants must be initialized here instead
599 
600  if(in->Precision() == QUDA_SINGLE_PRECISION)
601  covdev = new CovDevCuda<float, float4>(out, &gauge, in, parity, mu);
 602  else if(in->Precision() == QUDA_DOUBLE_PRECISION) {
603  #if (__COMPUTE_CAPABILITY__ >= 130)
604  covdev = new CovDevCuda<double, double2>(out, &gauge, in, parity, mu);
605  regSize = sizeof(double);
606  #else
607  errorQuda("Error: Double precision not supported by hardware");
608  #endif
609  }
610 
611  covDevCuda(*covdev, regSize, mu, profile);
612 
613  delete covdev;
614  checkCudaError();
615  #else
616  errorQuda("Contraction kernels have not been built");
617  #endif
618  }
619  }
620 #endif
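A minimal usage sketch (not part of covDev.cu) showing how the public covDev entry point above might be driven. The names applyCovDev, outSpinor, inSpinor and gaugeField are hypothetical; the header locations for TimeProfile and the covDev declaration, as well as the parity-labelling convention, are assumptions. The fields are assumed to be already-constructed single-parity device fields of matching (single or double) precision, with QUDA built with GPU_CONTRACT.

 #include <quda_internal.h>       // TimeProfile (assumed header)
 #include <color_spinor_field.h>  // cudaColorSpinorField
 #include <gauge_field.h>         // cudaGaugeField
 #include <dslash_quda.h>         // covDev declaration (assumed header)

 using namespace quda;

 void applyCovDev(cudaColorSpinorField &outSpinor,
                  cudaGaugeField &gaugeField,
                  const cudaColorSpinorField &inSpinor)
 {
   TimeProfile profile("covDevExample");

   // Forward covariant derivative along x (mu = 0) on one parity of the lattice;
   // passing mu + 4 would apply the backward (dagger) variant along the same axis.
   const int parity = 0; // assumed to label the even sublattice
   const int mu     = 0;

   covDev(&outSpinor, gaugeField, &inSpinor, parity, mu, profile);
 }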