QUDA  0.9.0
dslash_clover.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 
24 #include <inline_ptx.h>
25 
26 namespace quda {
27 
28  namespace clover {
29 
30 #undef GPU_STAGGERED_DIRAC // do not delete - hack for Tesla architecture
31 
32 #ifndef GPU_DOMAIN_WALL_DIRAC
33 #define GPU_DOMAIN_WALL_DIRAC // do not delete - work around for CUDA 6.5 alignment bug
34 #endif
35 
36 #include <dslash_constants.h>
37 #include <dslash_textures.h>
38 #include <dslash_index.cuh>
39 
40  // Enable shared memory dslash for Fermi architecture
41  //#define SHARED_WILSON_DSLASH
42  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
43 
44 #ifdef GPU_CLOVER_DIRAC
45 #define DD_CLOVER 1
46 #include <wilson_dslash_def.h> // Wilson Dslash kernels (including clover)
47 #undef DD_CLOVER
48 #endif
49 
50 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
51 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
52 #endif
53 
54 #include <dslash_quda.cuh>
55 
56  } // end namespace clover
57 
58  // declare the dslash events
59 #include <dslash_events.cuh>
60 
61  using namespace clover;
62 
63 #ifdef GPU_CLOVER_DIRAC
64  template <typename sFloat, typename gFloat, typename cFloat>
65  class CloverDslashCuda : public SharedDslashCuda {
66 
67  protected:
68  const FullClover &cloverInv;
69 
70  unsigned int sharedBytesPerThread() const
71  {
72  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
73  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
74  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
75  } else {
76  return 0;
77  }
78  }
79  public:
80  CloverDslashCuda(cudaColorSpinorField *out, const GaugeField &gauge, const FullClover &cloverInv, const cudaColorSpinorField *in,
81  const cudaColorSpinorField *x, const double a, const int parity, const int dagger, const int *commOverride)
82  : SharedDslashCuda(out, in, x, gauge, parity, dagger, commOverride), cloverInv(cloverInv)
83  {
84  QudaPrecision clover_prec = bindCloverTex(cloverInv, parity, dslashParam);
85  if (in->Precision() != clover_prec) errorQuda("Mixing clover and spinor precision not supported");
86  dslashParam.a = a;
87  dslashParam.a_f = a;
88  dslashParam.cl_stride = cloverInv.stride;
89  }
90 
91  virtual ~CloverDslashCuda() {
92  unbindSpinorTex<sFloat>(in, out, x);
93  unbindCloverTex(cloverInv);
94  }
95 
96  void apply(const cudaStream_t &stream)
97  {
98 #ifdef SHARED_WILSON_DSLASH
99  if (dslashParam.kernel_type == EXTERIOR_KERNEL_X) errorQuda("Shared dslash does not yet support X-dimension partitioning");
100 #endif
101 #ifndef USE_TEXTURE_OBJECTS
102  if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
103 #endif // USE_TEXTURE_OBJECTS
104  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
105  setParam();
106  dslashParam.block[0] = tp.aux.x; dslashParam.block[1] = tp.aux.y; dslashParam.block[2] = tp.aux.z; dslashParam.block[3] = tp.aux.w;
107  for (int i=0; i<4; i++) dslashParam.grid[i] = ( (i==0 ? 2 : 1) * in->X(i)) / dslashParam.block[i];
108  DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
109  }
110 
111  long long flops() const {
112  int clover_flops = 504;
113  long long flops = DslashCuda::flops();
114  switch(dslashParam.kernel_type) {
115  case EXTERIOR_KERNEL_X:
116  case EXTERIOR_KERNEL_Y:
117  case EXTERIOR_KERNEL_Z:
118  case EXTERIOR_KERNEL_T:
119  flops += clover_flops * in->GhostFace()[dslashParam.kernel_type];
120  break;
121  case EXTERIOR_KERNEL_ALL:
122  flops += clover_flops * 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
123  break;
124  case INTERIOR_KERNEL:
125  case KERNEL_POLICY:
126  flops += clover_flops * in->VolumeCB();
127 
128  if (dslashParam.kernel_type == KERNEL_POLICY) break;
129  // now correct for flops done by exterior kernel
130  long long ghost_sites = 0;
131  for (int d=0; d<4; d++) if (dslashParam.commDim[d]) ghost_sites += 2 * in->GhostFace()[d];
132  flops -= clover_flops * ghost_sites;
133 
134  break;
135  }
136  return flops;
137  }
138 
139  long long bytes() const {
140  bool isHalf = in->Precision() == sizeof(short) ? true : false;
141  int clover_bytes = 72 * in->Precision() + (isHalf ? 2*sizeof(float) : 0);
142 
143  long long bytes = DslashCuda::bytes();
144  switch(dslashParam.kernel_type) {
145  case EXTERIOR_KERNEL_X:
146  case EXTERIOR_KERNEL_Y:
147  case EXTERIOR_KERNEL_Z:
148  case EXTERIOR_KERNEL_T:
149  bytes += clover_bytes * 2 * in->GhostFace()[dslashParam.kernel_type];
150  break;
151  case EXTERIOR_KERNEL_ALL:
152  bytes += clover_bytes * 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
153  break;
154  case INTERIOR_KERNEL:
155  case KERNEL_POLICY:
156  bytes += clover_bytes*in->VolumeCB();
157 
158  if (dslashParam.kernel_type == KERNEL_POLICY) break;
159  // now correct for bytes done by exterior kernel
160  long long ghost_sites = 0;
161  for (int d=0; d<4; d++) if (dslashParam.commDim[d]) ghost_sites += 2*in->GhostFace()[d];
162  bytes -= clover_bytes * ghost_sites;
163 
164  break;
165  }
166 
167  return bytes;
168  }
169 
170  };
171 #endif // GPU_CLOVER_DIRAC
172 
173 #include <dslash_policy.cuh>
174 
175  void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover &cloverInv,
176  const cudaColorSpinorField *in, const int parity, const int dagger,
177  const cudaColorSpinorField *x, const double &a, const int *commOverride,
178  TimeProfile &profile)
179  {
180 #ifdef GPU_CLOVER_DIRAC
181  const_cast<cudaColorSpinorField*>(in)->createComms(1);
182 
183  DslashCuda *dslash = nullptr;
184  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
185  dslash = new CloverDslashCuda<double2, double2, double2>(out, gauge, cloverInv, in, x, a, parity, dagger, commOverride);
186  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
187  dslash = new CloverDslashCuda<float4, float4, float4>(out, gauge, cloverInv, in, x, a, parity, dagger, commOverride);
188  } else if (in->Precision() == QUDA_HALF_PRECISION) {
189  dslash = new CloverDslashCuda<short4, short4, short4>(out, gauge, cloverInv, in, x, a, parity, dagger, commOverride);
190  }
191 
192  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume(), in->GhostFace(), profile);
193  dslash_policy.apply(0);
194 
195  delete dslash;
196 #else
197  errorQuda("Clover dslash has not been built");
198 #endif
199 
200  }
201 
202 }
virtual long long bytes() const
QudaPrecision bindCloverTex(const FullClover &clover, const int oddBit, T &dslashParam)
enum QudaPrecision_s QudaPrecision
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover &cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
#define DSLASH_SHARED_FLOATS_PER_THREAD
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void unbindCloverTex(const FullClover clover)
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
virtual long long flops() const
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43