QUDA  0.9.0
dslash_twisted_clover.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 
24 #include <inline_ptx.h>
25 
26 namespace quda {
27 
28  namespace twistedclover {
29 
30 #include <dslash_constants.h>
31 #include <dslash_textures.h>
32 #include <dslash_index.cuh>
33 
34  // Enable shared memory dslash for Fermi architecture
35  //#define SHARED_WILSON_DSLASH
36  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
37 
38 #ifdef GPU_TWISTED_CLOVER_DIRAC
39 #include <tmc_dslash_def.h> // Twisted Clover kernels
40 #endif
41 
42 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
43 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
44 #endif
45 
46 #include <dslash_quda.cuh>
47 
48  } // end namespace twistedclover
49 
50  // declare the dslash events
51 #include <dslash_events.cuh>
52 
53  using namespace twistedclover;
54 
55 #ifdef GPU_TWISTED_CLOVER_DIRAC
// Host-side launcher for the twisted-clover dslash kernels (twistedCloverInvDslash,
// twistedCloverDslash and twistedCloverDslashTwist). It derives from SharedDslashCuda,
// fills in dslashParam, and dispatches one of the kernels through the DSLASH macro.
//
// Template parameters: sFloat is the spinor register type (used for the spinor texture
// bind/unbind calls and for sizing shared memory); gFloat and cFloat are not referenced
// by name in the code visible in this class.
//
// NOTE(review): this listing carries embedded source line numbers, and the numbers
// 109, 126, 129, 132, 144, 151 and 158 are skipped inside this class. Those source
// lines appear to have been lost when the listing was extracted -- restore them from
// the upstream file before building.
56  template <typename sFloat, typename gFloat, typename cFloat>
57  class TwistedCloverDslashCuda : public SharedDslashCuda {
58 
59  private:
// Selects which of the three kernels apply() launches and which suffix tuneKey() appends.
60  const QudaTwistCloverDslashType dslashType;
// Copies of kappa, mu, epsilon and k (in that order); assigned in the constructor and
// not read anywhere else in the code visible in this class.
61  double a, b, c, d;
62  const FullClover &clover;
63  const FullClover &cloverInv;
64 
65  protected:
// Shared memory requested per thread, in bytes: DSLASH_SHARED_FLOATS_PER_THREAD words of
// sizeof(double) when sFloat is double2 and sizeof(float) otherwise. Only the interior
// kernel requests shared memory; every other kernel type gets 0.
66  unsigned int sharedBytesPerThread() const
67  {
68  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
69  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
70  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
71  } else {
72  return 0;
73  }
74  }
75 
76  public:
// Binds the clover / inverse-clover fields via bindTwistedCloverTex and records the
// twisted-mass parameters in dslashParam.
//
// Raises errorQuda when the precision returned by bindTwistedCloverTex differs from
// in->Precision(), and (unless DYNAMIC_CLOVER is defined) when clover.stride and
// cloverInv.stride differ.
//
// Only kappa and mu are forwarded to dslashParam here (fields a/a_f and b/b_f);
// epsilon and k are only stored in the members c and d.
//
// NOTE(review): the initializer list names dslashType last although it is declared
// first; C++ initializes members in declaration order, so this only matters for
// -Wreorder diagnostics since the initializers are independent of each other.
77  TwistedCloverDslashCuda(cudaColorSpinorField *out, const GaugeField &gauge, const FullClover &clover, const FullClover &cloverInv,
78  //const cFloat *clover, const float *cNorm, const cFloat *cloverInv, const float *cNrm2, int cl_stride,
79  const cudaColorSpinorField *in, const cudaColorSpinorField *x, const QudaTwistCloverDslashType dslashType,
80  const double kappa, const double mu, const double epsilon, const double k,
81  const int parity, const int dagger, const int *commOverride)
82  : SharedDslashCuda(out, in, x, gauge, parity, dagger, commOverride), clover(clover), cloverInv(cloverInv), dslashType(dslashType)
83  {
84  QudaPrecision clover_prec = bindTwistedCloverTex(clover, cloverInv, parity, dslashParam);
85  if (in->Precision() != clover_prec) errorQuda("Mixing clover and spinor precision not supported");
86 
87 #ifndef DYNAMIC_CLOVER
88  if (clover.stride != cloverInv.stride)
89  errorQuda("clover and cloverInv must have matching strides (%d != %d)", clover.stride, cloverInv.stride);
90 #endif
91 
92  a = kappa;
93  b = mu;
94  c = epsilon;
95  d = k;
96 
97  dslashParam.twist_a = 0.0;
98  dslashParam.twist_b = 0.0;
99  dslashParam.a = kappa;
100  dslashParam.a_f = kappa;
101  dslashParam.b = mu;
102  dslashParam.b_f = mu;
103  dslashParam.cl_stride = clover.stride;
104  dslashParam.fl_stride = in->VolumeCB();
105  }
106 
// Releases the spinor texture bindings.
// NOTE(review): listing line 109 is missing between the two lines of this body;
// presumably it released the clover textures bound in the constructor -- confirm
// against the upstream source.
107  virtual ~TwistedCloverDslashCuda() {
108  unbindSpinorTex<sFloat>(in, out, x);
110  }
111 
// Launches the kernel selected by dslashType on the given stream.
// Steps visible here: (optionally) bind spinor textures for the interior kernel when
// texture objects are not in use, fetch the tuned launch parameters, call setParam(),
// copy tp.aux into dslashParam.block, derive dslashParam.grid from the lattice extents,
// then dispatch through the DSLASH macro.
112  void apply(const cudaStream_t &stream)
113  {
114 #ifdef SHARED_WILSON_DSLASH
115  if (dslashParam.kernel_type == EXTERIOR_KERNEL_X) errorQuda("Shared dslash does not yet support X-dimension partitioning");
116 #endif
117 #ifndef USE_TEXTURE_OBJECTS
118  if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
119 #endif // USE_TEXTURE_OBJECTS
120  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
121  setParam();
122  dslashParam.block[0] = tp.aux.x; dslashParam.block[1] = tp.aux.y; dslashParam.block[2] = tp.aux.z; dslashParam.block[3] = tp.aux.w;
// Extent in dimension 0 is doubled before dividing by the block size; integer division
// truncates, so this assumes the block sizes divide the extents -- TODO confirm.
123  for (int i=0; i<4; i++) dslashParam.grid[i] = ( (i==0 ? 2 : 1) * in->X(i)) / dslashParam.block[i];
124 
// NOTE(review): listing lines 126, 129 and 132 (one ahead of each DSLASH call) are
// missing; they look like the case labels. As shown, only the default branch has a label.
125  switch(dslashType){
127  DSLASH(twistedCloverInvDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
128  break;
130  DSLASH(twistedCloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
131  break;
133  DSLASH(twistedCloverDslashTwist, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
134  break;
135  default:
136  errorQuda("Invalid twisted clover dslash type");
137  }
138  }
139 
// Builds the autotuning key: starts from DslashCuda::tuneKey() and appends a suffix to
// key.aux that depends on dslashType and on whether DYNAMIC_CLOVER is defined.
// Raises errorQuda for a dslashType without a matching branch.
140  TuneKey tuneKey() const
141  {
142  TuneKey key = DslashCuda::tuneKey();
// NOTE(review): listing lines 144, 151 and 158 (one ahead of each #ifndef group) are
// missing; they look like the case labels matching the three kernels in apply().
143  switch (dslashType) {
145 #ifndef DYNAMIC_CLOVER
146  strcat(key.aux,",CloverTwistInvDslash");
147 #else
148  strcat(key.aux,",CloverTwistInvDynDslash");
149 #endif
150  break;
152 #ifndef DYNAMIC_CLOVER
153  strcat(key.aux,",Dslash");
154 #else
155  strcat(key.aux,",DynDslash");
156 #endif
157  break;
159 #ifndef DYNAMIC_CLOVER
160  strcat(key.aux,",DslashCloverTwist");
161 #else
162  strcat(key.aux,",DynDslashCloverTwist");
163 #endif
164  break;
165  default:
166  errorQuda("Unsupported twisted-dslash type %d", dslashType);
167  }
168  return key;
169  }
170 
// Flop count for the current kernel type: DslashCuda::flops() plus, for the interior and
// policy kernel types only, 504 + 48 flops per checkerboard site (in->VolumeCB()) for the
// clover term. Exterior kernels add nothing.
171  long long flops() const {
172  int clover_flops = 504 + 48;
173  long long flops = DslashCuda::flops();
174  switch(dslashParam.kernel_type) {
175  case EXTERIOR_KERNEL_X:
176  case EXTERIOR_KERNEL_Y:
177  case EXTERIOR_KERNEL_Z:
178  case EXTERIOR_KERNEL_T:
179  case EXTERIOR_KERNEL_ALL:
180  break;
181  case INTERIOR_KERNEL:
182  case KERNEL_POLICY:
183  // clover flops are done in the interior kernel
184  flops += clover_flops * in->VolumeCB();
185  break;
186  }
187  return flops;
188  }
189 
// Memory traffic for the current kernel type: DslashCuda::bytes() plus, for the interior
// and policy kernel types only, the per-site clover traffic times in->VolumeCB().
// Per-site clover traffic is 72 * in->Precision() bytes, with an extra 2*sizeof(float)
// when the precision equals sizeof(short) (presumably the half-precision norm values --
// TODO confirm).
190  long long bytes() const {
191  bool isHalf = in->Precision() == sizeof(short) ? true : false;
192  int clover_bytes = 72 * in->Precision() + (isHalf ? 2*sizeof(float) : 0);
193  long long bytes = DslashCuda::bytes();
194  switch(dslashParam.kernel_type) {
195  case EXTERIOR_KERNEL_X:
196  case EXTERIOR_KERNEL_Y:
197  case EXTERIOR_KERNEL_Z:
198  case EXTERIOR_KERNEL_T:
199  case EXTERIOR_KERNEL_ALL:
200  break;
201  case INTERIOR_KERNEL:
202  case KERNEL_POLICY:
203  bytes += clover_bytes*in->VolumeCB();
204  break;
205  }
206 
207  return bytes;
208  }
209 
210  };
211 #endif // GPU_TWISTED_CLOVER_DIRAC
212 
213 #include <dslash_policy.cuh>
214 
216  const cudaColorSpinorField *in, const int parity, const int dagger,
217  const cudaColorSpinorField *x, const QudaTwistCloverDslashType type, const double &kappa, const double &mu,
218  const double &epsilon, const double &k, const int *commOverride, TimeProfile &profile)
219  {
220 #ifdef GPU_TWISTED_CLOVER_DIRAC
221  const_cast<cudaColorSpinorField*>(in)->createComms(1);
222 
223  DslashCuda *dslash = nullptr;
224  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
225  dslash = new TwistedCloverDslashCuda<double2,double2,double2>
226  (out, gauge, *clover, *cloverInv, in, x, type, kappa, mu, epsilon, k, parity, dagger, commOverride);
227  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
228  dslash = new TwistedCloverDslashCuda<float4,float4,float4>
229  (out, gauge, *clover, *cloverInv, in, x, type, kappa, mu, epsilon, k, parity, dagger, commOverride);
230  } else if (in->Precision() == QUDA_HALF_PRECISION) {
231  dslash = new TwistedCloverDslashCuda<short4,short4,short4>
232  (out, gauge, *clover, *cloverInv, in, x, type, kappa, mu, epsilon, k, parity, dagger, commOverride);
233  }
234 
235  int ghost_threads[4] = {0};
236  int bulk_threads = (in->TwistFlavor() == QUDA_TWIST_SINGLET) ? in->Volume() : in->Volume() / 2;
237  for (int i=0;i<4;i++) ghost_threads[i] = (in->TwistFlavor() == QUDA_TWIST_SINGLET) ? in->GhostFace()[i] : in->GhostFace()[i] / 2;
238 
239  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), bulk_threads, ghost_threads, profile);
240  dslash_policy.apply(0);
241 
242  delete dslash;
243 #else
244  errorQuda("Twisted clover dslash has not been built");
245 #endif
246  }
247 
248 }
virtual long long bytes() const
double mu
Definition: test_util.cpp:1643
enum QudaPrecision_s QudaPrecision
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
QudaPrecision bindTwistedCloverTex(const FullClover clover, const FullClover cloverInv, const int oddBit, T &dslashParam)
enum QudaTwistCloverDslashType_s QudaTwistCloverDslashType
cudaStream_t * stream
void twistedCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover *clover, const FullClover *cloverInv, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const QudaTwistCloverDslashType type, const double &kappa, const double &mu, const double &epsilon, const double &k, const int *commDim, TimeProfile &profile)
char * strcat(char *__s1, const char *__s2)
#define b
VOLATILE spinorFloat kappa
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
virtual TuneKey tuneKey() const
const void * c
#define DSLASH_SHARED_FLOATS_PER_THREAD
void unbindTwistedCloverTex(const FullClover clover)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
virtual long long flops() const
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43