QUDA  0.9.0
dslash_clover_asym.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 
24 #include <inline_ptx.h>
25 
26 namespace quda {
27 
28  namespace asym_clover {
29 
30 #undef GPU_STAGGERED_DIRAC
31 #include <dslash_constants.h>
32 #include <dslash_textures.h>
33 #include <dslash_index.cuh>
34 
35  // Enable shared memory dslash for Fermi architecture
36  //#define SHARED_WILSON_DSLASH
37  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
38 
39 #ifdef GPU_CLOVER_DIRAC
40 #define DD_CLOVER 2
41 #include <wilson_dslash_def.h> // Wilson Dslash kernels (including clover)
42 #undef DD_CLOVER
43 #endif
44 
45 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
46 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
47 #endif
48 
49 #include <dslash_quda.cuh>
50 
51  } // end namespace asym_clover
52 
53  // declare the dslash events
54 #include <dslash_events.cuh>
55 
56  using namespace asym_clover;
57 
58 #ifdef GPU_CLOVER_DIRAC
59  template <typename sFloat, typename gFloat, typename cFloat>
60  class AsymCloverDslashCuda : public SharedDslashCuda {
61 
62  protected:
63  const FullClover &clover;
64 
65  unsigned int sharedBytesPerThread() const
66  {
67  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
68  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
69  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
70  } else {
71  return 0;
72  }
73  }
74 
75  public:
76  AsymCloverDslashCuda(cudaColorSpinorField *out, const GaugeField &gauge, const FullClover &clover,
77  const cudaColorSpinorField *in, const cudaColorSpinorField *x, const double a,
78  const int parity, const int dagger, const int *commOverride)
79  : SharedDslashCuda(out, in, x, gauge, parity, dagger, commOverride), clover(clover)
80  {
81  QudaPrecision clover_prec = bindCloverTex(clover, parity, dslashParam);
82  if (in->Precision() != clover_prec) errorQuda("Mixing clover and spinor precision not supported");
83  dslashParam.a = a;
84  dslashParam.a_f = a;
85  dslashParam.cl_stride = clover.stride;
86  dslashParam.rho = clover.rho;
87  dslashParam.rho_f = clover.rho;
88 
89  if (!x) errorQuda("Asymmetric clover dslash only defined for Xpay");
90  }
91 
92  virtual ~AsymCloverDslashCuda() {
93  unbindSpinorTex<sFloat>(in, out, x);
95  }
96 
97  void apply(const cudaStream_t &stream)
98  {
99 #ifdef SHARED_WILSON_DSLASH
100  if (dslashParam.kernel_type == EXTERIOR_KERNEL_X) errorQuda("Shared dslash does not yet support X-dimension partitioning");
101 #endif
102 #ifndef USE_TEXTURE_OBJECTS
103  if (dslashParam.kernel_type == INTERIOR_KERNEL) bindSpinorTex<sFloat>(in, out, x);
104 #endif // USE_TEXTURE_OBJECTS
105  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
106  setParam();
107  dslashParam.block[0] = tp.aux.x; dslashParam.block[1] = tp.aux.y; dslashParam.block[2] = tp.aux.z; dslashParam.block[3] = tp.aux.w;
108  for (int i=0; i<4; i++) dslashParam.grid[i] = ( (i==0 ? 2 : 1) * in->X(i)) / dslashParam.block[i];
109  ASYM_DSLASH(asymCloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
110  }
111 
112  long long flops() const {
113  int clover_flops = 504;
114  long long flops = DslashCuda::flops();
115  switch(dslashParam.kernel_type) {
116  case EXTERIOR_KERNEL_X:
117  case EXTERIOR_KERNEL_Y:
118  case EXTERIOR_KERNEL_Z:
119  case EXTERIOR_KERNEL_T:
120  case EXTERIOR_KERNEL_ALL:
121  break;
122  case INTERIOR_KERNEL:
123  case KERNEL_POLICY:
124  // clover flops are done in the interior kernel
125  flops += clover_flops * in->VolumeCB();
126  break;
127  }
128  return flops;
129  }
130 
131  long long bytes() const {
132  bool isHalf = in->Precision() == sizeof(short) ? true : false;
133  int clover_bytes = 72 * in->Precision() + (isHalf ? 2*sizeof(float) : 0);
134  long long bytes = DslashCuda::bytes();
135  switch(dslashParam.kernel_type) {
136  case EXTERIOR_KERNEL_X:
137  case EXTERIOR_KERNEL_Y:
138  case EXTERIOR_KERNEL_Z:
139  case EXTERIOR_KERNEL_T:
140  case EXTERIOR_KERNEL_ALL:
141  break;
142  case INTERIOR_KERNEL:
143  case KERNEL_POLICY:
144  bytes += clover_bytes*in->VolumeCB();
145  break;
146  }
147 
148  return bytes;
149  }
150 
151  };
152 #endif // GPU_CLOVER_DIRAC
153 
154 #include <dslash_policy.cuh>
155 
157  const cudaColorSpinorField *in, const int parity, const int dagger,
158  const cudaColorSpinorField *x, const double &a, const int *commOverride,
159  TimeProfile &profile)
160  {
161 #ifdef GPU_CLOVER_DIRAC
162  const_cast<cudaColorSpinorField*>(in)->createComms(1);
163 
164  DslashCuda *dslash = nullptr;
165  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
166  dslash = new AsymCloverDslashCuda<double2, double2, double2>(out, gauge, clover, in, x, a, parity, dagger, commOverride);
167  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
168  dslash = new AsymCloverDslashCuda<float4, float4, float4>(out, gauge, clover, in, x, a, parity, dagger, commOverride);
169  } else if (in->Precision() == QUDA_HALF_PRECISION) {
170  dslash = new AsymCloverDslashCuda<short4, short4, short4>(out, gauge, clover, in, x, a, parity, dagger, commOverride);
171  }
172 
173  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume(), in->GhostFace(), profile);
174  dslash_policy.apply(0);
175 
176  delete dslash;
177 #else
178  errorQuda("Clover dslash has not been built");
179 #endif
180 
181  }
182 
183 }
virtual long long bytes() const
QudaPrecision bindCloverTex(const FullClover &clover, const int oddBit, T &dslashParam)
enum QudaPrecision_s QudaPrecision
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
#define DSLASH_SHARED_FLOATS_PER_THREAD
cudaStream_t * stream
#define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
void asymCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover &cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void unbindCloverTex(const FullClover clover)
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
virtual long long flops() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
unsigned long long bytes
Definition: blas_quda.cu:43