QUDA  0.9.0
dslash_wilson.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 
20 #include <quda_internal.h>
21 #include <dslash_quda.h>
22 #include <sys/time.h>
23 #include <blas_quda.h>
24 
25 #include <inline_ptx.h>
26 
27 namespace quda {
28 
29  namespace wilson {
30 
31 #include <dslash_constants.h>
32 #include <dslash_textures.h>
33 #include <dslash_index.cuh>
34 
35  // Enable shared memory dslash for Fermi architecture
36  //#define SHARED_WILSON_DSLASH
37  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
38 
39 #ifdef GPU_WILSON_DIRAC
40 #define DD_CLOVER 0
41 #include <wilson_dslash_def.h> // Wilson Dslash kernels (including clover)
42 #undef DD_CLOVER
43 #endif
44 
45 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
46 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
47 #endif
48 
49 #include <dslash_quda.cuh>
50 
51  } // end namespace wilson
52 
53  // declare the dslash events
54 #include <dslash_events.cuh>
55 
56  using namespace wilson;
57 
58 #ifdef GPU_WILSON_DIRAC
/**
   Launcher for the Wilson dslash kernels.

   sFloat selects the spinor storage type; it decides the per-thread
   shared-memory word size and which spinor textures are (un)bound.
   gFloat is not referenced directly in this class body (it may be
   consumed by the DSLASH macro expansion -- confirm there).
*/
template <typename sFloat, typename gFloat>
class WilsonDslashCuda : public SharedDslashCuda {

protected:
  /**
     @return Shared-memory bytes requested per thread for the kernel
     type currently selected in dslashParam.
  */
  unsigned int sharedBytesPerThread() const
  {
    // Exterior kernels use no shared memory
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return 0;

    // Interior kernels use shared memory for common input; the word
    // size is that of a double for double2 spinors, of a float otherwise
    int word_bytes = sizeof(float);
    if (typeid(sFloat) == typeid(double2)) word_bytes = sizeof(double);

    return DSLASH_SHARED_FLOATS_PER_THREAD * word_bytes;
  }

public:
  /**
     Forwards the fields and flags to SharedDslashCuda and records the
     scale factor a in dslashParam (both the a and a_f members).

     @param out Output spinor field
     @param gauge Gauge field
     @param in Input spinor field
     @param x Additional spinor field passed through to the base class
     @param a Scale factor stored in dslashParam.a / dslashParam.a_f
     @param parity Parity flag passed through to the base class
     @param dagger Dagger flag passed through to the base class
     @param commOverride Communication override array passed through to the base class
  */
  WilsonDslashCuda(cudaColorSpinorField *out, const GaugeField &gauge, const cudaColorSpinorField *in,
                   const cudaColorSpinorField *x, const double a, const int parity, const int dagger,
                   const int *commOverride)
    : SharedDslashCuda(out, in, x, gauge, parity, dagger, commOverride)
  {
    dslashParam.a = a;
    dslashParam.a_f = a;
  }

  /** Unbinds the spinor textures associated with in, out and x. */
  virtual ~WilsonDslashCuda()
  {
    unbindSpinorTex<sFloat>(in, out, x);
  }

  /**
     Tunes the launch configuration, fills in the block/grid entries of
     dslashParam and launches the kernel through the DSLASH macro.

     @param stream Stream handed to the DSLASH macro
  */
  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    if (dslashParam.kernel_type == EXTERIOR_KERNEL_X) {
      errorQuda("Shared dslash does not yet support X-dimension partitioning");
    }
#endif
#ifndef USE_TEXTURE_OBJECTS
    // textures are bound here only for the interior kernel
    if (dslashParam.kernel_type == INTERIOR_KERNEL) {
      bindSpinorTex<sFloat>(in, out, x);
    }
#endif // USE_TEXTURE_OBJECTS

    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    setParam();

    // the tuned auxiliary parameter supplies the four block extents
    dslashParam.block[0] = tp.aux.x;
    dslashParam.block[1] = tp.aux.y;
    dslashParam.block[2] = tp.aux.z;
    dslashParam.block[3] = tp.aux.w;

    // grid extent per dimension = lattice extent / block extent, with
    // the extent of dimension 0 doubled
    // NOTE(review): presumably in->X(0) is the checkerboarded extent -- confirm
    for (int dim = 0; dim < 4; ++dim) {
      const int extent_factor = (dim == 0) ? 2 : 1;
      dslashParam.grid[dim] = (extent_factor * in->X(dim)) / dslashParam.block[dim];
    }

    DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam);
  }

};
103 #endif // GPU_WILSON_DIRAC
104 
105 #include <dslash_policy.cuh>
106 
107  // Wilson wrappers
/**
   Host wrapper that applies the Wilson dslash.

   Creates the communication buffers on the input field, instantiates
   WilsonDslashCuda for the precision reported by in->Precision(), and
   runs it through DslashPolicyTune. When GPU_WILSON_DIRAC is not
   defined, the call only reports an error through errorQuda.

   @param out Output spinor field
   @param gauge Gauge field
   @param in Input spinor field; its precision selects the kernel instantiation
   @param parity Parity flag forwarded to WilsonDslashCuda
   @param dagger Dagger flag forwarded to WilsonDslashCuda
   @param x Additional spinor field forwarded to WilsonDslashCuda
   @param k Scale factor forwarded as the "a" argument of WilsonDslashCuda
   @param commOverride Communication override array forwarded to WilsonDslashCuda
   @param profile Time profile handed to DslashPolicyTune
*/
void wilsonDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in,
                      const int parity, const int dagger, const cudaColorSpinorField *x, const double &k,
                      const int *commOverride, TimeProfile &profile)
{
#ifdef GPU_WILSON_DIRAC
  const_cast<cudaColorSpinorField*>(in)->createComms(1);

  DslashCuda *dslash = nullptr;
  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
    dslash = new WilsonDslashCuda<double2, double2>(out, gauge, in, x, k, parity, dagger, commOverride);
  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
    dslash = new WilsonDslashCuda<float4, float4>(out, gauge, in, x, k, parity, dagger, commOverride);
  } else if (in->Precision() == QUDA_HALF_PRECISION) {
    dslash = new WilsonDslashCuda<short4, short4>(out, gauge, in, x, k, parity, dagger, commOverride);
  } else {
    // Without this branch an unhandled precision would leave dslash null
    // and the dereference below would be undefined behaviour.
    errorQuda("Unsupported precision %d", static_cast<int>(in->Precision()));
    return; // presumably unreachable if errorQuda aborts; guards the dereference either way
  }

  DslashPolicyTune dslash_policy(*dslash, const_cast<cudaColorSpinorField*>(in), in->Volume(), in->GhostFace(), profile);
  dslash_policy.apply(0);

  delete dslash;
#else
  errorQuda("Wilson dslash has not been built");
#endif // GPU_WILSON_DIRAC

}
133 
134 }
#define DSLASH_SHARED_FLOATS_PER_THREAD
void setParam(int kernel, int prec, int threads, int blocks)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void wilsonDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile)
cudaStream_t * stream
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param)
cpuColorSpinorField * out
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
#define a