QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_wilson.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 
20 #include <quda_internal.h>
21 #include <dslash_quda.h>
22 #include <sys/time.h>
23 #include <blas_quda.h>
24 #include <face_quda.h>
25 
26 #include <inline_ptx.h>
27 
28 namespace quda {
29 
30  namespace wilson {
31 
32 #include <dslash_constants.h>
33 #include <dslash_textures.h>
34 #include <dslash_index.cuh>
35 
36  // Enable shared memory dslash for Fermi architecture
37  //#define SHARED_WILSON_DSLASH
38  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
39 
40 #ifdef GPU_WILSON_DIRAC
41 #define DD_CLOVER 0
42 #include <wilson_dslash_def.h> // Wilson Dslash kernels (including clover)
43 #undef DD_CLOVER
44 #endif
45 
46 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
47 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
48 #endif
49 
50 #include <dslash_quda.cuh>
51 
52  } // end namespace wilson
53 
54  // declare the dslash events
55 #include <dslash_events.cuh>
56 
57  using namespace wilson;
58 
59 #ifdef GPU_WILSON_DIRAC
60  template <typename sFloat, typename gFloat>
61  class WilsonDslashCuda : public SharedDslashCuda {
62 
63  private:
64  const gFloat *gauge0, *gauge1;
65  const double a;
66 
67  protected:
68  unsigned int sharedBytesPerThread() const
69  {
70 #if (__COMPUTE_CAPABILITY__ >= 200) // Fermi uses shared memory for common input
71  if (dslashParam.kernel_type == INTERIOR_KERNEL) { // Interior kernels use shared memory for common iunput
72  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
73  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
74  } else { // Exterior kernels use no shared memory
75  return 0;
76  }
77 #else // Pre-Fermi uses shared memory only for pseudo-registers
78  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
79  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
80 #endif
81  }
82 
83  public:
84  WilsonDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
85  const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
86  const cudaColorSpinorField *x, const double a, const int dagger)
87  : SharedDslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), a(a)
88  {
89  bindSpinorTex<sFloat>(in, out, x);
90  }
91 
92  virtual ~WilsonDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
93 
94  void apply(const cudaStream_t &stream)
95  {
96 #ifdef SHARED_WILSON_DSLASH
97  if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
98  errorQuda("Shared dslash does not yet support X-dimension partitioning");
99 #endif
100  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
101  DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream,
102  dslashParam, (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
103  (sFloat*)in->V(), (float*)in->Norm(), (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
104  }
105 
106  long long flops() const { return (x ? 1368ll : 1320ll) * in->VolumeCB(); } // FIXME for multi-GPU
107  };
108 #endif // GPU_WILSON_DIRAC
109 
110 #include <dslash_policy.cuh>
111 
112  // Wilson wrappers
114  const int parity, const int dagger, const cudaColorSpinorField *x, const double &k,
115  const int *commOverride, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
116  {
117  inSpinor = (cudaColorSpinorField*)in; // EVIL
118 
119 #ifdef GPU_WILSON_DIRAC
120  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
121  for(int i=0;i<4;i++){
122  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
123  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
124  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
125  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
126  }
127 
128  void *gauge0, *gauge1;
129  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
130 
131  if (in->Precision() != gauge.Precision())
132  errorQuda("Mixing gauge %d and spinor %d precision not supported",
133  gauge.Precision(), in->Precision());
134 
135  DslashCuda *dslash = 0;
136  size_t regSize = sizeof(float);
137  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
138 #if (__COMPUTE_CAPABILITY__ >= 130)
139  dslash = new WilsonDslashCuda<double2, double2>(out, (double2*)gauge0, (double2*)gauge1,
140  gauge.Reconstruct(), in, x, k, dagger);
141  regSize = sizeof(double);
142 #else
143  errorQuda("Double precision not supported on this GPU");
144 #endif
145  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
146  dslash = new WilsonDslashCuda<float4, float4>(out, (float4*)gauge0, (float4*)gauge1,
147  gauge.Reconstruct(), in, x, k, dagger);
148  } else if (in->Precision() == QUDA_HALF_PRECISION) {
149  dslash = new WilsonDslashCuda<short4, short4>(out, (short4*)gauge0, (short4*)gauge1,
150  gauge.Reconstruct(), in, x, k, dagger);
151  }
152 
153 #ifndef GPU_COMMS
154  DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
155 #else
156  DslashPolicyImp* dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
157 #endif
158 
159  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume(), in->GhostFace(), profile);
160  delete dslashImp;
161 
162  delete dslash;
163  unbindGaugeTex(gauge);
164 
165  checkCudaError();
166 #else
167  errorQuda("Wilson dslash has not been built");
168 #endif // GPU_WILSON_DIRAC
169 
170  }
171 
172 }
void unbindGaugeTex(const cudaGaugeField &gauge)
#define DSLASH_SHARED_FLOATS_PER_THREAD
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
int GhostNormOffset(const int i) const
QudaDagType dagger
Definition: test_util.cpp:1558
QudaPrecision Precision() const
cpuColorSpinorField * in
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
int x[4]
void wilsonDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
int GhostOffset(const int i) const
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
const int * GhostFace() const