QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_staggered.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // These macros control direct (non-texture) memory access for the staggered action
10 #ifdef GPU_STAGGERED_DIRAC
11 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
12 //#define DIRECT_ACCESS_FAT_LINK
13 //#define DIRECT_ACCESS_LONG_LINK
14 //#define DIRECT_ACCESS_SPINOR
15 //#define DIRECT_ACCESS_ACCUM
16 //#define DIRECT_ACCESS_INTER
17 //#define DIRECT_ACCESS_PACK
18 #elif (__COMPUTE_CAPABILITY__ >= 200)
19 //#define DIRECT_ACCESS_FAT_LINK
20 //#define DIRECT_ACCESS_LONG_LINK
21 #define DIRECT_ACCESS_SPINOR
22 //#define DIRECT_ACCESS_ACCUM
23 //#define DIRECT_ACCESS_INTER
24 //#define DIRECT_ACCESS_PACK
25 #else
26 #define DIRECT_ACCESS_FAT_LINK
27 //#define DIRECT_ACCESS_LONG_LINK
28 //#define DIRECT_ACCESS_SPINOR
29 //#define DIRECT_ACCESS_ACCUM
30 //#define DIRECT_ACCESS_INTER
31 //#define DIRECT_ACCESS_PACK
32 #endif
33 #endif // GPU_STAGGERED_DIRAC
34 
35 #include <quda_internal.h>
36 #include <dslash_quda.h>
37 #include <sys/time.h>
38 #include <blas_quda.h>
39 #include <face_quda.h>
40 
41 #include <inline_ptx.h>
42 
43 namespace quda {
44 
45  namespace staggered {
46 #include <dslash_constants.h>
47 #include <dslash_textures.h>
48 #include <dslash_index.cuh>
49 
50 #undef GPU_CLOVER_DIRAC
51 #undef GPU_DOMAIN_WALL_DIRAC
52 #define DD_IMPROVED 0
53 #include <staggered_dslash_def.h> // staggered Dslash kernels
54 #undef DD_IMPROVED
55 
56 #include <dslash_quda.cuh>
57  } // end namespace staggered
58 
59  // declare the dslash events
60 #include <dslash_events.cuh>
61 
62  using namespace staggered;
63 
64  template<typename T> struct RealType {};
65  template<> struct RealType<double2> { typedef double type; };
66  template<> struct RealType<float2> { typedef float type; };
67  template<> struct RealType<float4> { typedef float type; };
68  template<> struct RealType<short2> { typedef short type; };
69  template<> struct RealType<short4> { typedef short type; };
70 
#ifdef GPU_STAGGERED_DIRAC
  /**
     Tunable launcher for the naive (unimproved, DD_IMPROVED=0) staggered
     dslash kernels.  sFloat is the spinor vector type (double2/float2/short2)
     and gFloat the gauge-link vector type.  Binds the spinor textures on
     construction and unbinds them on destruction.
  */
  template <typename sFloat, typename gFloat>
  class StaggeredDslashCuda : public DslashCuda {

  private:
    const gFloat *gauge0, *gauge1;  // even- and odd-parity halves of the fat-link field
    const double a;                 // scale factor for the optional accumulate field x

  protected:
    // Each thread stages 6 real components (3 complex colors) in shared
    // memory, sized by the spinor's register precision.
    unsigned int sharedBytesPerThread() const
    {
      size_t reg_bytes = (typeid(sFloat) == typeid(double2)) ? sizeof(double) : sizeof(float);
      return 6 * reg_bytes;
    }

  public:
    StaggeredDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
                        const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
                        const cudaColorSpinorField *x, const double a, const int dagger)
      : DslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), a(a)
    {
      bindSpinorTex<sFloat>(in, out, x);
    }

    virtual ~StaggeredDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }

    // Launch the autotuned kernel on the given stream.
    void apply(const cudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      // Ceiling division so every site is covered when threads % block.x != 0.
      // Named "grid" to avoid shadowing the CUDA built-in gridDim.
      dim3 grid((dslashParam.threads + tp.block.x - 1) / tp.block.x, 1, 1);
      STAGGERED_DSLASH(grid, tp.block, tp.shared_bytes, stream, dslashParam,
                       (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
                       (sFloat*)in->V(), (float*)in->Norm(),
                       (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
    }

    int Nface() { return 2; }

    // 654 flops per checkerboard site, plus 12 more when the axpy with x is fused in.
    long long flops() const { return (x ? 666ll : 654ll) * in->VolumeCB(); }
  };
#endif // GPU_STAGGERED_DIRAC
116 
117 #include <dslash_policy.cuh>
118 
120  const cudaColorSpinorField *in, const int parity,
121  const int dagger, const cudaColorSpinorField *x,
122  const double &k, const int *commOverride, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
123  {
124  inSpinor = (cudaColorSpinorField*)in; // EVIL
125 
126 #ifdef GPU_STAGGERED_DIRAC
127 
128  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
129 
130  dslashParam.parity = parity;
131  dslashParam.gauge_stride = gauge.Stride();
132  dslashParam.fat_link_max = gauge.LinkMax(); // May need to use this in the preconditioning step
133  // in the solver for the improved staggered action
134 
135  for(int i=0;i<4;i++){
136  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
137  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
138  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
139  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
140  }
141  void *gauge0, *gauge1;
142  bindFatGaugeTex(gauge, parity, &gauge0, &gauge1);
143 
144  if (in->Precision() != gauge.Precision()) {
145  errorQuda("Mixing precisions gauge=%d and spinor=%d not supported",
146  gauge.Precision(), in->Precision());
147  }
148 
149  DslashCuda *dslash = 0;
150  size_t regSize = sizeof(float);
151 
152  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
153 #if (__COMPUTE_CAPABILITY__ >= 130)
154  dslash = new StaggeredDslashCuda<double2, double2>
155  (out, (double2*)gauge0, (double2*)gauge1, gauge.Reconstruct(), in, x, k, dagger);
156  regSize = sizeof(double);
157 #else
158  errorQuda("Double precision not supported on this GPU");
159 #endif
160  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
161  dslash = new StaggeredDslashCuda<float2, float2>
162  (out, (float2*)gauge0, (float2*)gauge1, gauge.Reconstruct(), in, x, k, dagger);
163  } else if (in->Precision() == QUDA_HALF_PRECISION) {
164  dslash = new StaggeredDslashCuda<short2, short2>
165  (out, (short2*)gauge0, (short2*)gauge1, gauge.Reconstruct(), in, x, k, dagger);
166  }
167 
168 #ifndef GPU_COMMS
169  DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
170 #else
171  DslashPolicyImp* dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
172 #endif
173 
174  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume(), in->GhostFace(), profile);
175  delete dslashImp;
176 
177  delete dslash;
178  unbindFatGaugeTex(gauge);
179 
180  checkCudaError();
181 
182 #else
183  errorQuda("Staggered dslash has not been built");
184 #endif // GPU_STAGGERED_DIRAC
185  }
186 
187 }
int commDimPartitioned(int dir)
void bindFatGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
void staggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
int GhostNormOffset(const int i) const
QudaDagType dagger
Definition: test_util.cpp:1558
QudaPrecision Precision() const
cpuColorSpinorField * in
enum QudaDslashPolicy_s QudaDslashPolicy
int Stride() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
const double & LinkMax() const
Definition: gauge_field.h:192
int x[4]
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
int GhostOffset(const int i) const
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
const int * GhostFace() const
void unbindFatGaugeTex(const cudaGaugeField &gauge)