QUDA v0.7.0
A library for QCD on GPUs
dslash_improved_staggered.cu
#include <cstdlib>
#include <cstdio>
#include <string>
#include <iostream>

#include <color_spinor_field.h>
#include <clover_field.h>

// Access control for the staggered action: defining a DIRECT_ACCESS_* macro
// switches that field from texture fetches to direct global-memory loads.
#ifdef GPU_STAGGERED_DIRAC
#if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
//#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
//#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#elif (__COMPUTE_CAPABILITY__ >= 200)
//#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#else
#define DIRECT_ACCESS_FAT_LINK
//#define DIRECT_ACCESS_LONG_LINK
//#define DIRECT_ACCESS_SPINOR
//#define DIRECT_ACCESS_ACCUM
//#define DIRECT_ACCESS_INTER
//#define DIRECT_ACCESS_PACK
#endif
#endif // GPU_STAGGERED_DIRAC
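// In short: Kepler (sm_30+) reads every field through the texture cache,
// Fermi (sm_2x) reads only the spinor directly, and older devices read the
// fat links directly.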

#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <blas_quda.h>
#include <face_quda.h>

#include <inline_ptx.h>

namespace quda {

  namespace improvedstaggered {
#include <dslash_constants.h>
#include <dslash_textures.h>
#include <dslash_index.cuh>

#define STAGGERED_TESLA_HACK
#undef GPU_NDEG_TWISTED_MASS_DIRAC
#undef GPU_CLOVER_DIRAC
#undef GPU_DOMAIN_WALL_DIRAC
#define DD_IMPROVED 1
#include <staggered_dslash_def.h> // staggered Dslash kernels
#undef DD_IMPROVED

#include <dslash_quda.cuh>
  } // end namespace improvedstaggered
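  // The kernel headers above are included inside their own namespace so that
  // these instantiations do not collide with the plain staggered ones;
  // DD_IMPROVED=1 makes staggered_dslash_def.h generate the improved
  // (fat-link plus long-link) kernel variants.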

  // declare the dslash events
#include <dslash_events.cuh>

  using namespace improvedstaggered;

  template<typename T> struct RealType {};
  template<> struct RealType<double2> { typedef double type; };
  template<> struct RealType<float2>  { typedef float type;  };
  template<> struct RealType<float4>  { typedef float type;  };
  template<> struct RealType<short2>  { typedef short type;  };
  template<> struct RealType<short4>  { typedef short type;  };
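  // RealType is a small traits helper mapping a CUDA vector type to its
  // scalar component type, e.g. RealType<float4>::type is float; it is used
  // by code that needs the matching scalar precision for a given vector type.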

#ifdef GPU_STAGGERED_DIRAC
  template <typename sFloat, typename fatGFloat, typename longGFloat, typename phaseFloat>
  class StaggeredDslashCuda : public DslashCuda {

  private:
    const fatGFloat *fat0, *fat1;
    const longGFloat *long0, *long1;
    const phaseFloat *phase0, *phase1;
    const double a;

  protected:
    unsigned int sharedBytesPerThread() const
    {
      // six reals per thread, the size of one staggered color vector (3 complex numbers)
      int reg_size = (typeid(sFloat) == typeid(double2) ? sizeof(double) : sizeof(float));
      return 6 * reg_size;
    }

  public:
    StaggeredDslashCuda(cudaColorSpinorField *out, const fatGFloat *fat0, const fatGFloat *fat1,
                        const longGFloat *long0, const longGFloat *long1,
                        const phaseFloat *phase0, const phaseFloat *phase1,
                        const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
                        const cudaColorSpinorField *x, const double a, const int dagger)
      : DslashCuda(out, in, x, reconstruct, dagger), fat0(fat0), fat1(fat1), long0(long0),
        long1(long1), phase0(phase0), phase1(phase1), a(a)
    {
      bindSpinorTex<sFloat>(in, out, x);
    }

    virtual ~StaggeredDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }

    void apply(const cudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      dim3 gridDim((dslashParam.threads + tp.block.x - 1) / tp.block.x, 1, 1);
#if (__COMPUTE_CAPABILITY__ >= 200)
      IMPROVED_STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
                                (sFloat*)out->V(), (float*)out->Norm(),
                                fat0, fat1, long0, long1, phase0, phase1,
                                (sFloat*)in->V(), (float*)in->Norm(),
                                (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
#else
      // the phase arguments are dropped on pre-Fermi devices
      IMPROVED_STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
                                (sFloat*)out->V(), (float*)out->Norm(),
                                fat0, fat1, long0, long1,
                                (sFloat*)in->V(), (float*)in->Norm(),
                                (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
#endif
    }

    int Nface() { return 6; }

    long long flops() const {
      // 1146 flops per even/odd site; the accumulation term (x) adds 12 more
      return (x ? 1158ll : 1146ll) * in->VolumeCB();
    }
  };
#endif // GPU_STAGGERED_DIRAC

#include <dslash_policy.cuh>

  void improvedStaggeredDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &fatGauge,
                                   const cudaGaugeField &longGauge, const cudaColorSpinorField *in,
                                   const int parity, const int dagger, const cudaColorSpinorField *x,
                                   const double &k, const int *commOverride, TimeProfile &profile,
                                   const QudaDslashPolicy &dslashPolicy)
  {
    inSpinor = (cudaColorSpinorField*)in; // EVIL

#ifdef GPU_STAGGERED_DIRAC

#ifdef MULTI_GPU
    for (int i = 0; i < 4; i++) {
      if (commDimPartitioned(i) && (fatGauge.X()[i] < 6)) {
        errorQuda("Partitioned dimension with local size less than 6 is not supported in staggered dslash");
      }
    }
#endif

    int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
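    // e.g. for staggered fields (Ncolor = 3, Nspin = 1) in FLOAT2 order this
    // gives Npad = 3*1*2/2 = 3, i.e. three two-component loads per site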

    dslashParam.parity = parity;
    dslashParam.gauge_stride = fatGauge.Stride();
    dslashParam.long_gauge_stride = longGauge.Stride();
    dslashParam.fat_link_max = fatGauge.LinkMax();

    for (int i = 0; i < 4; i++) {
      dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
      dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
      dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
      dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
    }

    void *fatGauge0, *fatGauge1;
    void *longGauge0, *longGauge1;
    bindFatGaugeTex(fatGauge, parity, &fatGauge0, &fatGauge1);
    bindLongGaugeTex(longGauge, parity, &longGauge0, &longGauge1);
    void *longPhase0 = (char*)longGauge0 + longGauge.PhaseOffset();
    void *longPhase1 = (char*)longGauge1 + longGauge.PhaseOffset();

    if (in->Precision() != fatGauge.Precision() || in->Precision() != longGauge.Precision()) {
      errorQuda("Mixing gauge and spinor precision not supported "
                "(precision=%d, fatGauge.precision=%d, longGauge.precision=%d)",
                in->Precision(), fatGauge.Precision(), longGauge.Precision());
    }

    DslashCuda *dslash = 0;
    size_t regSize = sizeof(float);

    if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
      dslash = new StaggeredDslashCuda<double2, double2, double2, double>
        (out, (double2*)fatGauge0, (double2*)fatGauge1,
         (double2*)longGauge0, (double2*)longGauge1,
         (double*)longPhase0, (double*)longPhase1,
         longGauge.Reconstruct(), in, x, k, dagger);
      regSize = sizeof(double);
#else
      errorQuda("Double precision not supported on this GPU");
#endif
    } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
      dslash = new StaggeredDslashCuda<float2, float2, float4, float>
        (out, (float2*)fatGauge0, (float2*)fatGauge1,
         (float4*)longGauge0, (float4*)longGauge1,
         (float*)longPhase0, (float*)longPhase1,
         longGauge.Reconstruct(), in, x, k, dagger);
    } else if (in->Precision() == QUDA_HALF_PRECISION) {
      dslash = new StaggeredDslashCuda<short2, short2, short4, short>
        (out, (short2*)fatGauge0, (short2*)fatGauge1,
         (short4*)longGauge0, (short4*)longGauge1,
         (short*)longPhase0, (short*)longPhase1,
         longGauge.Reconstruct(), in, x, k, dagger);
    }
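    // Note the vector-type pattern: spinors and fat links always use
    // two-component vectors, while single- and half-precision long links use
    // four-component vectors (float4/short4); regSize records the scalar size
    // (float vs double) handed to the dslash policy driver below.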

#ifndef GPU_COMMS
    DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
#else
    DslashPolicyImp* dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
#endif
    (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume(), in->GhostFace(), profile);
    delete dslashImp;

    delete dslash;
    unbindFatGaugeTex(fatGauge);
    unbindLongGaugeTex(longGauge);

    checkCudaError();

#else
    errorQuda("Staggered dslash has not been built");
#endif // GPU_STAGGERED_DIRAC
  }

} // namespace quda
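
A minimal sketch of a call site, assuming the gauge and spinor fields have
already been constructed and loaded elsewhere (the field names and the
TimeProfile label below are illustrative, not part of this file; the
dslashPolicy argument is omitted since the header declaration gives it a
default of QUDA_DSLASH2):

  // Apply the improved staggered operator to the even sites of `in`,
  // with no accumulation term (x = 0, k = 0.0).
  int commOverride[4] = {1, 1, 1, 1};    // leave communication enabled in all dimensions
  TimeProfile profile("dslash_example"); // hypothetical label
  quda::improvedStaggeredDslashCuda(out, fatGauge, longGauge, in,
                                    QUDA_EVEN_PARITY, 0 /*dagger*/, 0 /*x*/,
                                    0.0 /*k*/, commOverride, profile);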