QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_twisted_clover.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 #include <face_quda.h>
24 
25 #include <inline_ptx.h>
26 
27 namespace quda {
28 
29  namespace twistedclover {
30 
31 #include <dslash_constants.h>
32 #include <dslash_textures.h>
33 #include <dslash_index.cuh>
34 
35  // Enable shared memory dslash for Fermi architecture
36  //#define SHARED_WILSON_DSLASH
37  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
38 
39 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
40 #include <tmc_dslash_def.h> // Twisted Clover kernels
41 #endif
42 
43 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
44 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
45 #endif
46 
47 #include <dslash_quda.cuh>
48 
49  } // end namespace twisted_clover
50 
51  // declare the dslash events
52 #include <dslash_events.cuh>
53 
54  using namespace twistedclover;
55 
56 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
57  template <typename sFloat, typename gFloat, typename cFloat>
58  class TwistedCloverDslashCuda : public SharedDslashCuda {
59 
60  private:
61  const gFloat *gauge0, *gauge1;
62  const QudaTwistCloverDslashType dslashType;
63  double a, b, c, d;
64  const cFloat *clover;
65  const float *cNorm;
66  const cFloat *cloverInv;
67  const float *cNrm2;
68 
69  protected:
70  unsigned int sharedBytesPerThread() const
71  {
72 #if (__COMPUTE_CAPABILITY__ >= 200)
73  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
74  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
75  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
76  } else {
77  return 0;
78  }
79 #else
80  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
81  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
82 #endif
83  }
84 
85  public:
86  TwistedCloverDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
87  const QudaReconstructType reconstruct, const cFloat *clover, const float *cNorm,
88  const cFloat *cloverInv, const float *cNrm2, int cl_stride, const cudaColorSpinorField *in,
89  const cudaColorSpinorField *x, const QudaTwistCloverDslashType dslashType, const double kappa,
90  const double mu, const double epsilon, const double k, const int dagger)
91  : SharedDslashCuda(out, in, x, reconstruct,dagger),gauge0(gauge0), gauge1(gauge1), clover(clover),
92  cNorm(cNorm), cloverInv(cloverInv), cNrm2(cNrm2), dslashType(dslashType)
93  {
94  bindSpinorTex<sFloat>(in, out, x);
95  dslashParam.cl_stride = cl_stride;
96  dslashParam.fl_stride = in->VolumeCB();
97  a = kappa;
98  b = mu;
99  c = epsilon;
100  d = k;
101  }
102  virtual ~TwistedCloverDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
103 
104  TuneKey tuneKey() const
105  {
106  TuneKey key = DslashCuda::tuneKey();
107  switch(dslashType){
109  strcat(key.aux,",CloverTwistInvDslash");
110  break;
112  strcat(key.aux,",Dslash");
113  break;
115  strcat(key.aux,",DslashCloverTwist");
116  break;
117  }
118  return key;
119  }
120 
121  void apply(const cudaStream_t &stream)
122  {
123 #ifdef SHARED_WILSON_DSLASH
124  if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
125  errorQuda("Shared dslash does not yet support X-dimension partitioning");
126 #endif
127  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
128  switch(dslashType){
129 
131  DSLASH(twistedCloverInvDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
132  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, clover, cNorm, cloverInv, cNrm2,
133  (sFloat*)in->V(), (float*)in->Norm(), a, b, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0));
134  break;
136  DSLASH(twistedCloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
137  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, clover, cNorm, cloverInv, cNrm2,
138  (sFloat*)in->V(), (float*)in->Norm(), a, b, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0));
139  break;
141  DSLASH(twistedCloverDslashTwist, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
142  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, clover, cNorm, cloverInv, cNrm2,
143  (sFloat*)in->V(), (float*)in->Norm(), a, b, (sFloat*)x->V(), (float*)x->Norm());
144  break;
145  default: errorQuda("Invalid twisted clover dslash type");
146  }
147  }
148 
149  long long flops() const { return (x ? 1416ll : 1392ll) * in->VolumeCB(); } // FIXME for multi-GPU
150  };
151 #endif // GPU_TWISTED_CLOVER_DIRAC
152 
153 #include <dslash_policy.cuh>
154 
155  void twistedCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover *clover, const FullClover *cloverInv,
156  const cudaColorSpinorField *in, const int parity, const int dagger,
157  const cudaColorSpinorField *x, const QudaTwistCloverDslashType type, const double &kappa, const double &mu,
158  const double &epsilon, const double &k, const int *commOverride,
159  TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
160  {
161  if (dslashPolicy == QUDA_FUSED_DSLASH || dslashPolicy == QUDA_FUSED_GPU_COMMS_DSLASH)
162  errorQuda("Twisted-clover dslash does not yet support a fused exterior dslash kernel");
163 
164  inSpinor = (cudaColorSpinorField*)in; // EVIL
165 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
166  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
167 
168  int ghost_threads[4] = {0};
169  int bulk_threads = ((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) ? in->Volume() : in->Volume() / 2;
170 
171  for(int i=0;i<4;i++){
172  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
173  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
174  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
175  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
176  ghost_threads[i] = ((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) ? in->GhostFace()[i] : in->GhostFace()[i] / 2;
177  }
178 
179 #ifdef MULTI_GPU
180  twist_a = 0.;
181  twist_b = 0.;
182 #endif
183 
184  void *gauge0, *gauge1;
185  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
186 
187  void *cloverP, *cloverNormP, *cloverInvP, *cloverInvNormP;
188  QudaPrecision clover_prec = bindTwistedCloverTex(*clover, *cloverInv, parity, &cloverP, &cloverNormP, &cloverInvP, &cloverInvNormP);
189 
190  if (in->Precision() != clover_prec)
191  errorQuda("Mixing clover and spinor precision not supported");
192 
193  if (in->Precision() != gauge.Precision())
194  errorQuda("Mixing gauge and spinor precision not supported");
195 
196  if (clover->stride != cloverInv->stride)
197  errorQuda("clover and cloverInv must have matching strides (%d != %d)", clover->stride, cloverInv->stride);
198 
199  DslashCuda *dslash = 0;
200  size_t regSize = sizeof(float);
201 
202  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
203 #if (__COMPUTE_CAPABILITY__ >= 130)
204  dslash = new TwistedCloverDslashCuda<double2,double2,double2>(out, (double2*)gauge0,(double2*)gauge1, gauge.Reconstruct(), (double2*)cloverP, (float*)cloverNormP,
205  (double2*)cloverInvP, (float*)cloverInvNormP, clover->stride, in, x, type, kappa, mu, epsilon, k, dagger);
206 
207  regSize = sizeof(double);
208 #else
209  errorQuda("Double precision not supported on this GPU");
210 #endif
211  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
212  dslash = new TwistedCloverDslashCuda<float4,float4,float4>(out, (float4*)gauge0,(float4*)gauge1, gauge.Reconstruct(), (float4*)cloverP, (float*)cloverNormP,
213  (float4*)cloverInvP, (float*)cloverInvNormP, clover->stride, in, x, type, kappa, mu, epsilon, k, dagger);
214 
215  } else if (in->Precision() == QUDA_HALF_PRECISION) {
216  dslash = new TwistedCloverDslashCuda<short4,short4,short4>(out, (short4*)gauge0,(short4*)gauge1, gauge.Reconstruct(), (short4*)cloverP, (float*)cloverNormP,
217  (short4*)cloverInvP, (float*)cloverInvNormP, clover->stride, in, x, type, kappa, mu, epsilon, k, dagger);
218  }
219 
220 #ifndef GPU_COMMS
221  DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
222 #else
223  DslashPolicyImp* dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
224 #endif
225  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, bulk_threads, ghost_threads, profile);
226  delete dslashImp;
227 
228  delete dslash;
229 
230  unbindGaugeTex(gauge);
231  unbindTwistedCloverTex(*clover);
232 
233  checkCudaError();
234 #else
235 
236 #if (__COMPUTE_CAPABILITY__ < 200)
237  errorQuda("Twisted-clover fermions not supported on pre-Fermi architecture");
238 #else
239  errorQuda("Twisted clover dslash has not been built");
240 #endif
241 
242 #endif
243  }
244 
245 }
QudaPrecision bindTwistedCloverTex(const FullClover clover, const FullClover cloverInv, const int oddBit, void **cloverP, void **cloverNormP, void **cloverInvP, void **cloverInvNormP)
void unbindGaugeTex(const cudaGaugeField &gauge)
enum QudaPrecision_s QudaPrecision
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
enum QudaTwistCloverDslashType_s QudaTwistCloverDslashType
cudaStream_t * stream
int GhostNormOffset(const int i) const
QudaDagType dagger
Definition: test_util.cpp:1558
QudaPrecision Precision() const
VOLATILE spinorFloat kappa
cpuColorSpinorField * in
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
void twistedCloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover *clover, const FullClover *cloverInv, const cudaColorSpinorField *in, const int parity, const int dagger, const cudaColorSpinorField *x, const QudaTwistCloverDslashType type, const double &kappa, const double &mu, const double &epsilon, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
int x[4]
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
QudaTwistFlavorType TwistFlavor() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
#define DSLASH_SHARED_FLOATS_PER_THREAD
#define checkCudaError()
Definition: util_quda.h:110
void unbindTwistedCloverTex(const FullClover clover)
QudaTune getTuning()
Definition: util_quda.cpp:32
int GhostOffset(const int i) const
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
const int * GhostFace() const