QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_clover.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 #include <quda_internal.h>
20 #include <dslash_quda.h>
21 #include <sys/time.h>
22 #include <blas_quda.h>
23 #include <face_quda.h>
24 
25 #include <inline_ptx.h>
26 
27 namespace quda {
28 
29  namespace clover {
30 
31 #undef GPU_STAGGERED_DIRAC // do not delete - hack for Tesla architecture
32 #define GPU_DOMAIN_WALL_DIRAC // do not delete - work around for CUDA 6.5 alignment bug
33 
34 #include <dslash_constants.h>
35 #include <dslash_textures.h>
36 #include <dslash_index.cuh>
37 
38  // Enable shared memory dslash for Fermi architecture
39  //#define SHARED_WILSON_DSLASH
40  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
41 
42 #ifdef GPU_CLOVER_DIRAC
43 #define DD_CLOVER 1
44 #include <wilson_dslash_def.h> // Wilson Dslash kernels (including clover)
45 #undef DD_CLOVER
46 #endif
47 
48 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
49 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
50 #endif
51 
52 #include <dslash_quda.cuh>
53 
54  } // end namespace clover
55 
56  // declare the dslash events
57 #include <dslash_events.cuh>
58 
59  using namespace clover;
60 
61 #ifdef GPU_CLOVER_DIRAC
62  template <typename sFloat, typename gFloat, typename cFloat>
63  class CloverDslashCuda : public SharedDslashCuda {
64 
65  private:
66  const gFloat *gauge0, *gauge1;
67  const cFloat *clover;
68  const float *cloverNorm;
69  const double a;
70 
71  protected:
72  unsigned int sharedBytesPerThread() const
73  {
74 #if (__COMPUTE_CAPABILITY__ >= 200)
75  if (dslashParam.kernel_type == INTERIOR_KERNEL) {
76  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
77  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
78  } else {
79  return 0;
80  }
81 #else
82  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
83  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
84 #endif
85  }
86  public:
87  CloverDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
88  const QudaReconstructType reconstruct, const cFloat *clover,
89  const float *cloverNorm, int cl_stride, const cudaColorSpinorField *in,
90  const cudaColorSpinorField *x, const double a, const int dagger)
91  : SharedDslashCuda(out, in, x, reconstruct, dagger), gauge0(gauge0), gauge1(gauge1), clover(clover),
92  cloverNorm(cloverNorm), a(a)
93  {
94  bindSpinorTex<sFloat>(in, out, x);
95  dslashParam.cl_stride = cl_stride;
96  }
97  virtual ~CloverDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
98 
99  void apply(const cudaStream_t &stream)
100  {
101 #ifdef SHARED_WILSON_DSLASH
102  if (dslashParam.kernel_type == EXTERIOR_KERNEL_X)
103  errorQuda("Shared dslash does not yet support X-dimension partitioning");
104 #endif
105  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
106  DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
107  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, clover, cloverNorm,
108  (sFloat*)in->V(), (float*)in->Norm(), (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
109  }
110 
111  long long flops() const { return (x ? 1872ll : 1824ll) * in->VolumeCB(); } // FIXME for multi-GPU
112  };
113 #endif // GPU_CLOVER_DIRAC
114 
115 #include <dslash_policy.cuh>
116 
118  const cudaColorSpinorField *in, const int parity, const int dagger,
119  const cudaColorSpinorField *x, const double &a, const int *commOverride,
120  TimeProfile &profile, const QudaDslashPolicy &dslashPolicy)
121  {
122  inSpinor = (cudaColorSpinorField*)in; // EVIL
123 
124 #ifdef GPU_CLOVER_DIRAC
125  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
126  for(int i=0;i<4;i++){
127  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
128  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
129  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
130  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
131  }
132 
133  void *cloverP, *cloverNormP;
134  QudaPrecision clover_prec = bindCloverTex(cloverInv, parity, &cloverP, &cloverNormP);
135 
136  void *gauge0, *gauge1;
137  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
138 
139  if (in->Precision() != gauge.Precision())
140  errorQuda("Mixing gauge and spinor precision not supported");
141 
142  if (in->Precision() != clover_prec)
143  errorQuda("Mixing clover and spinor precision not supported");
144 
145  DslashCuda *dslash = 0;
146  size_t regSize = sizeof(float);
147 
148  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
149 #if (__COMPUTE_CAPABILITY__ >= 130)
150  dslash = new CloverDslashCuda<double2, double2, double2>
151  (out, (double2*)gauge0, (double2*)gauge1, gauge.Reconstruct(),
152  (double2*)cloverP, (float*)cloverNormP, cloverInv.stride, in, x, a, dagger);
153  regSize = sizeof(double);
154 #else
155  errorQuda("Double precision not supported on this GPU");
156 #endif
157  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
158  dslash = new CloverDslashCuda<float4, float4, float4>
159  (out, (float4*)gauge0, (float4*)gauge1, gauge.Reconstruct(),
160  (float4*)cloverP, (float*)cloverNormP, cloverInv.stride, in, x, a, dagger);
161  } else if (in->Precision() == QUDA_HALF_PRECISION) {
162  dslash = new CloverDslashCuda<short4, short4, short4>
163  (out, (short4*)gauge0, (short4*)gauge1, gauge.Reconstruct(),
164  (short4*)cloverP, (float*)cloverNormP, cloverInv.stride, in, x, a, dagger);
165  }
166 
167 #ifndef GPU_COMMS
168  DslashPolicyImp* dslashImp = DslashFactory::create(dslashPolicy);
169 #else
170  DslashPolicyImp* dslashImp = DslashFactory::create(QUDA_GPU_COMMS_DSLASH);
171 #endif
172  (*dslashImp)(*dslash, const_cast<cudaColorSpinorField*>(in), regSize, parity, dagger, in->Volume(), in->GhostFace(), profile);
173  delete dslashImp;
174 
175  delete dslash;
176  unbindGaugeTex(gauge);
177  unbindCloverTex(cloverInv);
178 
179  checkCudaError();
180 #else
181  errorQuda("Clover dslash has not been built");
182 #endif
183 
184  }
185 
186 }
void unbindGaugeTex(const cudaGaugeField &gauge)
enum QudaPrecision_s QudaPrecision
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
int GhostNormOffset(const int i) const
#define DSLASH_SHARED_FLOATS_PER_THREAD
QudaDagType dagger
Definition: test_util.cpp:1558
void cloverDslashCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover cloverInv, const cudaColorSpinorField *in, const int oddBit, const int daggerBit, const cudaColorSpinorField *x, const double &k, const int *commDim, TimeProfile &profile, const QudaDslashPolicy &dslashPolicy=QUDA_DSLASH2)
QudaPrecision Precision() const
cpuColorSpinorField * in
enum QudaDslashPolicy_s QudaDslashPolicy
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
void unbindCloverTex(const FullClover clover)
int x[4]
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * out
enum QudaReconstructType_s QudaReconstructType
QudaPrecision Precision() const
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, void **gauge0, void **gauge1)
QudaPrecision bindCloverTex(const FullClover clover, const int oddBit, void **cloverP, void **cloverNormP)
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
int GhostOffset(const int i) const
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
const int * GhostFace() const