QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
color_spinor_wuppertal.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <gauge_field.h>
4 #include <gauge_field_order.h>
5 #include <index_helper.cuh>
6 #include <color_spinor.h>
7 #include <color_spinor_field.h>
9 #include <tune_quda.h>
10 
11 namespace quda {
12 
13  template <typename Float, int Ns, int Nc, QudaReconstructType gRecon>
17 
18  F out; // output vector field
19  const F in; // input vector field
20  const G U; // the gauge field
21  const Float A; // A parameter
22  const Float B; // B parameter
23  const int parity; // only use this for single parity fields
24  const int nParity; // number of parities we're working on
25  const int nFace; // hard code to 1 for now
26  const int dim[5]; // full lattice dimensions
27  const int commDim[4]; // whether a given dimension is partitioned or not
28  const int volumeCB; // checkerboarded volume
29 
30  WuppertalSmearingArg(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U,
31  Float A, Float B)
32  : out(out), in(in), U(U), A(A), B(B), parity(parity), nParity(in.SiteSubset()), nFace(1),
33  dim{ (3-nParity) * in.X(0), in.X(1), in.X(2), in.X(3), 1 },
35  volumeCB(in.VolumeCB())
36  {
37  if (in.FieldOrder() != QUDA_FLOAT2_FIELD_ORDER || !U.isNative())
38  errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
39  }
40  };
41 
50  template <typename Float, int Nc, typename Vector, typename Arg>
51  __device__ __host__ inline void computeNeighborSum(Vector &out, Arg &arg, int x_cb, int parity) {
52 
53  typedef Matrix<complex<Float>,Nc> Link;
54  const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;
55 
56  int coord[5];
57  getCoords(coord, x_cb, arg.dim, parity);
58  coord[4] = 0;
59 
60 #pragma unroll
61  for (int dir=0; dir<3; dir++) { // loop over spatial directions
62 
63  //Forward gather - compute fwd offset for vector fetch
64  const int fwd_idx = linkIndexP1(coord, arg.dim, dir);
65 
66  if ( arg.commDim[dir] && (coord[dir] + arg.nFace >= arg.dim[dir]) ) {
67  const int ghost_idx = ghostFaceIndex<1>(coord, arg.dim, dir, arg.nFace);
68 
69  const Link U = arg.U(dir, x_cb, parity);
70  const Vector in = arg.in.Ghost(dir, 1, ghost_idx, their_spinor_parity);
71 
72  out += U * in;
73  } else {
74  const Link U = arg.U(dir, x_cb, parity);
75  const Vector in = arg.in(fwd_idx, their_spinor_parity);
76 
77  out += U * in;
78  }
79 
80  //Backward gather - compute back offset for spinor and gauge fetch
81  const int back_idx = linkIndexM1(coord, arg.dim, dir);
82  const int gauge_idx = back_idx;
83 
84  if ( arg.commDim[dir] && (coord[dir] - arg.nFace < 0) ) {
85  const int ghost_idx = ghostFaceIndex<0>(coord, arg.dim, dir, arg.nFace);
86 
87  const Link U = arg.U.Ghost(dir, ghost_idx, 1-parity);
88  const Vector in = arg.in.Ghost(dir, 0, ghost_idx, their_spinor_parity);
89 
90  out += conj(U) * in;
91  } else {
92  const Link U = arg.U(dir, gauge_idx, 1-parity);
93  const Vector in = arg.in(back_idx, their_spinor_parity);
94 
95  out += conj(U) * in;
96  }
97  }
98  }
99 
100  //out(x) = A in(x) + B computeNeighborSum(out, x)
101  template <typename Float, int Ns, int Nc, typename Arg>
102  __device__ __host__ inline void computeWupperalStep(Arg &arg, int x_cb, int parity)
103  {
105  Vector out;
106 
107  computeNeighborSum<Float,Nc>(out, arg, x_cb, parity);
108 
109  Vector in = arg.in(x_cb, parity);
110  out = arg.A*in + arg.B*out;
111 
112  arg.out(x_cb, parity) = out;
113  }
114 
115  // CPU kernel for applying a wuppertal smearing step to a vector
116  template <typename Float, int Ns, int Nc, typename Arg>
118  {
119 
120  for (int parity= 0; parity < arg.nParity; parity++) {
121  // for full fields then set parity from loop else use arg setting
122  parity = (arg.nParity == 2) ? parity : arg.parity;
123 
124  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
125  computeWupperalStep<Float,Ns,Nc>(arg, x_cb, parity);
126  } // 4-d volumeCB
127  } // parity
128 
129  }
130 
131  // GPU Kernel for applying a wuppertal smearing step to a vector
132  template <typename Float, int Ns, int Nc, typename Arg>
133  __global__ void wuppertalStepGPU(Arg arg)
134  {
135  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
136 
137  // for full fields set parity from y thread index else use arg setting
138  int parity = blockDim.y*blockIdx.y + threadIdx.y;
139 
140  if (x_cb >= arg.volumeCB) return;
141  if (parity >= arg.nParity) return;
142  parity = (arg.nParity == 2) ? parity : arg.parity;
143 
144  computeWupperalStep<Float,Ns,Nc>(arg, x_cb, parity);
145  }
146 
147  template <typename Float, int Ns, int Nc, typename Arg>
149 
150  protected:
153 
154  long long flops() const
155  {
156  return (2*3*Ns*Nc*(8*Nc-2) + 2*3*Nc*Ns )*arg.nParity*(long long)meta.VolumeCB();
157  }
158  long long bytes() const
159  {
160  return arg.out.Bytes() + (2*3+1)*arg.in.Bytes() + arg.nParity*2*3*arg.U.Bytes()*meta.VolumeCB();
161  }
162  bool tuneGridDim() const { return false; }
163  unsigned int minThreads() const { return arg.volumeCB; }
164 
165  public:
166  WuppertalSmearing(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
167  {
168  strcpy(aux, meta.AuxString());
169  strcat(aux, comm_dim_partitioned_string());
170  }
171  virtual ~WuppertalSmearing() { }
172 
173  void apply(const cudaStream_t &stream) {
174  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
175  wuppertalStepCPU<Float,Ns,Nc>(arg);
176  } else {
177  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
178  wuppertalStepGPU<Float,Ns,Nc> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
179  }
180  }
181 
182  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
183  };
184 
185  template<typename Float, int Ns, int Nc, QudaReconstructType gRecon>
187  const GaugeField& U, double A, double B)
188  {
189  WuppertalSmearingArg<Float,Ns,Nc,gRecon> arg(out, in, parity, U, A, B);
191  wuppertal.apply(0);
192  }
193 
194  // template on the gauge reconstruction
195  template<typename Float, int Ns, int Nc>
197  const GaugeField& U, double A, double B)
198  {
199  if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
200  wuppertalStep<Float,Ns,Nc,QUDA_RECONSTRUCT_NO>(out, in, parity, U, A, B);
201  } else if(U.Reconstruct() == QUDA_RECONSTRUCT_12) {
202  wuppertalStep<Float,Ns,Nc,QUDA_RECONSTRUCT_12>(out, in, parity, U, A, B);
203  } else if(U.Reconstruct() == QUDA_RECONSTRUCT_8) {
204  wuppertalStep<Float,Ns,Nc,QUDA_RECONSTRUCT_8>(out, in, parity, U, A, B);
205  } else {
206  errorQuda("Reconstruction type %d of origin gauge field not supported", U.Reconstruct());
207  }
208  }
209 
210 
211  // template on the number of colors
212  template<typename Float, int Ns>
213  void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
214  const GaugeField& U, double A, double B)
215  {
216  if (out.Ncolor() != in.Ncolor()) {
217  errorQuda("Orign and destination fields must have the same number of colors\n");
218  }
219 
220  if (out.Ncolor() == 3 ) {
221  wuppertalStep<Float,Ns,3>(out, in, parity, U, A, B);
222  } else {
223  errorQuda(" is not implemented for Ncolor!=3");
224  }
225  }
226 
227  // template on the number of spins
228  template<typename Float>
229  void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
230  const GaugeField& U, double A, double B)
231  {
232  if(out.Nspin() != in.Nspin()) {
233  errorQuda("Orign and destination fields must have the same number of spins\n");
234  }
235 
236  if (out.Nspin() == 4 ){
237  wuppertalStep<Float,4>(out, in, parity, U, A, B);
238  }else if (in.Nspin() == 1 ){
239  wuppertalStep<Float,1>(out, in, parity, U, A, B);
240  }else{
241  errorQuda("Nspin %d not supported", out.Nspin());
242  }
243  }
244 
254  // template on the precision
255  void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
256  const GaugeField& U, double A, double B)
257  {
258  if (in.V() == out.V()) {
259  errorQuda("Orign and destination fields must be different pointers");
260  }
261 
262  // check precisions match
263  checkPrecision(out, in, U);
264 
265  // check all locations match
266  checkLocation(out, in, U);
267 
268  const int nFace = 1;
269  in.exchangeGhost((QudaParity)(1-parity), nFace, 0); // last parameter is dummy
270 
271  if (out.Precision() == QUDA_SINGLE_PRECISION){
272  wuppertalStep<float>(out, in, parity, U, A, B);
273  } else if(out.Precision() == QUDA_DOUBLE_PRECISION) {
274  wuppertalStep<double>(out, in, parity, U, A, B);
275  } else {
276  errorQuda("Precision %d not supported", out.Precision());
277  }
278 
279  in.bufferIndex = (1 - in.bufferIndex);
280  return;
281  }
282 
291  void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField& U, double alpha)
292  {
293  wuppertalStep(out, in, parity, U, 1./(1.+6.*alpha), alpha/(1.+6.*alpha));
294  }
295 } // namespace quda
gauge_mapper< Float, gRecon >::type G
colorspinor_mapper< Float, Ns, Nc >::type F
const char * AuxString() const
__device__ __host__ void computeNeighborSum(Vector &out, Arg &arg, int x_cb, int parity)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define checkPrecision(...)
void apply(const cudaStream_t &stream)
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
const char * VolString() const
const char * comm_dim_partitioned_string(const int *comm_dim_override=0)
Return a string that defines the comm partitioning (used as a tuneKey)
static int bufferIndex
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
__device__ __host__ void computeWupperalStep(Arg &arg, int x_cb, int parity)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
const ColorSpinorField & meta
#define checkLocation(...)
WuppertalSmearing(Arg &arg, const ColorSpinorField &meta)
Main header file for host and device accessors to GaugeFields.
enum QudaParity_s QudaParity
QudaFieldLocation Location() const
void wuppertalStepCPU(Arg arg)
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
const int nParity
Definition: spinor_noise.cu:25
WuppertalSmearingArg(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, Float A, Float B)
__global__ void wuppertalStepGPU(Arg arg)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
VectorXcd Vector
const int volumeCB
Definition: spinor_noise.cu:26
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
const int * X() const
unsigned int minThreads() const
virtual void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION) const =0
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
int comm_dim_partitioned(int dim)
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.