// QUDA 0.9.0 — color_spinor_wuppertal.cu
// Wuppertal (gauge-covariant) smearing of color-spinor fields.
1 #include <transfer.h>
2 #include <quda_internal.h>
3 #include <quda_matrix.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <index_helper.cuh>
7 #include <color_spinor.h>
8 #include <color_spinor_field.h>
10 #include <tune_quda.h>
11 
12 namespace quda {
13 
// Kernel-argument struct for the Wuppertal smearing kernels: bundles the
// accessors for the output/input spinors and the gauge field together with
// the smearing coefficients and lattice geometry.
// NOTE(review): the struct declaration line and the start of the constructor
// signature are missing from this extract (doc lines 15-17, 31, 35); the
// mapper typedefs F and G presumably come from colorspinor_mapper /
// gauge_mapper — confirm against the full source.
14  template <typename Float, int Ns, int Nc, QudaReconstructType gRecon>
18 
19  F out; // output vector field
20  const F in; // input vector field
21  const G U; // the gauge field
22  const Float A; // A parameter
23  const Float B; // B parameter
24  const int parity; // only use this for single parity fields
25  const int nParity; // number of parities we're working on
26  const int nFace; // hard code to 1 for now
27  const int dim[5]; // full lattice dimensions
28  const int commDim[4]; // whether a given dimension is partitioned or not
29  const int volumeCB; // checkerboarded volume
30 
// Constructor: captures the fields and coefficients; X(0) is doubled for
// full (two-parity) fields so dim[] always holds the full lattice extent.
32  Float A, Float B)
33  : out(out), in(in), U(U), A(A), B(B), parity(parity), nParity(in.SiteSubset()), nFace(1),
34  dim{ (3-nParity) * in.X(0), in.X(1), in.X(2), in.X(3), 1 },
36  volumeCB(in.VolumeCB())
37  {
// Only native FLOAT2-ordered spinors and native gauge fields are supported
// by the accessors used in the kernels below.
38  if (in.FieldOrder() != QUDA_FLOAT2_FIELD_ORDER || !U.isNative())
39  errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
40  }
41  };
42 
// Accumulates into `out` the gauge-covariant sum over the six spatial
// nearest neighbors of checkerboard site x_cb:
//   out += sum_{dir=0..2} [ U_dir(x) in(x+dir) + U_dir^dag(x-dir) in(x-dir) ]
// When a direction is partitioned across ranks (arg.commDim[dir]) and the
// neighbor falls outside the local volume, the spinor (and, backward, the
// gauge link) are read from the exchanged ghost zones instead.
51  template <typename Float, int Nc, typename Vector, typename Arg>
52  __device__ __host__ inline void computeNeighborSum(Vector &out, Arg &arg, int x_cb, int parity) {
53 
54  typedef Matrix<complex<Float>,Nc> Link;
// Neighbor sites live on the opposite parity for full (two-parity) fields;
// single-parity fields store the other parity at index 0.
55  const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;
56 
57  int coord[5];
58  getCoords(coord, x_cb, arg.dim, parity);
59  coord[4] = 0;
60 
61 #pragma unroll
62  for (int dir=0; dir<3; dir++) { // loop over spatial directions
63 
64  //Forward gather - compute fwd offset for vector fetch
65  const int fwd_idx = linkIndexP1(coord, arg.dim, dir);
66 
// Forward neighbor crosses the local boundary: fetch the spinor from the
// forward (index 1) ghost face; the link itself is local at x.
67  if ( arg.commDim[dir] && (coord[dir] + arg.nFace >= arg.dim[dir]) ) {
68  const int ghost_idx = ghostFaceIndex<1>(coord, arg.dim, dir, arg.nFace);
69 
70  const Link U = arg.U(dir, x_cb, parity);
71  const Vector in = arg.in.Ghost(dir, 1, ghost_idx, their_spinor_parity);
72 
73  out += U * in;
74  } else {
75  const Link U = arg.U(dir, x_cb, parity);
76  const Vector in = arg.in(fwd_idx, their_spinor_parity);
77 
78  out += U * in;
79  }
80 
81  //Backward gather - compute back offset for spinor and gauge fetch
82  const int back_idx = linkIndexM1(coord, arg.dim, dir);
// The backward link lives at the neighboring site x-dir, hence opposite parity.
83  const int gauge_idx = back_idx;
84 
// Backward neighbor crosses the boundary: both the link and the spinor come
// from the backward (index 0) ghost faces, and the link enters daggered.
85  if ( arg.commDim[dir] && (coord[dir] - arg.nFace < 0) ) {
86  const int ghost_idx = ghostFaceIndex<0>(coord, arg.dim, dir, arg.nFace);
87 
88  const Link U = arg.U.Ghost(dir, ghost_idx, 1-parity);
89  const Vector in = arg.in.Ghost(dir, 0, ghost_idx, their_spinor_parity);
90 
91  out += conj(U) * in;
92  } else {
93  const Link U = arg.U(dir, gauge_idx, 1-parity);
94  const Vector in = arg.in(back_idx, their_spinor_parity);
95 
96  out += conj(U) * in;
97  }
98  }
99  }
100 
// One Wuppertal smearing step for a single site:
//   out(x) = A * in(x) + B * sum_of_gauge-covariant_neighbors(in, x)
// (the neighbor sum is accumulated by computeNeighborSum above).
// NOTE(review): the typedef defining `Vector` (doc line 105) is missing from
// this extract — presumably a ColorSpinor<Float,Nc,Ns> mapper type; confirm
// against the full source. Note also the function name's "Wupperal" spelling
// is kept as-is since the callers below use it.
101  //out(x) = A in(x) + B computeNeighborSum(out, x)
102  template <typename Float, int Ns, int Nc, typename Arg>
103  __device__ __host__ inline void computeWupperalStep(Arg &arg, int x_cb, int parity)
104  {
105  Vector out;
107 
// Accumulate the six-neighbor covariant sum into out (starts zero-initialized).
108  computeNeighborSum<Float,Nc>(out, arg, x_cb, parity);
109 
110  Vector in;
111  arg.in.load((Float*)in.data, x_cb, parity);
// Combine the local and neighbor contributions with the smearing weights.
112  out = arg.A*in + arg.B*out;
113 
114  arg.out(x_cb, parity) = out;
115  }
116 
// NOTE(review): the function signature line (doc line 119, presumably
// `void wuppertalStepCPU(Arg arg)`) is missing from this extract.
117  // CPU kernel for applying a wuppertal smearing step to a vector
118  template <typename Float, int Ns, int Nc, typename Arg>
120  {
121 
// Serial reference implementation: loop over parities and the full
// checkerboarded volume, applying the per-site smearing step.
122  for (int parity= 0; parity < arg.nParity; parity++) {
123  // for full fields then set parity from loop else use arg setting
124  parity = (arg.nParity == 2) ? parity : arg.parity;
125 
126  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
127  computeWupperalStep<Float,Ns,Nc>(arg, x_cb, parity);
128  } // 4-d volumeCB
129  } // parity
130 
131  }
132 
// GPU kernel for one Wuppertal smearing step.
// Grid layout: x dimension indexes checkerboard sites (arg.volumeCB threads
// needed), y dimension indexes parity (arg.nParity threads). Excess threads
// from the padded launch grid exit immediately.
template <typename Float, int Ns, int Nc, typename Arg>
__global__ void wuppertalStepGPU(Arg arg)
{
  const int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
  const int parity_idx = blockDim.y*blockIdx.y + threadIdx.y;

  // bounds guard: the launch grid rarely divides the volume exactly
  if (x_cb >= arg.volumeCB || parity_idx >= arg.nParity) return;

  // full fields take parity from the y index; single-parity fields use arg.parity
  const int parity = (arg.nParity == 2) ? parity_idx : arg.parity;

  computeWupperalStep<Float,Ns,Nc>(arg, x_cb, parity);
}
148 
// Autotuned launcher wrapping the CPU/GPU smearing kernels.
// NOTE(review): the class declaration line (doc line 151, presumably deriving
// from a QUDA Tunable base), the `meta` member declaration (doc line 154),
// the constructor signature (doc line 169) and the Location() branch of
// apply() (doc line 177) are missing from this extract — confirm against the
// full source before modifying.
149  template <typename Float, int Ns, int Nc, typename Arg>
151 
152  protected:
153  Arg &arg;
155 
// Flop count for one smearing step: 6 SU(Nc) matrix-vector products plus the
// A*in + B*sum combination, per site and parity.
156  long long flops() const
157  {
158  return (2*3*Ns*Nc*(8*Nc-2) + 2*3*Nc*Ns )*arg.nParity*(long long)meta.VolumeCB();
159  }
// Memory traffic: one store of out, 6 neighbor loads + 1 local load of in,
// and 6 gauge-link loads per site and parity.
160  long long bytes() const
161  {
162  return arg.out.Bytes() + (2*3+1)*arg.in.Bytes() + arg.nParity*2*3*arg.U.Bytes()*meta.VolumeCB();
163  }
// Tuning policy: fixed grid, one x-thread per checkerboard site, block size
// capped so that the parity (y) dimension still fits in a block.
164  bool tuneGridDim() const { return false; }
165  unsigned int minThreads() const { return arg.volumeCB; }
166  unsigned int maxBlockSize() const { return deviceProp.maxThreadsPerBlock / arg.nParity; }
167 
168  public:
170  {
171  strcpy(aux, meta.AuxString());
173  }
174  virtual ~WuppertalSmearing() { }
175 
// Launch the smearing step on the given stream; the GPU path is autotuned
// via tuneLaunch, the CPU path runs the serial reference kernel.
176  void apply(const cudaStream_t &stream) {
178  wuppertalStepCPU<Float,Ns,Nc>(arg);
179  } else {
180  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
181  wuppertalStepGPU<Float,Ns,Nc> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
182  }
183  }
184 
185  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
186  };
187 
// Fully-templated entry point: constructs the kernel-argument struct and the
// autotuned launcher, then applies one smearing step on the default stream.
// NOTE(review): the signature start (doc line 189) and the Arg/WuppertalSmearing
// construction (doc lines 192-193) are missing from this extract.
188  template<typename Float, int Ns, int Nc, QudaReconstructType gRecon>
190  const GaugeField& U, double A, double B)
191  {
194  wuppertal.apply(0);
195  }
196 
197  // template on the gauge reconstruction
// Dispatch on the gauge-field reconstruction type, instantiating the
// fully-templated implementation for the supported reconstructions.
template<typename Float, int Ns, int Nc>
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
                   const GaugeField& U, double A, double B)
{
  switch (U.Reconstruct()) {
  case QUDA_RECONSTRUCT_NO:
    wuppertalStep<Float,Ns,Nc,QUDA_RECONSTRUCT_NO>(out, in, parity, U, A, B);
    break;
  case QUDA_RECONSTRUCT_12:
    wuppertalStep<Float,Ns,Nc,QUDA_RECONSTRUCT_12>(out, in, parity, U, A, B);
    break;
  case QUDA_RECONSTRUCT_8:
    wuppertalStep<Float,Ns,Nc,QUDA_RECONSTRUCT_8>(out, in, parity, U, A, B);
    break;
  default:
    errorQuda("Reconstruction type %d of origin gauge field not supported", U.Reconstruct());
  }
}
212 
// Dispatch on the number of colors; only Nc=3 is instantiated.
// Fixes: "Orign" typo in the error message, and the previously malformed
// Ncolor error string (" is not implemented for Ncolor!=3") which named no
// function and reported no value.
template<typename Float, int Ns>
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
                   const GaugeField& U, double A, double B)
{
  // source and destination must agree before templating on Nc
  if (out.Ncolor() != in.Ncolor()) {
    errorQuda("Origin and destination fields must have the same number of colors (%d vs %d)\n",
              out.Ncolor(), in.Ncolor());
  }

  if (out.Ncolor() == 3) {
    wuppertalStep<Float,Ns,3>(out, in, parity, U, A, B);
  } else {
    errorQuda("wuppertalStep is not implemented for Ncolor = %d", out.Ncolor());
  }
}
229 
// Dispatch on the number of spins; Wilson-like (Ns=4) and staggered-like
// (Ns=1) fields are supported.
// Fixes: "Orign" typo in the error message, and the inconsistent use of
// in.Nspin() in the Ns=1 branch (out.Nspin() == in.Nspin() is guaranteed by
// the check above, so out.Nspin() is used uniformly).
template<typename Float>
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
                   const GaugeField& U, double A, double B)
{
  if (out.Nspin() != in.Nspin()) {
    errorQuda("Origin and destination fields must have the same number of spins (%d vs %d)\n",
              out.Nspin(), in.Nspin());
  }

  if (out.Nspin() == 4) {
    wuppertalStep<Float,4>(out, in, parity, U, A, B);
  } else if (out.Nspin() == 1) {
    wuppertalStep<Float,1>(out, in, parity, U, A, B);
  } else {
    errorQuda("Nspin %d not supported", out.Nspin());
  }
}
247 
// Top-level dispatch on precision. Validates that the fields are distinct,
// precision- and location-consistent, exchanges the one-site halo for the
// parity being gathered from, then instantiates the templated implementation.
// NOTE(review): the single-precision branch condition (doc line 274,
// presumably `if (out.Precision() == QUDA_SINGLE_PRECISION) {`) is missing
// from this extract. Also "Orign" in the error message below is a typo
// ("Origin") — cannot be fixed in a comment-only pass; TODO fix upstream.
257  // template on the precision
258  void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity,
259  const GaugeField& U, double A, double B)
260  {
// In-place smearing is not supported: the neighbor gather reads `in` while
// `out` is being written.
261  if (in.V() == out.V()) {
262  errorQuda("Orign and destination fields must be different pointers");
263  }
264 
265  // check precisions match
266  checkPrecision(out, in, U);
267 
268  // check all locations match
269  checkLocation(out, in, U);
270 
// Exchange the one-face ghost zone of the opposite parity, which is the
// parity the neighbor sum reads from.
271  const int nFace = 1;
272  in.exchangeGhost((QudaParity)(1-parity), nFace, 0); // last parameter is dummy
273 
275  wuppertalStep<float>(out, in, parity, U, A, B);
276  } else if(out.Precision() == QUDA_DOUBLE_PRECISION) {
277  wuppertalStep<double>(out, in, parity, U, A, B);
278  } else {
279  errorQuda("Precision %d not supported", out.Precision());
280  }
281 
// Flip the static ghost-buffer index so the next exchange uses the other
// double-buffered ghost allocation.
282  in.bufferIndex = (1 - in.bufferIndex);
283  return;
284  }
285 
// Convenience overload for standard Wuppertal smearing with weight alpha:
//   out(x) = 1/(1+6*alpha) * [ in(x) + alpha * sum_neighbors in ]
// expressed through the generic (A, B) interface above.
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField& U, double alpha)
{
  const double denom = 1. + 6.*alpha; // shared normalization factor
  wuppertalStep(out, in, parity, U, 1./denom, alpha/denom);
}
298 } // namespace quda
gauge_mapper< Float, gRecon >::type G
dim3 dim3 blockDim
colorspinor_mapper< Float, Ns, Nc >::type F
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
Definition: comm_mpi.cpp:342
cudaDeviceProp deviceProp
const char * AuxString() const
__device__ __host__ void computeNeighborSum(Vector &out, Arg &arg, int x_cb, int parity)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define checkPrecision(...)
void apply(const cudaStream_t &stream)
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
char * strcpy(char *__dst, const char *__src)
const char * VolString() const
char * strcat(char *__s1, const char *__s2)
void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const
This is a unified ghost exchange function for doing a complete halo exchange regardless of the type o...
unsigned int maxBlockSize() const
static int bufferIndex
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
cpuColorSpinorField * in
for(int s=0;s< param.dc.Ls;s++)
__device__ __host__ void computeWupperalStep(Arg &arg, int x_cb, int parity)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
const ColorSpinorField & meta
#define checkLocation(...)
WuppertalSmearing(Arg &arg, const ColorSpinorField &meta)
Main header file for host and device accessors to GaugeFields.
enum QudaParity_s QudaParity
QudaFieldLocation Location() const
void wuppertalStepCPU(Arg arg)
void wuppertalStep(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, double A, double B)
cpuColorSpinorField * out
WuppertalSmearingArg(ColorSpinorField &out, const ColorSpinorField &in, int parity, const GaugeField &U, Float A, Float B)
__global__ void wuppertalStepGPU(Arg arg)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
VectorXcd Vector
unsigned int minThreads() const
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
QudaParity parity
Definition: covdev_test.cpp:53
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
int comm_dim_partitioned(int dim)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)