QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
clover_deriv_quda.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <cuda.h>
4 #include <quda_internal.h>
5 #include <tune_quda.h>
6 #include <gauge_field.h>
7 #include <quda_matrix.h>
8 #include <cassert>
9 
10 namespace quda {
11 
12 #ifdef GPU_CLOVER_DIRAC
13 
14  template<class Cmplx>
15  struct CloverDerivArg
16  {
17  int X[4];
18  int border[4];
19  int mu;
20  int nu;
21  typename RealTypeId<Cmplx>::Type coeff;
22  int parity;
23  int volumeCB;
24 
25  Cmplx* gauge;
26  Cmplx* force;
27  Cmplx* oprod;
28 
29  int forceStride;
30  int gaugeStride;
31  int oprodStride;
32 
33  int forceOffset;
34  int gaugeOffset;
35  int oprodOffset;
36 
37  bool conjugate;
38 
39  CloverDerivArg(cudaGaugeField& force, cudaGaugeField& gauge, cudaGaugeField& oprod, int mu, int nu, double coeff, int parity, bool conjugate) :
40  mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.VolumeCB()),
41  force(reinterpret_cast<Cmplx*>(force.Gauge_p())), gauge(reinterpret_cast<Cmplx*>(gauge.Gauge_p())), oprod(reinterpret_cast<Cmplx*>(oprod.Gauge_p())),
42  forceStride(force.Stride()), gaugeStride(gauge.Stride()), oprodStride(oprod.Stride()),
43  forceOffset(force.Bytes()/(2*sizeof(Cmplx))), gaugeOffset(gauge.Bytes()/(2*sizeof(Cmplx))), oprodOffset(oprod.Bytes()/(2*sizeof(Cmplx)))
44  {
45  for(int dir=0; dir<4; ++dir) X[dir] = force.X()[dir];
46  //for(int dir=0; dir<4; ++dir) border[dir] = commDimPartitioned(dir) ? 2 : 0;
47  for(int dir=0; dir<4; ++dir) border[dir] = 2;
48  }
49  };
50 
51  __device__ void getCoords(int x[4], int cb_index, const int X[4], int parity)
52  {
53  x[3] = cb_index/(X[2]*X[1]*X[0]/2);
54  x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
55  x[1] = (cb_index/(X[0]/2)) % X[1];
56  x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);
57 
58  return;
59  }
60 
61  __device__ int linkIndex(const int x[4], const int dx[4], const int X[4])
62  {
63  int y[4];
64  for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
65  return (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0])/2;
66  }
67 
68 
69  template<typename Cmplx, bool isConjugate>
70  __global__ void
71  cloverDerivativeKernel(const CloverDerivArg<Cmplx> arg)
72  {
73  int index = threadIdx.x + blockIdx.x*blockDim.x;
74 
75  if(index >= arg.volumeCB) return;
76 
77 
78  int x[4];
79  int y[4];
80  int otherparity = (1-arg.parity);
81  getCoords(x, index, arg.X, arg.parity);
82  getCoords(y, index, arg.X, otherparity);
83  int X[4];
84  for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
85 
86  for(int dir=0; dir<4; ++dir){
87  x[dir] += arg.border[dir];
88  y[dir] += arg.border[dir];
89  X[dir] += 2*arg.border[dir];
90  }
91 
92  Cmplx* thisGauge = arg.gauge + arg.parity*arg.gaugeOffset;
93  Cmplx* otherGauge = arg.gauge + (otherparity)*arg.gaugeOffset;
94 
95  Cmplx* thisOprod = arg.oprod + arg.parity*arg.oprodOffset;
96 
97  const int& mu = arg.mu;
98  const int& nu = arg.nu;
99 
100  Matrix<Cmplx,3> thisForce;
101  Matrix<Cmplx,3> otherForce;
102 
103  // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
104  {
105  int d[4] = {0, 0, 0, 0};
106 
107  // load U(x)_(+mu)
108  Matrix<Cmplx,3> U1;
109  loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X),
110  arg.gaugeStride, &U1);
111 
112 
113  // load U(x+mu)_(+nu)
114  Matrix<Cmplx,3> U2;
115  d[mu]++;
116  loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X),
117  arg.gaugeStride, &U2);
118  d[mu]--;
119 
120 
121  // load U(x+nu)_(+mu)
122  Matrix<Cmplx,3> U3;
123  d[nu]++;
124  loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X),
125  arg.gaugeStride, &U3);
126  d[nu]--;
127 
128  // load U(x)_(+nu)
129  Matrix<Cmplx,3> U4;
130  loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X),
131  arg.gaugeStride, &U4);
132 
133  // load Oprod
134  Matrix<Cmplx,3> Oprod1;
135  loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1);
136 
137  if(isConjugate) Oprod1 -= conj(Oprod1);
138  thisForce = U1*U2*conj(U3)*conj(U4)*Oprod1;
139 
140  Matrix<Cmplx,3> Oprod2;
141  d[mu]++; d[nu]++;
142  loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod2);
143  d[mu]--; d[nu]--;
144 
145  if(isConjugate) Oprod2 -= conj(Oprod2);
146 
147  thisForce += U1*U2*Oprod2*conj(U3)*conj(U4);
148 
149  }
150 
151  {
152  int d[4] = {0, 0, 0, 0};
153  // load U(x)_(+mu)
154  Matrix<Cmplx,3> U1;
155  loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X),
156  arg.gaugeStride, &U1);
157 
158  // load U(x+mu)_(+nu)
159  Matrix<Cmplx,3> U2;
160  d[mu]++;
161  loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X),
162  arg.gaugeStride, &U2);
163  d[mu]--;
164 
165  // load U(x+nu)_(+mu)
166  Matrix<Cmplx,3> U3;
167  d[nu]++;
168  loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X),
169  arg.gaugeStride, &U3);
170  d[nu]--;
171 
172  // load U(x)_(+nu)
173  Matrix<Cmplx,3> U4;
174  loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X),
175  arg.gaugeStride, &U4);
176 
177  // load opposite parity Oprod
178  Matrix<Cmplx,3> Oprod3;
179  d[nu]++;
180  loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod3);
181  d[nu]--;
182 
183  if(isConjugate) Oprod3 -= conj(Oprod3);
184  otherForce = U1*U2*conj(U3)*Oprod3*conj(U4);
185 
186  // load Oprod(x+mu)
187  Matrix<Cmplx, 3> Oprod4;
188  d[mu]++;
189  loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod4);
190  d[mu]--;
191 
192  if(isConjugate) Oprod4 -= conj(Oprod4);
193 
194  otherForce += U1*Oprod4*U2*conj(U3)*conj(U4);
195  }
196 
197 
198  // Lower leaf
199  // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
200  {
201  int d[4] = {0, 0, 0, 0};
202  // load U(x-nu)(+nu)
203  Matrix<Cmplx,3> U1;
204  d[nu]--;
205  loadLinkVariableFromArray(thisGauge, nu, linkIndex(y, d, X),
206  arg.gaugeStride, &U1);
207  d[nu]++;
208 
209  // load U(x-nu)(+mu)
210  Matrix<Cmplx, 3> U2;
211  d[nu]--;
212  loadLinkVariableFromArray(thisGauge, mu, linkIndex(y, d, X),
213  arg.gaugeStride, &U2);
214  d[nu]++;
215 
216  // load U(x+mu-nu)(nu)
217  Matrix<Cmplx, 3> U3;
218  d[mu]++; d[nu]--;
219  loadLinkVariableFromArray(otherGauge, nu, linkIndex(y, d, X),
220  arg.gaugeStride, &U3);
221  d[mu]--; d[nu]++;
222 
223  // load U(x)_(+mu)
224  Matrix<Cmplx,3> U4;
225  loadLinkVariableFromArray(otherGauge, mu, linkIndex(y, d, X),
226  arg.gaugeStride, &U4);
227 
228  // load Oprod(x+mu)
229  Matrix<Cmplx, 3> Oprod1;
230  d[mu]++;
231  loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod1);
232  d[mu]--;
233 
234  if(isConjugate) Oprod1 -= conj(Oprod1);
235 
236  otherForce -= conj(U1)*U2*U3*Oprod1*conj(U4);
237 
238  Matrix<Cmplx,3> Oprod2;
239  d[nu]--;
240  loadMatrixFromArray(thisOprod, linkIndex(y, d, X), arg.oprodStride, &Oprod2);
241  d[nu]++;
242 
243  if(isConjugate) Oprod2 -= conj(Oprod2);
244  otherForce -= conj(U1)*Oprod2*U2*U3*conj(U4);
245  }
246 
247  {
248  int d[4] = {0, 0, 0, 0};
249  // load U(x-nu)(+nu)
250  Matrix<Cmplx,3> U1;
251  d[nu]--;
252  loadLinkVariableFromArray(otherGauge, nu, linkIndex(x, d, X),
253  arg.gaugeStride, &U1);
254  d[nu]++;
255 
256  // load U(x-nu)(+mu)
257  Matrix<Cmplx, 3> U2;
258  d[nu]--;
259  loadLinkVariableFromArray(otherGauge, mu, linkIndex(x, d, X),
260  arg.gaugeStride, &U2);
261  d[nu]++;
262 
263  // load U(x+mu-nu)(nu)
264  Matrix<Cmplx, 3> U3;
265  d[mu]++; d[nu]--;
266  loadLinkVariableFromArray(thisGauge, nu, linkIndex(x, d, X),
267  arg.gaugeStride, &U3);
268  d[mu]--; d[nu]++;
269 
270  // load U(x)_(+mu)
271  Matrix<Cmplx,3> U4;
272  loadLinkVariableFromArray(thisGauge, mu, linkIndex(x, d, X),
273  arg.gaugeStride, &U4);
274 
275 
276  Matrix<Cmplx,3> Oprod1;
277  d[mu]++; d[nu]--;
278  loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod1);
279  d[mu]--; d[nu]++;
280 
281  if(isConjugate) Oprod1 -= conj(Oprod1);
282  thisForce -= conj(U1)*U2*Oprod1*U3*conj(U4);
283 
284  Matrix<Cmplx, 3> Oprod4;
285  loadMatrixFromArray(thisOprod, linkIndex(x, d, X), arg.oprodStride, &Oprod4);
286 
287  if(isConjugate) Oprod4 -= conj(Oprod4);
288  thisForce -= Oprod4*conj(U1)*U2*U3*conj(U4);
289  }
290 
291  thisForce *= arg.coeff;
292  otherForce *= arg.coeff;
293 
294 
295  // Write to array
296  {
297  appendMatrixToArray(thisForce, index, arg.forceStride, arg.force + arg.parity*arg.forceOffset);
298  appendMatrixToArray(otherForce, index, arg.forceStride, arg.force + otherparity*arg.forceOffset);
299  }
300  return;
301  } // cloverDerivativeKernel
302 
303 
304  template<typename Complex>
305  class CloverDerivative : public Tunable {
306 
307  private:
308  CloverDerivArg<Complex> arg;
309  const GaugeField &meta;
310 
311  unsigned int sharedBytesPerThread() const { return 0; }
312  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
313 
314  unsigned int minThreads() const { return arg.volumeCB; }
315  bool tuneGridDim() const { return false; }
316 
317  public:
318  CloverDerivative(const CloverDerivArg<Complex> &arg, const GaugeField &meta)
319  : arg(arg), meta(meta) {
320  writeAuxString("threads=%d,prec=%lu,stride=%d,geometery=%d",arg.volumeCB,sizeof(Complex)/2,arg.forceOffset);
321  }
322  virtual ~CloverDerivative() {}
323 
324  void apply(const cudaStream_t &stream){
325  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
326  if(arg.conjugate){
327  cloverDerivativeKernel<Complex,true><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
328  }else{
329  cloverDerivativeKernel<Complex,false><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
330  }
331  } // apply
332 
333  void preTune(){}
334  void postTune(){}
335 
336  long long flops() const {
337  return 0;
338  }
339 
340  long long bytes() const { return 0; }
341 
342  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
343  };
344 
345 
346  // FIXME - the Tunable class isn't used here
347  template<typename Float>
348  void cloverDerivative(cudaGaugeField &out,
349  cudaGaugeField& gauge,
350  cudaGaugeField& oprod,
351  int mu, int nu, double coeff, int parity,
352  int conjugate)
353  {
354  typedef typename ComplexTypeId<Float>::Type Complex;
355  CloverDerivArg<Complex> arg(out, gauge, oprod, mu, nu, coeff, parity, conjugate);
356 // CloverDerivative<Complex> cloverDerivative(arg);
357 // cloverDerivative.apply(0);
358  dim3 blockDim(128, 1, 1);
359  dim3 gridDim((arg.volumeCB + blockDim.x-1)/blockDim.x, 1, 1);
360  if(conjugate){
361  cloverDerivativeKernel<Complex,true><<<gridDim,blockDim,0>>>(arg);
362  }else{
363  cloverDerivativeKernel<Complex,false><<<gridDim,blockDim,0>>>(arg);
364  }
365  }
366 
367 #endif
368 
370  cudaGaugeField& gauge,
371  cudaGaugeField& oprod,
372  int mu, int nu, double coeff, QudaParity parity, int conjugate)
373  {
374 #ifdef GPU_CLOVER_DIRAC
375  assert(oprod.Geometry() == QUDA_SCALAR_GEOMETRY);
376  assert(out.Geometry() == QUDA_SCALAR_GEOMETRY);
377 
378  int device_parity = (parity == QUDA_EVEN_PARITY) ? 0 : 1;
379 
380  if(out.Precision() == QUDA_DOUBLE_PRECISION){
381  cloverDerivative<double>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);
382  } else if (out.Precision() == QUDA_SINGLE_PRECISION){
383  cloverDerivative<float>(out, gauge, oprod, mu, nu, coeff, device_parity, conjugate);
384  } else {
385  errorQuda("Precision %d not supported", out.Precision());
386  }
387  return;
388 #else
389  errorQuda("Clover has not been built");
390 #endif
391  }
392 
393 
394 } // namespace quda
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
int y[4]
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
#define errorQuda(...)
Definition: util_quda.h:73
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
void cloverDerivative(cudaGaugeField &out, cudaGaugeField &gauge, cudaGaugeField &oprod, int mu, int nu, double coeff, QudaParity parity, int conjugate)
QudaPrecision Precision() const
__device__ __host__ int index(int i, int j)
Definition: quda_matrix.h:342
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
__constant__ double coeff
enum QudaParity_s QudaParity
int x[4]
int dx[4]
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< T, N > *mat)
Definition: quda_matrix.h:778
cpuColorSpinorField * out
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
Definition: quda_matrix.h:767
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
__device__ void appendMatrixToArray(const Matrix< double2, 3 > &mat, const int idx, const int stride, double2 *const array)
Definition: quda_matrix.h:810
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:177
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Definition: util_quda.cpp:32
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)