QUDA v0.7.0
A library for QCD on GPUs
clover_quda.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <clover_field.h>
#include <gauge_field.h>
#include <gauge_field_order.h>

namespace CloverOrder {
  using namespace quda;
#include <clover_field_order.h>
} // namespace CloverOrder


namespace quda {

#ifdef GPU_CLOVER_DIRAC

  template<typename Float, typename Clover, typename Gauge>
  struct CloverArg {
    int threads; // number of active threads required
    int X[4];    // grid dimensions
#ifdef MULTI_GPU
    int border[4];
#endif
    double cloverCoeff;

    int FmunuStride; // stride used on Fmunu field
    int FmunuOffset; // parity offset

    typename ComplexTypeId<Float>::Type* Fmunu;
    Gauge gauge;
    Clover clover;

    CloverArg(Clover &clover, Gauge &gauge, GaugeField& Fmunu, double cloverCoeff)
      : threads(Fmunu.Volume()),
        cloverCoeff(cloverCoeff),
        FmunuStride(Fmunu.Stride()), FmunuOffset(Fmunu.Bytes()/(4*sizeof(Float))),
        Fmunu(reinterpret_cast<typename ComplexTypeId<Float>::Type*>(Fmunu.Gauge_p())),
        gauge(gauge), clover(clover) {
      for(int dir=0; dir<4; ++dir) X[dir] = Fmunu.X()[dir];

#ifdef MULTI_GPU
      for(int dir=0; dir<4; ++dir){
        border[dir] = 2;
      }
#endif
    }
  };
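  // With MULTI_GPU the local lattice carries a two-site border in each
  // direction; computeFmunuCore below shifts the site coordinates (and the
  // local dimensions) into this extended volume before indexing any links.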

  __device__ __host__ inline int linkIndex(int x[], int dx[], const int X[4]) {
    int y[4];
    for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
    int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
    return idx;
  }

  __device__ __host__ inline void getCoords(int x[4], int cb_index, const int X[4], int parity)
  {
    x[3] = cb_index/(X[2]*X[1]*X[0]/2);
    x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
    x[1] = (cb_index/(X[0]/2)) % X[1];
    x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);

    return;
  }
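  // Illustrative check of the two helpers above: on a 4^4 lattice with
  // parity 0, cb_index 0 maps to x = {0,0,0,0} and cb_index 1 maps to
  // x = {2,0,0,0}, since consecutive sites of one checkerboard differ by
  // two in the x-direction; linkIndex then folds x+dx back into the local
  // volume and returns the checkerboard (lexicographic/2) site index.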


  template <typename Float, typename Clover, typename GaugeOrder>
  __host__ __device__ void computeFmunuCore(CloverArg<Float,Clover,GaugeOrder>& arg, int idx) {

    // compute spacetime dimensions and parity
    int parity = 0;
    if(idx >= arg.threads/2){
      parity = 1;
      idx -= arg.threads/2;
    }

    int X[4];
    for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];

    int x[4];
    getCoords(x, idx, X, parity);
#ifdef MULTI_GPU
    for(int dir=0; dir<4; ++dir){
      x[dir] += arg.border[dir];
      X[dir] += 2*arg.border[dir];
    }
#endif

    typedef typename ComplexTypeId<Float>::Type Cmplx;

    for (int mu=0; mu<4; mu++) {
      for (int nu=0; nu<mu; nu++) {
        Matrix<Cmplx,3> F;
        setZero(&F);

        { // U(x,mu) U(x+mu,nu) U[dagger](x+nu,mu) U[dagger](x,nu)

          // load U(x)_(+mu)
          Matrix<Cmplx,3> U1;
          int dx[4] = {0, 0, 0, 0};
          arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), mu, parity);

          // load U(x+mu)_(+nu)
          Matrix<Cmplx,3> U2;
          dx[mu]++;
          arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), nu, 1-parity);
          dx[mu]--;

          Matrix<Cmplx,3> Ftmp = U1 * U2;

          // load U(x+nu)_(+mu)
          Matrix<Cmplx,3> U3;
          dx[nu]++;
          arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), mu, 1-parity);
          dx[nu]--;

          Ftmp = Ftmp * conj(U3);

          // load U(x)_(+nu)
          Matrix<Cmplx,3> U4;
          arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), nu, parity);

          // complete the plaquette
          F = Ftmp * conj(U4);
        }

        { // U(x,nu) U[dagger](x+nu-mu,mu) U[dagger](x-mu,nu) U(x-mu,mu)

          // load U(x)_(+nu)
          Matrix<Cmplx,3> U1;
          int dx[4] = {0, 0, 0, 0};
          arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), nu, parity);

          // load U(x+nu)_(-mu) = U(x+nu-mu)_(+mu)
          Matrix<Cmplx,3> U2;
          dx[nu]++;
          dx[mu]--;
          arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), mu, parity);
          dx[mu]++;
          dx[nu]--;

          Matrix<Cmplx,3> Ftmp = U1 * conj(U2);

          // load U(x-mu)_(+nu)
          Matrix<Cmplx,3> U3;
          dx[mu]--;
          arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), nu, 1-parity);
          dx[mu]++;

          Ftmp = Ftmp * conj(U3);

          // load U(x)_(-mu) = U(x-mu)_(+mu)
          Matrix<Cmplx,3> U4;
          dx[mu]--;
          arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), mu, 1-parity);
          dx[mu]++;

          // complete the plaquette
          Ftmp = Ftmp * U4;

          // sum this contribution to Fmunu
          F += Ftmp;
        }

        { // U[dagger](x-nu,nu) U(x-nu,mu) U(x+mu-nu,nu) U[dagger](x,mu)

          // load U(x)_(-nu)
          Matrix<Cmplx,3> U1;
          int dx[4] = {0, 0, 0, 0};
          dx[nu]--;
          arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), nu, 1-parity);
          dx[nu]++;

          // load U(x-nu)_(+mu)
          Matrix<Cmplx,3> U2;
          dx[nu]--;
          arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), mu, 1-parity);
          dx[nu]++;

          Matrix<Cmplx,3> Ftmp = conj(U1) * U2;

          // load U(x+mu-nu)_(+nu)
          Matrix<Cmplx,3> U3;
          dx[mu]++;
          dx[nu]--;
          arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), nu, parity);
          dx[nu]++;
          dx[mu]--;

          Ftmp = Ftmp * U3;

          // load U(x)_(+mu)
          Matrix<Cmplx,3> U4;
          arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), mu, parity);

          Ftmp = Ftmp * conj(U4);

          // sum this contribution to Fmunu
          F += Ftmp;
        }

        { // U[dagger](x-mu,mu) U[dagger](x-mu-nu,nu) U(x-mu-nu,mu) U(x-nu,nu)

          // load U(x)_(-mu)
          Matrix<Cmplx,3> U1;
          int dx[4] = {0, 0, 0, 0};
          dx[mu]--;
          arg.gauge.load((Float*)(U1.data), linkIndex(x,dx,X), mu, 1-parity);
          dx[mu]++;

          // load U(x-mu)_(-nu) = U(x-mu-nu)_(+nu)
          Matrix<Cmplx,3> U2;
          dx[mu]--;
          dx[nu]--;
          arg.gauge.load((Float*)(U2.data), linkIndex(x,dx,X), nu, parity);
          dx[nu]++;
          dx[mu]++;

          Matrix<Cmplx,3> Ftmp = conj(U1) * conj(U2);

          // load U(x-mu-nu)_(+mu)
          Matrix<Cmplx,3> U3;
          dx[mu]--;
          dx[nu]--;
          arg.gauge.load((Float*)(U3.data), linkIndex(x,dx,X), mu, parity);
          dx[nu]++;
          dx[mu]++;

          Ftmp = Ftmp * U3;

          // load U(x)_(-nu) = U(x-nu)_(+nu)
          Matrix<Cmplx,3> U4;
          dx[nu]--;
          arg.gauge.load((Float*)(U4.data), linkIndex(x,dx,X), nu, 1-parity);
          dx[nu]++;

          // complete the plaquette
          Ftmp = Ftmp * U4;

          // sum this contribution to Fmunu
          F += Ftmp;
        }
        // 3 matrix additions, 12 matrix-matrix multiplications, 8 matrix conjugations
        // Each matrix conjugation involves 9 unary minus operations, but these are not included in the operation count
        // Each matrix addition involves 18 real additions
        // Each matrix-matrix multiplication involves 9*3 complex multiplications and 9*2 complex additions
        //    = 9*3*6 + 9*2*2 = 198 floating-point ops
        // => Total number of floating-point ops per site above is
        //    3*18 + 12*198 = 54 + 2376 = 2430

        {
          F -= conj(F);  // 18 real subtractions + one matrix conjugation
          F *= 1.0/8.0;  // 18 real multiplications
          // 36 floating-point operations here
        }

        Cmplx* thisFmunu = arg.Fmunu + parity*arg.FmunuOffset;
        int munu_idx = (mu*(mu-1))/2 + nu; // lower-triangular indexing

        writeLinkVariableToArray(F, munu_idx, idx, arg.FmunuStride, thisFmunu);
      } // nu < mu
    } // mu
    // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2]
    return;
  }
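  // The loop above is the standard "clover-leaf" discretization: for each
  // (mu,nu) pair it sums the four plaquettes touching site x in the mu-nu
  // plane into Q, then takes the antihermitian part, F = (Q - Q^dagger)/8.
  // The six independent components are written out in the lower-triangular
  // order listed in the comment above.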


  template<typename Float, typename Clover, typename Gauge>
  __global__ void computeFmunuKernel(CloverArg<Float,Clover,Gauge> arg){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if(idx >= arg.threads) return;
    computeFmunuCore<Float,Clover,Gauge>(arg,idx);
  }

  template<typename Float, typename Clover, typename Gauge>
  void computeFmunuCPU(CloverArg<Float,Clover,Gauge>& arg){
    errorQuda("computeFmunuCPU not yet supported\n");
    for(int idx=0; idx<arg.threads; idx++){
      computeFmunuCore(arg,idx);
    }
  }


  template<typename Float, typename Clover, typename Gauge>
  class FmunuCompute : Tunable {
    CloverArg<Float,Clover,Gauge> arg;
    const GaugeField &meta;
    const QudaFieldLocation location;

    private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneSharedBytes() const { return false; } // Don't tune shared memory
    bool tuneGridDim() const { return false; }     // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.threads; }

    public:
    FmunuCompute(CloverArg<Float,Clover,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,sizeof(Float));
    }
    virtual ~FmunuCompute() {}

    void apply(const cudaStream_t &stream){
      if (location == QUDA_CUDA_FIELD_LOCATION) {
#if (__COMPUTE_CAPABILITY__ >= 200)
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        computeFmunuKernel<<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
#else
        errorQuda("computeFmunuKernel not supported on pre-Fermi architecture");
#endif
      }else{
        computeFmunuCPU(arg);
      }
    }

    TuneKey tuneKey() const {
      return TuneKey(meta.VolString(), typeid(*this).name(), aux);
    }

    std::string paramString(const TuneParam &param) const {
      std::stringstream ps;
      ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << ")";
      ps << "shared=" << param.shared_bytes;
      return ps.str();
    }

    void preTune(){}
    void postTune(){}
    long long flops() const { return (2430 + 36)*6*arg.threads; }
    long long bytes() const { return (4*4*18 + 18)*6*arg.threads*sizeof(Float); } // Ignores link reconstruction
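    // flops(): six (mu,nu) pairs per site, each costing 2430 flops for the
    // four plaquettes plus 36 for the antihermitian projection (see the
    // per-site counts in computeFmunuCore).
    // bytes(): per pair, four leaves each load four links of 18 Floats,
    // plus one 18-Float store of F.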

  }; // FmunuCompute

  // Put into clover order
  // Upper-left block (chirality index 0)
  //
  //   /                                                                            \
  //   |  1 + c*I*(F[0,1] - F[2,3]),    c*I*(F[1,2] - F[0,3]) + c*(F[0,2] + F[1,3]) |
  //   |                                                                            |
  //   |  c*I*(F[1,2] - F[0,3]) - c*(F[0,2] + F[1,3]),    1 - c*I*(F[0,1] - F[2,3]) |
  //   \                                                                            /
  //
  // or, written in terms of the packed components F[0..5] used below:
  //
  //   /                                                                  \
  //   |  1 - c*I*(F[0] - F[5]),     -c*I*(F[2] - F[3]) - c*(F[1] + F[4]) |
  //   |                                                                  |
  //   |  -c*I*(F[2] - F[3]) + c*(F[1] + F[4]),    1 + c*I*(F[0] - F[5])  |
  //   \                                                                  /
  //
  // Lower-right block (chirality index 1)
  //
  //   /                                                                  \
  //   |  1 - c*I*(F[0] + F[5]),     -c*I*(F[2] + F[3]) - c*(F[1] - F[4]) |
  //   |                                                                  |
  //   |  -c*I*(F[2] + F[3]) + c*(F[1] - F[4]),    1 + c*I*(F[0] + F[5])  |
  //   \                                                                  /
  //
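  // Here F[0..5] index the six components in the order written by
  // computeFmunuCore, i.e. F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2];
  // the antisymmetry F[nu,mu] = -F[mu,nu] accounts for the sign flips
  // relative to the F[mu,nu] form above.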
  // Core routine for constructing the clover term from the field strength
  template<typename Float, typename Clover, typename Gauge>
  __device__ __host__
  void cloverComputeCore(CloverArg<Float,Clover,Gauge>& arg, int idx){

    int parity = 0;
    if(idx >= arg.threads/2){
      parity = 1;
      idx -= arg.threads/2;
    }
    typedef typename ComplexTypeId<Float>::Type Cmplx;

    // Load the field-strength tensor from global memory
    Matrix<Cmplx,3> F[6];
    for(int i=0; i<6; ++i){
      loadLinkVariableFromArray(arg.Fmunu + parity*arg.FmunuOffset, i, idx, arg.FmunuStride, &F[i]);
    }

    Cmplx I; I.x = 0; I.y = 1.0;
    Cmplx coeff; coeff.x = 0; coeff.y = arg.cloverCoeff;
    Matrix<Cmplx,3> block1[2];
    Matrix<Cmplx,3> block2[2];
    block1[0] = coeff*(F[0]-F[5]); // (18 + 6*9 =) 72 floating-point ops
    block1[1] = coeff*(F[0]+F[5]); // 72 floating-point ops
    block2[0] = arg.cloverCoeff*(F[1]+F[4] - I*(F[2]-F[3])); // 126 floating-point ops
    block2[1] = arg.cloverCoeff*(F[1]-F[4] - I*(F[2]+F[3])); // 126 floating-point ops

    const int idtab[15] = {0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
    Float diag[6];
    Cmplx triangle[15];
    Float A[72];

    // This uses lots of unnecessary memory
    for(int ch=0; ch<2; ++ch){
      // ch = 0(1) => positive(negative) chiral block
      // Compute real diagonal elements
      for(int i=0; i<3; ++i){
        diag[i]   = 1.0 - block1[ch](i,i).x;
        diag[i+3] = 1.0 + block1[ch](i,i).x;
      }

      // Compute off-diagonal components
      // First row
      triangle[0] = - block1[ch](1,0);
      // Second row
      triangle[1] = - block1[ch](2,0);
      triangle[2] = - block1[ch](2,1);
      // Third row
      triangle[3] = block2[ch](0,0);
      triangle[4] = block2[ch](0,1);
      triangle[5] = block2[ch](0,2);
      // Fourth row
      triangle[6] = block2[ch](1,0);
      triangle[7] = block2[ch](1,1);
      triangle[8] = block2[ch](1,2);
      triangle[9] = block1[ch](1,0);
      // Fifth row
      triangle[10] = block2[ch](2,0);
      triangle[11] = block2[ch](2,1);
      triangle[12] = block2[ch](2,2);
      triangle[13] = block1[ch](2,0);
      triangle[14] = block1[ch](2,1);

      for(int i=0; i<6; ++i){
        A[ch*36 + i] = 0.5*diag[i];
      }
      for(int i=0; i<15; ++i){
        A[ch*36+6+2*i]     = 0.5*triangle[idtab[i]].x;
        A[ch*36+6+2*i + 1] = 0.5*triangle[idtab[i]].y;
      }
    } // ch
    // 84 floating-point ops

    arg.clover.save(A, idx, parity);
    return;
  }
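  // Packing summary: A[72] holds the two 6x6 Hermitian chiral blocks as 36
  // reals each -- the six real diagonal entries first, then the 15 complex
  // lower-triangular entries reordered by idtab and scaled by 0.5 -- before
  // being handed to the clover accessor's save().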


  template<typename Float, typename Clover, typename Gauge>
  __global__
  void cloverComputeKernel(CloverArg<Float,Clover,Gauge> arg){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if(idx >= arg.threads) return;
    cloverComputeCore(arg, idx);
  }

  template<typename Float, typename Clover, typename Gauge>
  void cloverComputeCPU(CloverArg<Float,Clover,Gauge> arg){
    for(int idx=0; idx<arg.threads; ++idx){
      cloverComputeCore(arg, idx);
    }
  }


  template<typename Float, typename Clover, typename Gauge>
  class CloverCompute : Tunable {
    CloverArg<Float, Clover, Gauge> arg;
    const GaugeField &meta;
    const QudaFieldLocation location;

    private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
    bool tuneGridDim() const { return false; }     // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.threads; }

    public:
    CloverCompute(CloverArg<Float,Clover,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,sizeof(Float));
    }

    virtual ~CloverCompute() {}

    void apply(const cudaStream_t &stream) {
      if (location == QUDA_CUDA_FIELD_LOCATION) {
#if (__COMPUTE_CAPABILITY__ >= 200)
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        cloverComputeKernel<<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
#else
        errorQuda("cloverComputeKernel not supported on pre-Fermi architecture");
#endif
      }else{ // run the CPU code
        cloverComputeCPU(arg);
      }
    }

    TuneKey tuneKey() const {
      return TuneKey(meta.VolString(), typeid(*this).name(), aux);
    }

    std::string paramString(const TuneParam &param) const { // Don't print the grid dim.
      std::stringstream ps;
      ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
      ps << "shared=" << param.shared_bytes;
      return ps.str();
    }

    void preTune(){}
    void postTune(){}
    long long flops() const { return 480*arg.threads; }
    long long bytes() const { return arg.threads*(6*18 + 72)*sizeof(Float); }
  };


  template<typename Float, typename Clover, typename Gauge>
  void computeClover(Clover clover, Gauge gauge, GaugeField& Fmunu, Float cloverCoeff, QudaFieldLocation location){
    CloverArg<Float,Clover,Gauge> arg(clover, gauge, Fmunu, cloverCoeff);
    FmunuCompute<Float,Clover,Gauge> fmunuCompute(arg, Fmunu, location);
    fmunuCompute.apply(0);
    CloverCompute<Float,Clover,Gauge> cloverCompute(arg, Fmunu, location);
    cloverCompute.apply(0);
    cudaDeviceSynchronize();
  }
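  // Both stages launch their kernels in the default stream (the stream
  // argument of apply() is unused), so the clover construction implicitly
  // waits for the field-strength kernel that feeds it; the final
  // cudaDeviceSynchronize() makes completion visible to the host.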


  template<typename Float>
  void computeClover(CloverField &clover, const GaugeField& gauge, Float cloverCoeff, QudaFieldLocation location){
    int pad = 0;
    GaugeFieldParam tensorParam(clover.X(), clover.Precision(), QUDA_RECONSTRUCT_NO, pad, QUDA_TENSOR_GEOMETRY);
    tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
    GaugeField* Fmunu = NULL;
    if(location == QUDA_CPU_FIELD_LOCATION){
      Fmunu = new cpuGaugeField(tensorParam);
    } else if (location == QUDA_CUDA_FIELD_LOCATION){
      Fmunu = new cudaGaugeField(tensorParam);
    } else {
      errorQuda("Invalid location\n");
    }

    // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
    // Need to fix this!!

    if(clover.Order() == QUDA_FLOAT2_CLOVER_ORDER){
      if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
        if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float,18,2,18>(gauge), *Fmunu, cloverCoeff, location);
        }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float,18,2,12>(gauge), *Fmunu, cloverCoeff, location);
        }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_8){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float,18,2,8>(gauge), *Fmunu, cloverCoeff, location);
        }else{
          errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
        }
      }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
        if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0), FloatNOrder<Float,18,4,12>(gauge), *Fmunu, cloverCoeff, location);
        }else{
          errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
        }
      }
    }else if(clover.Order() == QUDA_FLOAT4_CLOVER_ORDER){
      if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
        if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0), FloatNOrder<Float,18,2,18>(gauge), *Fmunu, cloverCoeff, location);
        }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0), FloatNOrder<Float,18,2,12>(gauge), *Fmunu, cloverCoeff, location);
        }else{
          errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
        }
      }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
        if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
          computeClover(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0), FloatNOrder<Float,18,4,12>(gauge), *Fmunu, cloverCoeff, location);
        }else{
          errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
        }
      } // gauge order
    } // clover order

    if(Fmunu) delete Fmunu;
  }

#endif

  void computeClover(CloverField &clover, const GaugeField& gauge, double cloverCoeff, QudaFieldLocation location){

#ifdef GPU_CLOVER_DIRAC
    if(clover.Precision() == QUDA_HALF_PRECISION){
      errorQuda("Half precision not supported\n");
    }

    if (clover.Precision() == QUDA_SINGLE_PRECISION){
      computeClover<float>(clover, gauge, cloverCoeff, location);
    } else if(clover.Precision() == QUDA_DOUBLE_PRECISION) {
      computeClover<double>(clover, gauge, cloverCoeff, location);
    } else {
      errorQuda("Precision %d not supported", clover.Precision());
    }
    return;
#else
    errorQuda("Clover has not been built");
#endif

  }

} // namespace quda
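// Illustrative call site (a hedged sketch, not from this file; the field and
// parameter setup are elided, and the names gaugeParam, cloverParam, and csw
// are hypothetical):
//
//   cudaGaugeField  gauge(gaugeParam);    // e.g. QUDA_FLOAT2_GAUGE_ORDER
//   cudaCloverField clover(cloverParam);  // e.g. QUDA_FLOAT2_CLOVER_ORDER
//   double csw = 1.0;                     // clover coefficient
//   quda::computeClover(clover, gauge, csw, QUDA_CUDA_FIELD_LOCATION);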