QUDA  v1.1.0
A library for QCD on GPUs
clover_quda.cu
Go to the documentation of this file.
1 #include <quda_matrix.h>
2 #include <tune_quda.h>
3 #include <clover_field.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <clover_field_order.h>
7 #include <instantiate.h>
8 
9 namespace quda {
10 
11  template <typename store_t_>
12  struct CloverArg {
13  using store_t = store_t_;
14  using real = typename mapper<store_t>::type;
15  static constexpr int nColor = 3;
16  static constexpr int nSpin = 4;
17 
18  using Clover = typename clover_mapper<store_t>::type;
19  using Fmunu = typename gauge_mapper<store_t, QUDA_RECONSTRUCT_NO>::type;
20 
21  Clover clover;
22  const Fmunu f;
23  int threads; // number of active threads required
24  int X[4]; // grid dimensions
25  real coeff;
26 
27  CloverArg(CloverField &clover, const GaugeField &f, double coeff) :
28  clover(clover, 0),
29  f(f),
30  threads(f.VolumeCB()),
31  coeff(coeff)
32  {
33  for (int dir=0; dir<4; ++dir) X[dir] = f.X()[dir];
34  }
35  };
36 
37  /*
38  Put into clover order
39  Upper-left block (chirality index 0)
40  / \
41  | 1 + c*I*(F[0,1] - F[2,3]) , c*I*(F[1,2] - F[0,3]) + c*(F[0,2] + F[1,3]) |
42  | |
43  | c*I*(F[1,2] - F[0,3]) - c*(F[0,2] + F[1,3]), 1 - c*I*(F[0,1] - F[2,3]) |
44  | |
45  \ /
46 
47  /
48  | 1 - c*I*(F[0] - F[5]), -c*I*(F[2] - F[3]) - c*(F[1] + F[4])
49  |
50  | -c*I*(F[2] -F[3]) + c*(F[1] + F[4]), 1 + c*I*(F[0] - F[5])
51  |
52  \
53 
54  Lower-right block (chirality index 1)
55 
56  / \
57  | 1 - c*I*(F[0] + F[5]), -c*I*(F[2] + F[3]) - c*(F[1] - F[4]) |
58  | |
59  | -c*I*(F[2]+F[3]) + c*(F[1]-F[4]), 1 + c*I*(F[0] + F[5]) |
60  \ /
61  */
62  // Core routine for constructing clover term from field strength
63  template <typename Arg>
64  __device__ __host__ void cloverComputeCore(Arg &arg, int x_cb, int parity)
65  {
66  constexpr int N = Arg::nColor*Arg::nSpin / 2;
67  using real = typename Arg::real;
68  using Complex = complex<real>;
69  using Link = Matrix<Complex, Arg::nColor>;
70 
71  // Load the field-strength tensor from global memory
72  Link F[6];
73 #pragma unroll
74  for (int i=0; i<6; ++i) F[i] = arg.f(i, x_cb, parity);
75 
76  Complex I(0.0,1.0);
77  Complex coeff(0.0, arg.coeff);
78  Link block1[2], block2[2];
79  block1[0] = coeff*(F[0]-F[5]); // (18 + 6*9=) 72 floating-point ops
80  block1[1] = coeff*(F[0]+F[5]); // 72 floating-point ops
81  block2[0] = arg.coeff*(F[1]+F[4] - I*(F[2]-F[3])); // 126 floating-point ops
82  block2[1] = arg.coeff*(F[1]-F[4] - I*(F[2]+F[3])); // 126 floating-point ops
83 
84  // This uses lots of unnecessary memory
85 #pragma unroll
86  for (int ch=0; ch<2; ++ch) {
87  HMatrix<real,N> A;
88  // c = 0(1) => positive(negative) chiral block
89  // Compute real diagonal elements
90 #pragma unroll
91  for (int i=0; i<N/2; ++i) {
92  A(i+0,i+0) = 1.0 - block1[ch](i,i).real();
93  A(i+3,i+3) = 1.0 + block1[ch](i,i).real();
94  }
95 
96  // Compute off diagonal components
97  // First row
98  A(1,0) = -block1[ch](1,0);
99  // Second row
100  A(2,0) = -block1[ch](2,0);
101  A(2,1) = -block1[ch](2,1);
102  // Third row
103  A(3,0) = block2[ch](0,0);
104  A(3,1) = block2[ch](0,1);
105  A(3,2) = block2[ch](0,2);
106  // Fourth row
107  A(4,0) = block2[ch](1,0);
108  A(4,1) = block2[ch](1,1);
109  A(4,2) = block2[ch](1,2);
110  A(4,3) = block1[ch](1,0);
111  // Fifth row
112  A(5,0) = block2[ch](2,0);
113  A(5,1) = block2[ch](2,1);
114  A(5,2) = block2[ch](2,2);
115  A(5,3) = block1[ch](2,0);
116  A(5,4) = block1[ch](2,1);
117  A *= static_cast<real>(0.5);
118 
119  arg.clover(x_cb, parity, ch) = A;
120  } // ch
121  // 84 floating-point ops
122  }
123 
124  template <typename Arg> __global__ void cloverComputeKernel(Arg arg)
125  {
126  int x_cb = threadIdx.x + blockIdx.x*blockDim.x;
127  int parity = threadIdx.y + blockIdx.y*blockDim.y;
128  if (x_cb >= arg.threads) return;
129  cloverComputeCore(arg, x_cb, parity);
130  }
131 
132  template <typename Arg> void cloverComputeCPU(Arg arg)
133  {
134  for (int parity = 0; parity<2; parity++) {
135  for (int x_cb=0; x_cb<arg.threads; x_cb++){
136  cloverComputeCore(arg, x_cb, parity);
137  }
138  }
139  }
140 
141  template <typename store_t>
142  class CloverCompute : TunableVectorY {
143  CloverArg<store_t> arg;
144  const GaugeField &meta;
145  unsigned int sharedBytesPerThread() const { return 0; }
146  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
147  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
148  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
149  unsigned int minThreads() const { return arg.threads; }
150 
151  public:
152  CloverCompute(CloverField &clover, const GaugeField& f, double coeff) :
153  TunableVectorY(2),
154  arg(clover, f, coeff),
155  meta(f)
156  {
157  checkNative(clover, f);
158  writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,sizeof(store_t));
159 
160  apply(0);
161  qudaDeviceSynchronize();
162  }
163 
164  void apply(const qudaStream_t &stream)
165  {
166  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
167  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
168  qudaLaunchKernel(cloverComputeKernel<decltype(arg)>, tp, stream, arg);
169  } else { // run the CPU code
170  errorQuda("Not implemented");
171  }
172  }
173 
174  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
175  long long flops() const { return 2*arg.threads*480ll; }
176  long long bytes() const { return 2*arg.threads*(6*arg.f.Bytes() + arg.clover.Bytes()); }
177  };
178 
179  void computeClover(CloverField &clover, const GaugeField& f, double coeff)
180  {
181 #ifdef GPU_CLOVER_DIRAC
182  instantiate<CloverCompute>(clover, f, coeff);
183 #else
184  errorQuda("Clover has not been built");
185 #endif
186  }
187 
188 } // namespace quda
189