QUDA  0.9.0
field_strength_tensor.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <index_helper.cuh>
7 
8 namespace quda {
9 
10 #ifdef GPU_GAUGE_TOOLS
11 
12  template<typename Float, typename Fmunu, typename Gauge>
13  struct FmunuArg {
14  int threads; // number of active threads required
15  int X[4]; // grid dimensions
16  int border[4];
17  Fmunu f;
18  Gauge gauge;
19 
20  FmunuArg(Fmunu &f, Gauge &gauge, const GaugeField &meta, const GaugeField &meta_ex)
21  : threads(meta.VolumeCB()), f(f), gauge(gauge) {
22  for (int dir=0; dir<4; ++dir) {
23  X[dir] = meta.X()[dir];
24  border[dir] = (meta_ex.X()[dir] - X[dir])/2;
25  }
26  }
27  };
28 
 /**
    Compute and store the field-strength tensor F_{mu,nu} at a single
    lattice site, given by checkerboard index @p idx and @p parity.
    The result is formed from the sum of the four plaquettes ("clover
    leaves") touching the site in the (mu,nu) plane, followed by an
    anti-hermitian projection F -= conj(F) and a 1/8 normalization.

    @tparam mu     First direction of the plane (mu > nu assumed by the
                   lower-triangular output indexing)
    @tparam nu     Second direction of the plane
    @tparam Float  Real element type
    @param arg     Argument struct (gauge accessor, output accessor, geometry);
                   note arg.X is modified in place, so each thread must own
                   its copy of arg (the kernel receives arg by value)
    @param idx     Interior checkerboard site index
    @param parity  Site parity (0 or 1)
  */
 template <int mu, int nu, typename Float, typename Arg>
 __device__ __forceinline__ void computeFmunuCore(Arg &arg, int idx, int parity) {

  typedef Matrix<complex<Float>,3> Link;

  int x[4];
  auto &X = arg.X;

  getCoords(x, idx, X, parity);
  // shift the coordinates into the extended (halo-padded) volume and
  // enlarge the local dimensions to match, so that neighbor indexing
  // below never wraps across the interior boundary
  for (int dir=0; dir<4; ++dir) {
    x[dir] += arg.border[dir];
    X[dir] += 2*arg.border[dir];
  }

  Link F;
  // first leaf: the plaquette in the (+mu,+nu) quadrant
  { // U(x,mu) U(x+mu,nu) U[dagger](x+nu,mu) U[dagger](x,nu)

    // load U(x)_(+mu)
    int dx[4] = {0, 0, 0, 0};
    Link U1 = arg.gauge(mu, linkIndexShift(x,dx,X), parity);

    // load U(x+mu)_(+nu)  (one-site shift flips the parity)
    dx[mu]++;
    Link U2 = arg.gauge(nu, linkIndexShift(x,dx,X), 1-parity);
    dx[mu]--;

    // load U(x+nu)_(+mu)
    dx[nu]++;
    Link U3 = arg.gauge(mu, linkIndexShift(x,dx,X), 1-parity);
    dx[nu]--;

    // load U(x)_(+nu)
    Link U4 = arg.gauge(nu, linkIndexShift(x,dx,X), parity);

    // compute plaquette
    F = U1 * U2 * conj(U3) * conj(U4);
  }

  // second leaf: the plaquette in the (-mu,+nu) quadrant
  { // U(x,nu) U[dagger](x+nu-mu,mu) U[dagger](x-mu,nu) U(x-mu, mu)

    // load U(x)_(+nu)
    int dx[4] = {0, 0, 0, 0};
    Link U1 = arg.gauge(nu, linkIndexShift(x,dx,X), parity);

    // load U(x+nu)_(-mu) = U(x+nu-mu)_(+mu)  (two shifts: parity unchanged)
    dx[nu]++;
    dx[mu]--;
    Link U2 = arg.gauge(mu, linkIndexShift(x,dx,X), parity);
    dx[mu]++;
    dx[nu]--;

    // load U(x-mu)_nu
    dx[mu]--;
    Link U3 = arg.gauge(nu, linkIndexShift(x,dx,X), 1-parity);
    dx[mu]++;

    // load U(x)_(-mu) = U(x-mu)_(+mu)
    dx[mu]--;
    Link U4 = arg.gauge(mu, linkIndexShift(x,dx,X),1-parity);
    dx[mu]++;

    // sum this contribution to Fmunu
    F += U1 * conj(U2) * conj(U3) * U4;
  }

  // third leaf: the plaquette in the (+mu,-nu) quadrant
  { // U[dagger](x-nu,nu) U(x-nu,mu) U(x+mu-nu,nu) U[dagger](x,mu)

    // load U(x)_(-nu)
    int dx[4] = {0, 0, 0, 0};
    dx[nu]--;
    Link U1 = arg.gauge(nu, linkIndexShift(x,dx,X), 1-parity);
    dx[nu]++;

    // load U(x-nu)_(+mu)
    dx[nu]--;
    Link U2 = arg.gauge(mu, linkIndexShift(x,dx,X), 1-parity);
    dx[nu]++;

    // load U(x+mu-nu)_(+nu)  (two shifts: parity unchanged)
    dx[mu]++;
    dx[nu]--;
    Link U3 = arg.gauge(nu, linkIndexShift(x,dx,X), parity);
    dx[nu]++;
    dx[mu]--;

    // load U(x)_(+mu)
    Link U4 = arg.gauge(mu, linkIndexShift(x,dx,X), parity);

    // sum this contribution to Fmunu
    F += conj(U1) * U2 * U3 * conj(U4);
  }

  // fourth leaf: the plaquette in the (-mu,-nu) quadrant
  { // U[dagger](x-mu,mu) U[dagger](x-mu-nu,nu) U(x-mu-nu,mu) U(x-nu,nu)

    // load U(x)_(-mu)
    int dx[4] = {0, 0, 0, 0};
    dx[mu]--;
    Link U1 = arg.gauge(mu, linkIndexShift(x,dx,X), 1-parity);
    dx[mu]++;

    // load U(x-mu)_(-nu) = U(x-mu-nu)_(+nu)  (two shifts: parity unchanged)
    dx[mu]--;
    dx[nu]--;
    Link U2 = arg.gauge(nu, linkIndexShift(x,dx,X), parity);
    dx[nu]++;
    dx[mu]++;

    // load U(x-nu)_mu
    dx[mu]--;
    dx[nu]--;
    Link U3 = arg.gauge(mu, linkIndexShift(x,dx,X), parity);
    dx[nu]++;
    dx[mu]++;

    // load U(x)_(-nu) = U(x-nu)_(+nu)
    dx[nu]--;
    Link U4 = arg.gauge(nu, linkIndexShift(x,dx,X), 1-parity);
    dx[nu]++;

    // sum this contribution to Fmunu
    F += conj(U1) * conj(U2) * U3 * U4;
  }
  // 3 matrix additions, 12 matrix-matrix multiplications, 8 matrix conjugations
  // Each matrix conjugation involves 9 unary minus operations but these are not included in the operation count
  // Each matrix addition involves 18 real additions
  // Each matrix-matrix multiplication involves 9*3 complex multiplications and 9*2 complex additions
  //    = 9*3*6 + 9*2*2 = 198 floating-point ops
  // => Total number of floating point ops per site above is
  //    3*18 + 12*198 =  54 + 2376 = 2430
  {
    // anti-hermitian projection and clover-leaf normalization (1/8 for
    // the four leaves and the factor of 2 from F - F^dagger)
    F -= conj(F); // 18 real subtractions + one matrix conjugation
    F *= static_cast<Float>(0.125); // 18 real multiplications
    // 36 floating point operations here
  }

  // the six independent planes (mu>nu) are stored lower-triangular:
  // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2]
  constexpr int munu_idx = (mu*(mu-1))/2 + nu; // lower-triangular indexing
  arg.f(munu_idx, idx, parity) = F;
 }
167 
168 
169  template<typename Float, typename Arg>
170  __global__ void computeFmunuKernel(Arg arg){
171  int x_cb = threadIdx.x + blockIdx.x*blockDim.x;
172  int parity = threadIdx.y + blockIdx.y*blockDim.y;
173  int mu_nu = threadIdx.z + blockIdx.z*blockDim.z;
174  if (x_cb >= arg.threads) return;
175  if (mu_nu >= 6) return;
176 
177  switch(mu_nu) { // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2]
178  case 0: computeFmunuCore<1,0,Float>(arg, x_cb, parity); break;
179  case 1: computeFmunuCore<2,0,Float>(arg, x_cb, parity); break;
180  case 2: computeFmunuCore<2,1,Float>(arg, x_cb, parity); break;
181  case 3: computeFmunuCore<3,0,Float>(arg, x_cb, parity); break;
182  case 4: computeFmunuCore<3,1,Float>(arg, x_cb, parity); break;
183  case 5: computeFmunuCore<3,2,Float>(arg, x_cb, parity); break;
184  }
185  }
186 
187  template<typename Float, typename Arg>
188  void computeFmunuCPU(Arg &arg) {
189  for (int parity=0; parity<2; parity++) {
190  for (int x_cb=0; x_cb<arg.threads; x_cb++) {
191  for (int mu=0; mu<4; mu++) {
192  for (int nu=0; nu<mu; nu++) {
193  int mu_nu = (mu*(mu-1))/2 + nu;
194  switch(mu_nu) { // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2]
195  case 0: computeFmunuCore<1,0,Float>(arg, x_cb, parity); break;
196  case 1: computeFmunuCore<2,0,Float>(arg, x_cb, parity); break;
197  case 2: computeFmunuCore<2,1,Float>(arg, x_cb, parity); break;
198  case 3: computeFmunuCore<3,0,Float>(arg, x_cb, parity); break;
199  case 4: computeFmunuCore<3,1,Float>(arg, x_cb, parity); break;
200  case 5: computeFmunuCore<3,2,Float>(arg, x_cb, parity); break;
201  }
202  }
203  }
204  }
205  }
206  }
207 
208 
209  template<typename Float, typename Arg>
210  class FmunuCompute : TunableVectorYZ {
211  Arg &arg;
212  const GaugeField &meta;
213  const QudaFieldLocation location;
214 
215  private:
216  unsigned int minThreads() const { return arg.threads; }
217  bool tuneGridDim() const { return false; }
218 
219  public:
220  FmunuCompute(Arg &arg, const GaugeField &meta, QudaFieldLocation location)
221  : TunableVectorYZ(2,6), arg(arg), meta(meta), location(location) {
222  writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,meta.Stride(),sizeof(Float));
223  }
224  virtual ~FmunuCompute() {}
225 
226  void apply(const cudaStream_t &stream){
227  if (location == QUDA_CUDA_FIELD_LOCATION) {
228  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
229  computeFmunuKernel<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
230  } else {
231  computeFmunuCPU<Float>(arg);
232  }
233  }
234 
235  TuneKey tuneKey() const {
236  return TuneKey(meta.VolString(), typeid(*this).name(), aux);
237  }
238 
239  long long flops() const { return (2430 + 36)*6*2*(long long)arg.threads; }
240  long long bytes() const { return ((16*arg.gauge.Bytes() + arg.f.Bytes())*6*2*arg.threads); } // Ignores link reconstruction
241 
242  }; // FmunuCompute
243 
244 
245  template<typename Float, typename Fmunu, typename Gauge>
246  void computeFmunu(Fmunu f_munu, Gauge gauge, const GaugeField &meta, const GaugeField &meta_ex, QudaFieldLocation location) {
247  FmunuArg<Float,Fmunu,Gauge> arg(f_munu, gauge, meta, meta_ex);
248  FmunuCompute<Float,FmunuArg<Float,Fmunu,Gauge> > fmunuCompute(arg, meta, location);
249  fmunuCompute.apply(0);
251  checkCudaError();
252  }
253 
254  template<typename Float>
255  void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge, QudaFieldLocation location) {
256  if (Fmunu.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
257  if (gauge.isNative()) {
258  typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
259 
260  if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
261  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
262  computeFmunu<Float>(F(Fmunu), G(gauge), Fmunu, gauge, location);
263  } else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12) {
264  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
265  computeFmunu<Float>(F(Fmunu), G(gauge), Fmunu, gauge, location);
266  } else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_8) {
267  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
268  computeFmunu<Float>(F(Fmunu), G(gauge), Fmunu, gauge, location);
269  } else {
270  errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
271  }
272  } else {
273  errorQuda("Gauge field order %d not supported", gauge.Order());
274  }
275  } else {
276  errorQuda("Fmunu field order %d not supported", Fmunu.Order());
277  }
278 
279  }
280 
281 #endif // GPU_GAUGE_TOOLS
282 
283  void computeFmunu(GaugeField &Fmunu, const GaugeField& gauge, QudaFieldLocation location){
284 
285 #ifdef GPU_GAUGE_TOOLS
286  if (Fmunu.Precision() != gauge.Precision()) {
287  errorQuda("Fmunu precision %d must match gauge precision %d", Fmunu.Precision(), gauge.Precision());
288  }
289 
290  if (gauge.Precision() == QUDA_DOUBLE_PRECISION){
291  computeFmunu<double>(Fmunu, gauge, location);
292  } else if(gauge.Precision() == QUDA_SINGLE_PRECISION) {
293  computeFmunu<float>(Fmunu, gauge, location);
294  } else {
295  errorQuda("Precision %d not supported", gauge.Precision());
296  }
297  return;
298 #else
299  errorQuda("Fmunu has not been built");
300 #endif // GPU_GAUGE_TOOLS
301 
302  }
303 
304 } // namespace quda
305 
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge, QudaFieldLocation location)
cudaStream_t * stream
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
int int int enum cudaChannelFormatKind f
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
enum QudaFieldLocation_s QudaFieldLocation
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)