quda-ref/v1.0.0/field__strength__tensor_8cuh_source.html

 #include <gauge_field_order.h>
 #include <index_helper.cuh>
 #include <quda_matrix.h>

 namespace quda
 {

   /**
      @brief Parameter bundle handed to the field-strength kernels.

      Holds the number of checkerboard sites to process, the lattice
      dimensions copied from `meta`, the per-dimension border (half of the
      difference between the dimensions of `meta_ex` and `meta`), and the
      accessors for the output tensor and the input gauge field.

      @tparam Float Not used inside this struct
      @tparam Fmunu Accessor type for the output field
      @tparam Gauge Accessor type for the input gauge field
   */
   template <typename Float, typename Fmunu, typename Gauge> struct FmunuArg {
     int threads;   // number of active threads required
     int X[4];      // grid dimensions
     int border[4]; // half the per-dimension size difference between meta_ex and meta
     Fmunu f;       // accessor that receives the computed tensor
     Gauge gauge;   // accessor used to load the links

     /**
        @param[in] f Accessor for the output field (copied)
        @param[in] gauge Accessor for the gauge field (copied)
        @param[in] meta Field whose VolumeCB() and X() give the thread count and dimensions
        @param[in] meta_ex Field whose X() is compared against meta.X() to obtain the border
     */
     FmunuArg(Fmunu &f, Gauge &gauge, const GaugeField &meta, const GaugeField &meta_ex) :
         threads(meta.VolumeCB()),
         f(f),
         gauge(gauge)
     {
       const int *dims = meta.X();
       const int *dims_ex = meta_ex.X();
       for (int d = 0; d < 4; d++) {
         X[d] = dims[d];
         border[d] = (dims_ex[d] - dims[d]) / 2;
       }
     }
   };

   /**
      @brief Compute one component F[mu,nu] of the field-strength tensor at a
      single site and store it through arg.f.

      Four products of four links each, all lying in the mu-nu plane and all
      starting and ending at site x, are accumulated.  The sum F is then
      replaced by 0.125 * (F - conj(F)) and written to
      arg.f((mu * (mu - 1)) / 2 + nu, idx, parity).

      The site coordinates are obtained from the dimensions in arg.X and are
      then shifted by arg.border, with the dimensions enlarged by twice the
      border, before any link is loaded.  The enlarged dimensions are kept in
      a local array: arg.X itself is never modified, so the same `arg` object
      can safely be passed to this function many times (as computeFmunuCPU
      does).

      @tparam mu First direction of the plane (the callers in this file use mu > nu)
      @tparam nu Second direction of the plane
      @tparam Float Real type of the link-matrix elements
      @tparam Arg Argument type providing X, border, gauge and f
      @param[in,out] arg Kernel arguments; only the field behind arg.f is written
      @param[in] idx Checkerboard site index
      @param[in] parity Parity of the site
   */
   template <int mu, int nu, typename Float, typename Arg>
   __device__ __host__ __forceinline__ void computeFmunuCore(Arg &arg, int idx, int parity)
   {
     typedef Matrix<complex<Float>, 3> Link;

     int x[4];
     // Work on a private copy of the dimensions.  Taking a reference to arg.X
     // and enlarging it in place would corrupt the argument struct for every
     // subsequent call that shares the same arg (the host loop does this).
     int X[4];
     for (int dir = 0; dir < 4; ++dir) X[dir] = arg.X[dir];

     // coordinates from the un-enlarged dimensions ...
     getCoords(x, idx, X, parity);
     // ... then shift both the coordinates and the dimensions by the border
     for (int dir = 0; dir < 4; ++dir) {
       x[dir] += arg.border[dir];
       X[dir] += 2 * arg.border[dir];
     }

     Link F;
     { // U(x,mu) U(x+mu,nu) U[dagger](x+nu,mu) U[dagger](x,nu)

       // load U(x)_(+mu)
       int dx[4] = {0, 0, 0, 0};
       Link U1 = arg.gauge(mu, linkIndexShift(x, dx, X), parity);

       // load U(x+mu)_(+nu)
       dx[mu]++;
       Link U2 = arg.gauge(nu, linkIndexShift(x, dx, X), 1 - parity);
       dx[mu]--;

       // load U(x+nu)_(+mu)
       dx[nu]++;
       Link U3 = arg.gauge(mu, linkIndexShift(x, dx, X), 1 - parity);
       dx[nu]--;

       // load U(x)_(+nu)
       Link U4 = arg.gauge(nu, linkIndexShift(x, dx, X), parity);

       // compute plaquette
       F = U1 * U2 * conj(U3) * conj(U4);
     }

     { // U(x,nu) U[dagger](x+nu-mu,mu) U[dagger](x-mu,nu) U(x-mu, mu)

       // load U(x)_(+nu)
       int dx[4] = {0, 0, 0, 0};
       Link U1 = arg.gauge(nu, linkIndexShift(x, dx, X), parity);

       // load U(x+nu)_(-mu) = U(x+nu-mu)_(+mu)
       dx[nu]++;
       dx[mu]--;
       Link U2 = arg.gauge(mu, linkIndexShift(x, dx, X), parity);
       dx[mu]++;
       dx[nu]--;

       // load U(x-mu)_nu
       dx[mu]--;
       Link U3 = arg.gauge(nu, linkIndexShift(x, dx, X), 1 - parity);
       dx[mu]++;

       // load U(x)_(-mu) = U(x-mu)_(+mu)
       dx[mu]--;
       Link U4 = arg.gauge(mu, linkIndexShift(x, dx, X), 1 - parity);
       dx[mu]++;

       // sum this contribution to Fmunu
       F += U1 * conj(U2) * conj(U3) * U4;
     }

     { // U[dagger](x-nu,nu) U(x-nu,mu) U(x+mu-nu,nu) U[dagger](x,mu)

       // load U(x)_(-nu)
       int dx[4] = {0, 0, 0, 0};
       dx[nu]--;
       Link U1 = arg.gauge(nu, linkIndexShift(x, dx, X), 1 - parity);
       dx[nu]++;

       // load U(x-nu)_(+mu)
       dx[nu]--;
       Link U2 = arg.gauge(mu, linkIndexShift(x, dx, X), 1 - parity);
       dx[nu]++;

       // load U(x+mu-nu)_(+nu)
       dx[mu]++;
       dx[nu]--;
       Link U3 = arg.gauge(nu, linkIndexShift(x, dx, X), parity);
       dx[nu]++;
       dx[mu]--;

       // load U(x)_(+mu)
       Link U4 = arg.gauge(mu, linkIndexShift(x, dx, X), parity);

       // sum this contribution to Fmunu
       F += conj(U1) * U2 * U3 * conj(U4);
     }

     { // U[dagger](x-mu,mu) U[dagger](x-mu-nu,nu) U(x-mu-nu,mu) U(x-nu,nu)

       // load U(x)_(-mu)
       int dx[4] = {0, 0, 0, 0};
       dx[mu]--;
       Link U1 = arg.gauge(mu, linkIndexShift(x, dx, X), 1 - parity);
       dx[mu]++;

       // load U(x-mu)_(-nu) = U(x-mu-nu)_(+nu)
       dx[mu]--;
       dx[nu]--;
       Link U2 = arg.gauge(nu, linkIndexShift(x, dx, X), parity);
       dx[nu]++;
       dx[mu]++;

       // load U(x-mu-nu)_(+mu)
       dx[mu]--;
       dx[nu]--;
       Link U3 = arg.gauge(mu, linkIndexShift(x, dx, X), parity);
       dx[nu]++;
       dx[mu]++;

       // load U(x)_(-nu) = U(x-nu)_(+nu)
       dx[nu]--;
       Link U4 = arg.gauge(nu, linkIndexShift(x, dx, X), 1 - parity);
       dx[nu]++;

       // sum this contribution to Fmunu
       F += conj(U1) * conj(U2) * U3 * U4;
     }
     // 3 matrix additions, 12 matrix-matrix multiplications, 8 matrix conjugations
     // Each matrix conjugation involves 9 unary minus operations but these are not included in the operation count
     // Each matrix addition involves 18 real additions
     // Each matrix-matrix multiplication involves 9*3 complex multiplications and 9*2 complex additions
     // = 9*3*6 + 9*2*2 = 198 floating-point ops
     // => Total number of floating point ops per site above is
     // 3*18 + 12*198 =  54 + 2376 = 2430
     {
       F -= conj(F);                   // 18 real subtractions + one matrix conjugation
       F *= static_cast<Float>(0.125); // 18 real multiplications
                                       // 36 floating point operations here
     }

     constexpr int munu_idx = (mu * (mu - 1)) / 2 + nu; // lower-triangular indexing
     arg.f(munu_idx, idx, parity) = F;
   }

   /**
      @brief Device entry point: each thread computes one F[mu,nu] component
      at one site.

      Thread layout read by this kernel:
        - x dimension: checkerboard site index, valid range [0, arg.threads)
        - y dimension: parity, valid range [0, 2)
        - z dimension: plane index mu_nu, valid range [0, 6)
      Threads outside any of these ranges exit without touching memory, so
      the launch extents may overshoot in every dimension.

      @tparam Float Real type forwarded to computeFmunuCore
      @tparam Arg Argument struct type
      @param[in] arg Kernel arguments, passed by value
   */
   template <typename Float, typename Arg> __global__ void computeFmunuKernel(Arg arg)
   {
     int x_cb = threadIdx.x + blockIdx.x * blockDim.x;
     int parity = threadIdx.y + blockIdx.y * blockDim.y;
     int mu_nu = threadIdx.z + blockIdx.z * blockDim.z;
     if (x_cb >= arg.threads) return;
     if (parity >= 2) return; // guard the y extent the same way as x and z
     if (mu_nu >= 6) return;

     switch (mu_nu) { // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2]
     case 0: computeFmunuCore<1, 0, Float>(arg, x_cb, parity); break;
     case 1: computeFmunuCore<2, 0, Float>(arg, x_cb, parity); break;
     case 2: computeFmunuCore<2, 1, Float>(arg, x_cb, parity); break;
     case 3: computeFmunuCore<3, 0, Float>(arg, x_cb, parity); break;
     case 4: computeFmunuCore<3, 1, Float>(arg, x_cb, parity); break;
     case 5: computeFmunuCore<3, 2, Float>(arg, x_cb, parity); break;
     }
   }

   /**
      @brief Host driver: calls computeFmunuCore for both parities, every
      checkerboard site in [0, arg.threads) and all six (mu, nu) planes.

      @tparam Float Real type forwarded to computeFmunuCore
      @tparam Arg Argument struct type
      @param[in,out] arg Arguments shared by reference across all calls
   */
   template <typename Float, typename Arg> void computeFmunuCPU(Arg &arg)
   {
     for (int parity = 0; parity < 2; parity++) {
       for (int site = 0; site < arg.threads; site++) {
         // Walk the lower-triangular plane index directly; indices 0..5 map to
         // F[1,0], F[2,0], F[2,1], F[3,0], F[3,1], F[3,2] in that order.
         for (int plane = 0; plane < 6; plane++) {
           switch (plane) {
           case 0: computeFmunuCore<1, 0, Float>(arg, site, parity); break;
           case 1: computeFmunuCore<2, 0, Float>(arg, site, parity); break;
           case 2: computeFmunuCore<2, 1, Float>(arg, site, parity); break;
           case 3: computeFmunuCore<3, 0, Float>(arg, site, parity); break;
           case 4: computeFmunuCore<3, 1, Float>(arg, site, parity); break;
           case 5: computeFmunuCore<3, 2, Float>(arg, site, parity); break;
           }
         }
       }
     }
   }

 } // namespace quda
mu
double mu
Definition: test_util.cpp:1648

quda::linkIndexShift
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
Definition: index_helper.cuh:13

quda::FmunuArg::border
int border[4]
Definition: field_strength_tensor.cuh:11

quda
Definition: blas_cublas.h:5

quda::FmunuArg::f
Fmunu f
Definition: field_strength_tensor.cuh:12

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

quda::Arg
Definition: spinor_noise.cu:22

quda::computeFmunuCore
__device__ __host__ __forceinline__ void computeFmunuCore(Arg &arg, int idx, int parity)
Definition: field_strength_tensor.cuh:28

quda_matrix.h

index_helper.cuh

quda::FmunuArg::threads
int threads
Definition: field_strength_tensor.cuh:9

quda::FmunuArg::X
int X[4]
Definition: field_strength_tensor.cuh:10

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:1076

quda::computeFmunuKernel
__global__ void computeFmunuKernel(Arg arg)
Definition: field_strength_tensor.cuh:166

quda::computeFmunuCPU
void computeFmunuCPU(Arg &arg)
Definition: field_strength_tensor.cuh:184

quda::FmunuArg
Definition: field_strength_tensor.cuh:8

quda::conj
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130

quda::FmunuArg::FmunuArg
FmunuArg(Fmunu &f, Gauge &gauge, const GaugeField &meta, const GaugeField &meta_ex)
Definition: field_strength_tensor.cuh:15

parity
QudaParity parity
Definition: covdev_test.cpp:54

quda::Matrix
Definition: quda_matrix.h:64

quda::FmunuArg::gauge
Gauge gauge
Definition: field_strength_tensor.cuh:13

quda::getCoords
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
Definition: dslash_helper.cuh:88

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:499

quda::GaugeField
Definition: gauge_field.h:164