#include <quda_matrix.h>
#include <tune_quda.h>
#include <clover_field.h>
#include <gauge_field.h>
#include <gauge_field_order.h>
#include <clover_field_order.h>
#include <instantiate.h>

namespace quda {
  template <typename store_t_>
  struct CloverArg {
    using store_t = store_t_;
    using real = typename mapper<store_t>::type;
    static constexpr int nColor = 3;
    static constexpr int nSpin = 4;

    using Clover = typename clover_mapper<store_t>::type;
    using Fmunu = typename gauge_mapper<store_t, QUDA_RECONSTRUCT_NO>::type;

    Clover clover;  // output clover-field accessor
    const Fmunu f;  // input field-strength (Fmunu) accessor
    int threads;    // number of active threads required
    int X[4];       // grid dimensions
    double coeff;   // clover coefficient

    CloverArg(CloverField &clover, const GaugeField &f, double coeff) :
      clover(clover, false), f(f), threads(f.VolumeCB()), coeff(coeff)
    {
      for (int dir = 0; dir < 4; ++dir) X[dir] = f.X()[dir];
    }
  };
  /*
    Construct the clover term in the chiral basis, with c the clover
    coefficient and F the field-strength tensor, indexed below either by
    Lorentz-index pairs F[mu,nu] or by the packed index F[0..5] used in
    the code.

    Upper-left block (chirality index 0):

      | 1 + c*I*(F[0,1] - F[2,3])                      c*I*(F[1,2] - F[0,3]) + c*(F[0,2] + F[1,3]) |
      | c*I*(F[1,2] - F[0,3]) - c*(F[0,2] + F[1,3])    1 - c*I*(F[0,1] - F[2,3])                   |

    and, in the packed-index notation used in the code below,

      | 1 - c*I*(F[0] - F[5])                    -c*I*(F[2] - F[3]) - c*(F[1] + F[4]) |
      | -c*I*(F[2] - F[3]) + c*(F[1] + F[4])      1 + c*I*(F[0] - F[5])               |

    Lower-right block (chirality index 1):

      | 1 - c*I*(F[0] + F[5])                    -c*I*(F[2] + F[3]) - c*(F[1] - F[4]) |
      | -c*I*(F[2] + F[3]) + c*(F[1] - F[4])      1 + c*I*(F[0] + F[5])               |
  */
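
  /*
    Note (added for orientation, not part of the original comment): in a
    chiral Dirac basis the spin matrices sigma_{mu,nu} are block diagonal in
    chirality, so the clover matrix 1 + c*sigma_{mu,nu}*F[mu,nu] (up to sign
    and normalization conventions) splits into the two independent
    (nSpin/2 * nColor) x (nSpin/2 * nColor) = 6x6 Hermitian blocks written
    out above.  Each block can therefore be stored as a real diagonal plus a
    complex lower triangle, which is what the code below does.
  */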

  // Core routine for constructing the clover term from the field strength
  template <typename Arg>
  __device__ __host__ void cloverComputeCore(Arg &arg, int x_cb, int parity)
  {
    constexpr int N = Arg::nColor * Arg::nSpin / 2;
    using real = typename Arg::real;
    using Complex = complex<real>;
    using Link = Matrix<Complex, Arg::nColor>;

    // Load the field-strength tensor from global memory
    Link F[6];
#pragma unroll
    for (int i = 0; i < 6; ++i) F[i] = arg.f(i, x_cb, parity);

    // the unit imaginary and i*coeff, used to assemble the blocks below
    Complex I(0.0, 1.0);
    Complex coeff(0.0, arg.coeff);
    Link block1[2], block2[2];
    block1[0] = coeff * (F[0] - F[5]); // (18 + 6*9 =) 72 floating-point ops
    block1[1] = coeff * (F[0] + F[5]); // 72 floating-point ops
    block2[0] = arg.coeff * (F[1] + F[4] - I * (F[2] - F[3])); // 126 floating-point ops
    block2[1] = arg.coeff * (F[1] - F[4] - I * (F[2] + F[3])); // 126 floating-point ops
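
    // block1[ch] supplies the two colour sub-blocks on the spin diagonal of
    // chiral block ch (entering with opposite signs), while block2[ch]
    // supplies the spin off-diagonal colour sub-block; see the assignments
    // to A below.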

    // This uses lots of unnecessary memory
#pragma unroll
    for (int ch = 0; ch < 2; ++ch) {
      // Each chiral block is a 6x6 Hermitian matrix, stored as a real
      // diagonal plus a complex lower triangle
      HMatrix<real, N> A;
      // ch = 0 (1) => positive (negative) chiral block
      // Compute the real diagonal elements
#pragma unroll
      for (int i = 0; i < N / 2; ++i) {
        A(i + 0, i + 0) = 1.0 - block1[ch](i, i).real();
        A(i + 3, i + 3) = 1.0 + block1[ch](i, i).real();
      }

      // Compute the off-diagonal components (lower triangle only, since A
      // is Hermitian)
      A(1, 0) = -block1[ch](1, 0);
      A(2, 0) = -block1[ch](2, 0);
      A(2, 1) = -block1[ch](2, 1);

      A(3, 0) = block2[ch](0, 0);
      A(3, 1) = block2[ch](0, 1);
      A(3, 2) = block2[ch](0, 2);

      A(4, 0) = block2[ch](1, 0);
      A(4, 1) = block2[ch](1, 1);
      A(4, 2) = block2[ch](1, 2);
      A(4, 3) = block1[ch](1, 0);

      A(5, 0) = block2[ch](2, 0);
      A(5, 1) = block2[ch](2, 1);
      A(5, 2) = block2[ch](2, 2);
      A(5, 3) = block1[ch](2, 0);
      A(5, 4) = block1[ch](2, 1);

      A *= static_cast<real>(0.5);

      arg.clover(x_cb, parity, ch) = A;
    } // ch
    // 84 floating-point ops
  }

  // GPU kernel: one thread per checkerboard site, with parity in the y dimension
  template <typename Arg> __global__ void cloverComputeKernel(Arg arg)
  {
    int x_cb = threadIdx.x + blockIdx.x * blockDim.x;
    int parity = threadIdx.y + blockIdx.y * blockDim.y;
    if (x_cb >= arg.threads) return;
    cloverComputeCore(arg, x_cb, parity);
  }

  template <typename Arg> void cloverComputeCPU(Arg arg)
  {
    for (int parity = 0; parity < 2; parity++) {
      for (int x_cb = 0; x_cb < arg.threads; x_cb++) {
        cloverComputeCore(arg, x_cb, parity);
      }
    }
  }

  template <typename store_t>
  class CloverCompute : TunableVectorY {
    CloverArg<store_t> arg;
    const GaugeField &meta;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
    bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
    bool tuneGridDim() const { return false; }     // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.threads; }

  public:
    CloverCompute(CloverField &clover, const GaugeField &f, double coeff) :
      TunableVectorY(2), // two parities in the y dimension
      arg(clover, f, coeff),
      meta(f)
    {
      checkNative(clover, f);
      writeAuxString("threads=%d,stride=%d,prec=%lu", arg.threads, arg.clover.stride, sizeof(store_t));
      apply(0);
      qudaDeviceSynchronize();
    }

    void apply(const qudaStream_t &stream)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        qudaLaunchKernel(cloverComputeKernel<decltype(arg)>, tp, stream, arg);
      } else { // run the CPU code
        errorQuda("Not implemented");
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 2 * arg.threads * 480ll; }
    long long bytes() const { return 2 * arg.threads * (6 * arg.f.Bytes() + arg.clover.Bytes()); }
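    // Per-site cost, using the counts noted in cloverComputeCore: 2*72 + 2*126
    // flops for block1/block2 plus 84 for assembling and scaling the chiral
    // blocks, i.e. 480 flops; the factor 2*arg.threads counts both parities.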
  };

  void computeClover(CloverField &clover, const GaugeField &f, double coeff)
  {
#ifdef GPU_CLOVER_DIRAC
    instantiate<CloverCompute>(clover, f, coeff);
#else
    errorQuda("Clover has not been built");
#endif
  }

} // namespace quda
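
/*
  Illustrative usage sketch (not part of this file; the computeFmunu call and
  the GaugeFieldParam setup are assumptions about the surrounding QUDA API,
  shown only to indicate where computeClover fits):

    GaugeFieldParam param(gauge);
    param.reconstruct = QUDA_RECONSTRUCT_NO;
    param.geometry = QUDA_TENSOR_GEOMETRY;
    cudaGaugeField fmunu(param);

    computeFmunu(fmunu, gauge);          // field-strength tensor from the gauge field
    computeClover(clover, fmunu, csw);   // clover term with coefficient csw
*/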