#include <gauge_field_order.h>
#include <color_spinor_field_order.h>
#include <quda_matrix.h>
#include <color_spinor.h>
#include <dslash_quda.h>

enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };
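
// The force contribution is computed in two phases: an interior kernel covers
// every local site whose forward neighbor is also resident on this rank, and
// per-dimension exterior kernels add the face terms that depend on halo
// (ghost-zone) data received from neighboring ranks.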
template<typename Float, int nColor_, QudaReconstructType recon>
struct CloverForceArg {
  typedef typename mapper<Float>::type real;
  static constexpr int nColor = nColor_;
  static constexpr int nSpin = 4;
  static constexpr bool spin_project = true;
  using F = typename colorspinor_mapper<Float, nSpin, nColor, spin_project>::type;
  using Gauge = typename gauge_mapper<Float, recon, 18>::type;
  using Force = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO, 18>::type;

  F inA, inB, inC, inD;      // input spinor fields
  Gauge gauge;               // input gauge field
  Force force;               // force accumulator (read-modify-write output)
  int X[4];                  // local lattice dimensions
  unsigned int parity, dir;  // checkerboard parity; active exterior direction
  unsigned int displacement; // halo width
  unsigned int length;       // number of sites the kernel iterates over
  bool partitioned[4];       // which dimensions are split across ranks
  OprodKernelType kernelType;
  real coeff;

  CloverForceArg(GaugeField &force, const GaugeField &gauge, const ColorSpinorField &inA, const ColorSpinorField &inB,
                 const ColorSpinorField &inC, const ColorSpinorField &inD, const unsigned int parity, const double coeff) :
    inA(inA), inB(inB), inC(inC), inD(inD),
    gauge(gauge), force(force),
    parity(parity), displacement(1),
    length(gauge.VolumeCB()),
    kernelType(OPROD_INTERIOR_KERNEL),
    coeff(coeff)
  {
    for (int i=0; i<4; ++i) this->X[i] = gauge.X()[i];
    for (int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
  }
};
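
// Note: because spin_project is enabled, the B and D halos are exchanged in
// spin-projected two-component form; the exterior kernel therefore receives a
// ColorSpinor<real, nColor, 2> from Ghost() and reconstructs the four-spinor
// on the fly.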
template<typename real, typename Arg> __global__ void interiorOprodKernel(Arg arg)
{
  typedef complex<real> Complex;
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

  ColorSpinor<real, Arg::nColor, 4> A, B_shift, C, D_shift;
  Matrix<Complex, Arg::nColor> U, result, temp;

  while (idx < arg.length) {
    A = arg.inA(idx, 0);
    C = arg.inC(idx, 0);

    for (int dim=0; dim<4; ++dim) {
      int shift[4] = {0,0,0,0};
      shift[dim] = 1;
      const int nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);

      if (nbr_idx >= 0) { // neighbor is local; face sites are handled by the exterior kernels
        B_shift = arg.inB(nbr_idx, 0);
        D_shift = arg.inD(nbr_idx, 0);

        B_shift = (B_shift.project(dim,1)).reconstruct(dim,1);
        result = outerProdSpinTrace(B_shift,A);

        D_shift = (D_shift.project(dim,-1)).reconstruct(dim,-1);
        result += outerProdSpinTrace(D_shift,C);

        temp = arg.force(dim, idx, arg.parity);
        U = arg.gauge(dim, idx, arg.parity);
        result = temp + U*result*arg.coeff;
        arg.force(dim, idx, arg.parity) = result;
      }
    } // dim

    idx += gridDim.x*blockDim.x;
  }
} // interiorOprodKernel
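
// Schematically, for every site x and direction mu the kernel above accumulates
//   force_mu(x) += coeff * U_mu(x) * Tr_spin[ P(+mu) B(x+mu) A^dag(x)
//                                           + P(-mu) D(x+mu) C^dag(x) ],
// where P(+/-mu) denotes the spin projector applied via project()/reconstruct()
// and the spin trace is taken by outerProdSpinTrace(), which returns a color matrix.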
template<int dim, typename real, typename Arg> __global__ void exteriorOprodKernel(Arg arg)
{
  typedef complex<real> Complex;
  int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;

  ColorSpinor<real, Arg::nColor, 4> A, B_shift, C, D_shift;
  ColorSpinor<real, Arg::nColor, 2> projected_tmp;
  Matrix<Complex, Arg::nColor> U, result, temp;

  int x[4];
  while (cb_idx < arg.length) {
    coordsFromIndexExterior(x, cb_idx, arg.X, dim, arg.displacement, arg.parity);
    const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);
    A = arg.inA(bulk_cb_idx, 0);
    C = arg.inC(bulk_cb_idx, 0);

    projected_tmp = arg.inB.Ghost(dim, 1, cb_idx, 0);
    B_shift = projected_tmp.reconstruct(dim, 1);
    result = outerProdSpinTrace(B_shift,A);

    projected_tmp = arg.inD.Ghost(dim, 1, cb_idx, 0);
    D_shift = projected_tmp.reconstruct(dim,-1);
    result += outerProdSpinTrace(D_shift,C);

    temp = arg.force(dim, bulk_cb_idx, arg.parity);
    U = arg.gauge(dim, bulk_cb_idx, arg.parity);
    result = temp + U*result*arg.coeff;
    arg.force(dim, bulk_cb_idx, arg.parity) = result;

    cb_idx += gridDim.x*blockDim.x;
  }
} // exteriorOprodKernel
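
// The exterior kernel mirrors the interior one, except that the shifted B and
// D spinors are read from the ghost buffer (already spin projected), while A,
// C, the link U, and the force accumulator are addressed through the bulk
// checkerboard index recovered from the face coordinates.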
template<typename Float, typename Arg>
class CloverForce : public Tunable {
  Arg &arg;
  const GaugeField &meta;

  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

  unsigned int minThreads() const { return arg.length; }
  bool tuneGridDim() const { return false; }

public:
  CloverForce(Arg &arg, GaugeField &meta) :
    arg(arg), meta(meta) {
    writeAuxString(meta.AuxString());
    // this sets the communications pattern for the packing kernel
    int comms[QUDA_MAX_DIM] = { commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3) };
    setPackComms(comms);
  }
  void apply(const qudaStream_t &stream) {
    if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
        qudaLaunchKernel(interiorOprodKernel<Float, Arg>, tp, stream, arg);
      } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
        if (arg.dir == 0) qudaLaunchKernel(exteriorOprodKernel<0,Float,Arg>, tp, stream, arg);
        else if (arg.dir == 1) qudaLaunchKernel(exteriorOprodKernel<1,Float,Arg>, tp, stream, arg);
        else if (arg.dir == 2) qudaLaunchKernel(exteriorOprodKernel<2,Float,Arg>, tp, stream, arg);
        else if (arg.dir == 3) qudaLaunchKernel(exteriorOprodKernel<3,Float,Arg>, tp, stream, arg);
      } else {
        errorQuda("Kernel type not supported\n");
      }
    } else { // run the CPU code
      errorQuda("No CPU support for clover outer-product calculation\n");
    }
  }

  // the force field is both read and written, so back it up before autotuning
  // and restore it afterwards
  void preTune() { this->arg.force.save(); }
  void postTune() { this->arg.force.load(); }
  long long flops() const {
    if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
      return ((long long)arg.length)*4*(24 + 144 + 234); // spin project + spin trace + multiply-add, per direction
    } else {
      return ((long long)arg.length)*(144 + 234); // spin trace + multiply-add
    }
  }

  long long bytes() const {
    if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
      return arg.length*(arg.inA.Bytes() + arg.inC.Bytes() + 4*(arg.inB.Bytes() + arg.inD.Bytes() + 2*arg.force.Bytes() + arg.gauge.Bytes()));
    } else {
      return arg.length*(arg.inA.Bytes() + arg.inB.Bytes()/2 + arg.inC.Bytes() + arg.inD.Bytes()/2 + 2*arg.force.Bytes() + arg.gauge.Bytes());
    }
  }
  TuneKey tuneKey() const {
    char new_aux[TuneKey::aux_n];
    strcpy(new_aux, aux);
    if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
      strcat(new_aux, ",interior");
    } else {
      strcat(new_aux, ",exterior");
      if (arg.dir==0) strcat(new_aux, ",dir=0");
      else if (arg.dir==1) strcat(new_aux, ",dir=1");
      else if (arg.dir==2) strcat(new_aux, ",dir=2");
      else if (arg.dir==3) strcat(new_aux, ",dir=3");
    }
    return TuneKey(meta.VolString(), "CloverForce", new_aux);
  }
}; // CloverForce
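
// Each kernel variant (interior, and exterior in each direction) carries a
// distinct aux string, so the autotuner caches launch parameters for every
// variant separately.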
void exchangeGhost(cudaColorSpinorField &a, int parity, int dag) {
  // need to enable packing in temporal direction to get spin-projector correct
  pushKernelPackT(true);

  // first transfer src1
  qudaDeviceSynchronize();

  MemoryLocation location[2*QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
  a.pack(1, 1-parity, dag, Nstream-1, location, Device);

  qudaDeviceSynchronize();

  for (int i=3; i>=0; i--) {
    if (commDimPartitioned(i)) {
      // Initialize the host transfer from the source spinor
      a.gather(1, dag, 2*i);
    }
  }

  qudaDeviceSynchronize(); comm_barrier();

  for (int i=3; i>=0; i--) {
    if (commDimPartitioned(i)) { a.commsStart(1, 2*i, dag); }
  }

  for (int i=3; i>=0; i--) {
    if (commDimPartitioned(i)) {
      a.commsWait(1, 2*i, dag);
      a.scatter(1, dag, 2*i);
    }
  }

  qudaDeviceSynchronize();
  popKernelPackT(); // restore packing state

  a.bufferIndex = (1 - a.bufferIndex);
}
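
// Note: flipping bufferIndex above switches the double-buffered ghost zones,
// making the freshly received halo the one the exterior kernels will read.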
template <typename Float, QudaReconstructType recon>
void computeCloverForce(GaugeField &force, const GaugeField &gauge, const ColorSpinorField& inA, const ColorSpinorField& inB,
                        const ColorSpinorField& inC, const ColorSpinorField& inD, int parity, const double coeff)
{
  // Create the arguments for the interior kernel
  CloverForceArg<Float, 3, recon> arg(force, gauge, inA, inB, inC, inD, parity, coeff);
  CloverForce<Float,decltype(arg)> oprod(arg, force);

  arg.kernelType = OPROD_INTERIOR_KERNEL;
  arg.length = inA.VolumeCB();
  oprod.apply(0);

  for (int i=3; i>=0; i--) {
    if (commDimPartitioned(i)) {
      // update parameters for this exterior kernel
      arg.kernelType = OPROD_EXTERIOR_KERNEL;
      arg.dir = i;
      arg.length = inA.GhostFaceCB()[i];
      arg.displacement = 1; // forwards displacement
      oprod.apply(0);
    }
  }
} // computeCloverForce
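
// Each partitioned dimension gets its own exterior launch: arg.length is
// reset to that face's checkerboard volume and arg.dir selects which
// template instantiation apply() dispatches to.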
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField*> &x,
                        std::vector<ColorSpinorField*> &p, std::vector<double> &coeff)
{
#ifdef GPU_CLOVER_DIRAC
  if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER) errorQuda("Unsupported output ordering: %d\n", force.Order());
  checkPrecision(*x[0], *p[0], force, U);

  int dag = 1;

  for (unsigned int i=0; i<x.size(); i++) {
    static_cast<cudaColorSpinorField&>(x[i]->Even()).allocateGhostBuffer(1);
    static_cast<cudaColorSpinorField&>(x[i]->Odd()).allocateGhostBuffer(1);
    static_cast<cudaColorSpinorField&>(p[i]->Even()).allocateGhostBuffer(1);
    static_cast<cudaColorSpinorField&>(p[i]->Odd()).allocateGhostBuffer(1);

    for (int parity=0; parity<2; parity++) {
      ColorSpinorField& inA = (parity&1) ? p[i]->Odd() : p[i]->Even();
      ColorSpinorField& inB = (parity&1) ? x[i]->Even(): x[i]->Odd();
      ColorSpinorField& inC = (parity&1) ? x[i]->Odd() : x[i]->Even();
      ColorSpinorField& inD = (parity&1) ? p[i]->Even(): p[i]->Odd();

      if (x[0]->Precision() == QUDA_DOUBLE_PRECISION) {
        exchangeGhost(static_cast<cudaColorSpinorField&>(inB), parity, dag);
        exchangeGhost(static_cast<cudaColorSpinorField&>(inD), parity, 1-dag);

        if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
          computeCloverForce<double, QUDA_RECONSTRUCT_NO>(force, U, inA, inB, inC, inD, parity, coeff[i]);
        } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
          computeCloverForce<double, QUDA_RECONSTRUCT_12>(force, U, inA, inB, inC, inD, parity, coeff[i]);
        } else {
          errorQuda("Unsupported reconstruction type");
        }
      } else {
        errorQuda("Unsupported precision: %d\n", x[0]->Precision());
      }
    }
  }
#else // GPU_CLOVER_DIRAC not defined
  errorQuda("Clover Dirac operator has not been built!");
#endif
} // computeCloverForce
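
// Illustrative usage (a sketch, not part of this file): with a FLOAT2-ordered
// momentum force accumulator, double-precision solution vectors x[i] and
// partners p[i], each weighted by coeff[i], the entry point is simply
//
//   computeCloverForce(force, U, x, p, coeff);
//
// subject to the precision, ordering, and reconstruction checks above.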