QUDA  v1.1.0
A library for QCD on GPUs
clover_outer_product.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 
4 #include <tune_quda.h>
5 #include <gauge_field_order.h>
6 #include <color_spinor_field_order.h>
7 #include <quda_matrix.h>
8 #include <color_spinor.h>
9 #include <dslash_quda.h>
10 
11 namespace quda {
12 
13  enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };
14 
15  template<typename Float, int nColor_, QudaReconstructType recon>
16  struct CloverForceArg {
17  typedef typename mapper<Float>::type real;
18  static constexpr int nColor = nColor_;
19  static constexpr int nSpin = 4;
20  static constexpr int spin_project = true;
21  using F = typename colorspinor_mapper<Float, nSpin, nColor, spin_project>::type;
22  using Gauge = typename gauge_mapper<Float, recon, 18>::type;
23  using Force = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO, 18>::type;
24 
25  const F inA;
26  const F inB;
27  const F inC;
28  const F inD;
29  Gauge gauge;
30  Force force;
31  unsigned int length;
32  int X[4];
33  unsigned int parity;
34  unsigned int dir;
35  unsigned int displacement;
36  OprodKernelType kernelType;
37  bool partitioned[4];
38  Float coeff;
39 
40  CloverForceArg(GaugeField &force, const GaugeField &gauge, const ColorSpinorField &inA, const ColorSpinorField &inB,
41  const ColorSpinorField &inC, const ColorSpinorField &inD, const unsigned int parity, const double coeff) :
42  inA(inA),
43  inB(inB),
44  inC(inC),
45  inD(inD),
46  gauge(gauge),
47  force(force),
48  length(gauge.VolumeCB()),
49  parity(parity),
50  dir(5),
51  displacement(1),
52  kernelType(OPROD_INTERIOR_KERNEL),
53  coeff(coeff)
54  {
55  for (int i=0; i<4; ++i) this->X[i] = gauge.X()[i];
56  for (int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
57  }
58  };
59 
60  template<typename real, typename Arg> __global__ void interiorOprodKernel(Arg arg)
61  {
62  typedef complex<real> Complex;
63  int idx = blockIdx.x*blockDim.x + threadIdx.x;
64 
65  ColorSpinor<real, Arg::nColor, 4> A, B_shift, C, D_shift;
66  Matrix<Complex, Arg::nColor> U, result, temp;
67 
68  while (idx<arg.length) {
69  A = arg.inA(idx, 0);
70  C = arg.inC(idx, 0);
71 
72 #pragma unroll
73  for (int dim=0; dim<4; ++dim) {
74  int shift[4] = {0,0,0,0};
75  shift[dim] = 1;
76  const int nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
77 
78  if (nbr_idx >= 0) {
79  B_shift = arg.inB(nbr_idx, 0);
80  D_shift = arg.inD(nbr_idx, 0);
81 
82  B_shift = (B_shift.project(dim,1)).reconstruct(dim,1);
83  result = outerProdSpinTrace(B_shift,A);
84 
85  D_shift = (D_shift.project(dim,-1)).reconstruct(dim,-1);
86  result += outerProdSpinTrace(D_shift,C);
87 
88  temp = arg.force(dim, idx, arg.parity);
89  U = arg.gauge(dim, idx, arg.parity);
90  result = temp + U*result*arg.coeff;
91  arg.force(dim, idx, arg.parity) = result;
92  }
93  } // dim
94 
95  idx += gridDim.x*blockDim.x;
96  }
97  } // interiorOprodKernel
98 
99  template<int dim, typename real, typename Arg> __global__ void exteriorOprodKernel(Arg arg)
100  {
101  typedef complex<real> Complex;
102  int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
103 
104  ColorSpinor<real, Arg::nColor, 4> A, B_shift, C, D_shift;
105  ColorSpinor<real, Arg::nColor, 2> projected_tmp;
106  Matrix<Complex, Arg::nColor> U, result, temp;
107 
108  int x[4];
109  while (cb_idx<arg.length) {
110  coordsFromIndexExterior(x, cb_idx, arg.X, dim, arg.displacement, arg.parity);
111  const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);
112  A = arg.inA(bulk_cb_idx, 0);
113  C = arg.inC(bulk_cb_idx, 0);
114 
115  projected_tmp = arg.inB.Ghost(dim, 1, cb_idx, 0);
116  B_shift = projected_tmp.reconstruct(dim, 1);
117  result = outerProdSpinTrace(B_shift,A);
118 
119  projected_tmp = arg.inD.Ghost(dim, 1, cb_idx, 0);
120  D_shift = projected_tmp.reconstruct(dim,-1);
121  result += outerProdSpinTrace(D_shift,C);
122 
123  temp = arg.force(dim, bulk_cb_idx, arg.parity);
124  U = arg.gauge(dim, bulk_cb_idx, arg.parity);
125  result = temp + U*result*arg.coeff;
126  arg.force(dim, bulk_cb_idx, arg.parity) = result;
127 
128  cb_idx += gridDim.x*blockDim.x;
129  }
130  } // exteriorOprodKernel
131 
132  template<typename Float, typename Arg>
133  class CloverForce : public Tunable {
134  Arg &arg;
135  const GaugeField &meta;
136 
137  unsigned int sharedBytesPerThread() const { return 0; }
138  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
139 
140  unsigned int minThreads() const { return arg.length; }
141  bool tuneGridDim() const { return false; }
142 
143  public:
144  CloverForce(Arg &arg, GaugeField &meta) :
145  arg(arg), meta(meta) {
146  writeAuxString(meta.AuxString());
147  // this sets the communications pattern for the packing kernel
148  int comms[QUDA_MAX_DIM] = { commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3) };
149  setPackComms(comms);
150  }
151 
152  void apply(const qudaStream_t &stream) {
153  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
154  // Disable tuning for the time being
155  TuneParam tp = tuneLaunch(*this,getTuning(),getVerbosity());
156 
157  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
158  qudaLaunchKernel(interiorOprodKernel<Float, Arg>, tp, stream, arg);
159  } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
160  if (arg.dir == 0) qudaLaunchKernel(exteriorOprodKernel<0,Float,Arg>, tp, stream, arg);
161  else if (arg.dir == 1) qudaLaunchKernel(exteriorOprodKernel<1,Float,Arg>, tp, stream, arg);
162  else if (arg.dir == 2) qudaLaunchKernel(exteriorOprodKernel<2,Float,Arg>, tp, stream, arg);
163  else if (arg.dir == 3) qudaLaunchKernel(exteriorOprodKernel<3,Float,Arg>, tp, stream, arg);
164  } else {
165  errorQuda("Kernel type not supported\n");
166  }
167  }else{ // run the CPU code
168  errorQuda("No CPU support for staggered outer-product calculation\n");
169  }
170  } // apply
171 
172  void preTune() {
173  this->arg.force.save();
174  }
175  void postTune() {
176  this->arg.force.load();
177  }
178 
179  long long flops() const {
180  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
181  return ((long long)arg.length)*4*(24 + 144 + 234); // spin project + spin trace + multiply-add
182  } else {
183  return ((long long)arg.length)*(144 + 234); // spin trace + multiply-add
184  }
185  }
186  long long bytes() const {
187  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
188  return arg.length*(arg.inA.Bytes() + arg.inC.Bytes() + 4*(arg.inB.Bytes() + arg.inD.Bytes() + 2*arg.force.Bytes() + arg.gauge.Bytes()));
189  } else {
190  return arg.length*(arg.inA.Bytes() + arg.inB.Bytes()/2 + arg.inC.Bytes() + arg.inD.Bytes()/2 + 2*arg.force.Bytes() + arg.gauge.Bytes());
191  }
192  }
193 
194  TuneKey tuneKey() const {
195  char new_aux[TuneKey::aux_n];
196  strcpy(new_aux, aux);
197  if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
198  strcat(new_aux, ",interior");
199  } else {
200  strcat(new_aux, ",exterior");
201  if (arg.dir==0) strcat(new_aux, ",dir=0");
202  else if (arg.dir==1) strcat(new_aux, ",dir=1");
203  else if (arg.dir==2) strcat(new_aux, ",dir=2");
204  else if (arg.dir==3) strcat(new_aux, ",dir=3");
205  }
206  return TuneKey(meta.VolString(), "CloverForce", new_aux);
207  }
208  }; // CloverForce
209 
210  void exchangeGhost(cudaColorSpinorField &a, int parity, int dag) {
211  // need to enable packing in temporal direction to get spin-projector correct
212  pushKernelPackT(true);
213 
214  // first transfer src1
215  qudaDeviceSynchronize();
216 
217  MemoryLocation location[2*QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
218  a.pack(1, 1-parity, dag, Nstream-1, location, Device);
219 
220  qudaDeviceSynchronize();
221 
222  for (int i=3; i>=0; i--) {
223  if (commDimPartitioned(i)) {
224  // Initialize the host transfer from the source spinor
225  a.gather(1, dag, 2*i);
226  } // commDim(i)
227  } // i=3,..,0
228 
229  qudaDeviceSynchronize(); comm_barrier();
230 
231  for (int i=3; i>=0; i--) {
232  if (commDimPartitioned(i)) {
233  a.commsStart(1, 2*i, dag);
234  }
235  }
236 
237  for (int i=3; i>=0; i--) {
238  if (commDimPartitioned(i)) {
239  a.commsWait(1, 2*i, dag);
240  a.scatter(1, dag, 2*i);
241  }
242  }
243 
244  qudaDeviceSynchronize();
245  popKernelPackT(); // restore packing state
246 
247  a.bufferIndex = (1 - a.bufferIndex);
248  comm_barrier();
249  }
250 
251  template <typename Float, QudaReconstructType recon>
252  void computeCloverForce(GaugeField &force, const GaugeField &gauge, const ColorSpinorField& inA, const ColorSpinorField& inB,
253  const ColorSpinorField& inC, const ColorSpinorField& inD, int parity, const double coeff)
254  {
255  // Create the arguments for the interior kernel
256  CloverForceArg<Float, 3, recon> arg(force, gauge, inA, inB, inC, inD, parity, coeff);
257  CloverForce<Float,decltype(arg)> oprod(arg, force);
258 
259  arg.kernelType = OPROD_INTERIOR_KERNEL;
260  arg.length = inA.VolumeCB();
261  oprod.apply(0);
262 
263  for (int i=3; i>=0; i--) {
264  if (commDimPartitioned(i)) {
265  // update parameters for this exterior kernel
266  arg.kernelType = OPROD_EXTERIOR_KERNEL;
267  arg.dir = i;
268  arg.length = inA.GhostFaceCB()[i];
269  arg.displacement = 1; // forwards displacement
270  oprod.apply(0);
271  }
272  } // i=3,..,0
273  } // computeCloverForce
274 
275  void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField *> &x,
276  std::vector<ColorSpinorField *> &p, std::vector<double> &coeff)
277  {
278 #ifdef GPU_CLOVER_DIRAC
279  if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER) errorQuda("Unsupported output ordering: %d\n", force.Order());
280  checkPrecision(*x[0], *p[0], force, U);
281 
282  int dag = 1;
283 
284  for (unsigned int i=0; i<x.size(); i++) {
285  static_cast<cudaColorSpinorField&>(x[i]->Even()).allocateGhostBuffer(1);
286  static_cast<cudaColorSpinorField&>(x[i]->Odd()).allocateGhostBuffer(1);
287  static_cast<cudaColorSpinorField&>(p[i]->Even()).allocateGhostBuffer(1);
288  static_cast<cudaColorSpinorField&>(p[i]->Odd()).allocateGhostBuffer(1);
289 
290  for (int parity=0; parity<2; parity++) {
291 
292  ColorSpinorField& inA = (parity&1) ? p[i]->Odd() : p[i]->Even();
293  ColorSpinorField& inB = (parity&1) ? x[i]->Even(): x[i]->Odd();
294  ColorSpinorField& inC = (parity&1) ? x[i]->Odd() : x[i]->Even();
295  ColorSpinorField& inD = (parity&1) ? p[i]->Even(): p[i]->Odd();
296 
297  if (x[0]->Precision() == QUDA_DOUBLE_PRECISION) {
298  exchangeGhost(static_cast<cudaColorSpinorField&>(inB), parity, dag);
299  exchangeGhost(static_cast<cudaColorSpinorField&>(inD), parity, 1-dag);
300 
301  if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
302  computeCloverForce<double, QUDA_RECONSTRUCT_NO>(force, U, inA, inB, inC, inD, parity, coeff[i]);
303  } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
304  computeCloverForce<double, QUDA_RECONSTRUCT_12>(force, U, inA, inB, inC, inD, parity, coeff[i]);
305  } else {
306  errorQuda("Unsupported recontruction type");
307  }
308  } else {
309  errorQuda("Unsupported precision: %d\n", x[0]->Precision());
310  }
311  }
312  }
313 #else // GPU_CLOVER_DIRAC not defined
314  errorQuda("Clover Dirac operator has not been built!");
315 #endif
316 
317  } // computeCloverForce
318 
319 } // namespace quda