QUDA  v1.1.0
A library for QCD on GPUs
clover_sigma_outer_product.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 
4 #include <tune_quda.h>
5 #include <gauge_field.h>
6 #include <color_spinor_field.h>
7 #include <dslash_quda.h>
8 
9 #include <jitify_helper.cuh>
10 #include <kernels/clover_sigma_outer_product.cuh>
11 
12 namespace quda {
13 
14 #ifdef GPU_CLOVER_DIRAC
15 
16  template <typename Float, typename Arg> class CloverSigmaOprod : public TunableVectorYZ
17  {
18  Arg &arg;
19  const GaugeField &meta;
20 
21  unsigned int sharedBytesPerThread() const { return 0; }
22  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
23 
24  unsigned int minThreads() const { return arg.length; }
25  bool tuneGridDim() const { return false; }
26 
27  public:
28  CloverSigmaOprod(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2, 6), arg(arg), meta(meta)
29  {
30  writeAuxString("%s,nvector=%d", meta.AuxString(), arg.nvector);
31  // this sets the communications pattern for the packing kernel
32 #ifdef JITIFY
33  create_jitify_program("kernels/clover_sigma_outer_product.cuh");
34 #endif
35  }
36 
37  virtual ~CloverSigmaOprod() {}
38 
39  void apply(const qudaStream_t &stream)
40  {
41  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
42  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
43 #ifdef JITIFY
44  using namespace jitify::reflection;
45  jitify_error = program->kernel("quda::sigmaOprodKernel")
46  .instantiate(arg.nvector, Type<Float>(), Type<Arg>())
47  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
48  .launch(arg);
49 #else
50  switch (arg.nvector) {
51  case 1: qudaLaunchKernel(sigmaOprodKernel<1, Float, Arg>, tp, stream, arg); break;
52  default: errorQuda("Unsupported nvector = %d\n", arg.nvector);
53  }
54 #endif
55  } else { // run the CPU code
56  errorQuda("No CPU support for staggered outer-product calculation\n");
57  }
58  } // apply
59 
60  void preTune() { this->arg.oprod.save(); }
61  void postTune() { this->arg.oprod.load(); }
62 
63  long long flops() const
64  {
65  return (2 * (long long)arg.length) * 6
66  * ((0 + 144 + 18) * arg.nvector + 18); // spin_mu_nu + spin trace + multiply-add
67  }
68  long long bytes() const
69  {
70  return (2 * (long long)arg.length) * 6
71  * ((arg.inA[0].Bytes() + arg.inB[0].Bytes()) * arg.nvector + 2 * arg.oprod.Bytes());
72  }
73 
74  TuneKey tuneKey() const { return TuneKey(meta.VolString(), "CloverSigmaOprod", aux); }
75  }; // CloverSigmaOprod
76 
77  template<typename Float>
78  void computeCloverSigmaOprod(GaugeField& oprod, const std::vector<ColorSpinorField*> &x,
79  const std::vector<ColorSpinorField*> &p, const std::vector<std::vector<double> > &coeff, int nvector)
80  {
81  // Create the arguments
82  CloverSigmaOprodArg<Float, 3> arg(oprod, x, p, coeff, nvector);
83  CloverSigmaOprod<Float, decltype(arg)> sigma_oprod(arg, oprod);
84  sigma_oprod.apply(0);
85  } // computeCloverSigmaOprod
86 
87 #endif // GPU_CLOVER_FORCE
88 
89  void computeCloverSigmaOprod(GaugeField& oprod,
90  std::vector<ColorSpinorField*> &x,
91  std::vector<ColorSpinorField*> &p,
92  std::vector<std::vector<double> > &coeff)
93  {
94 
95 #ifdef GPU_CLOVER_DIRAC
96  if (x.size() > MAX_NVECTOR) {
97  // divide and conquer
98  std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
99  std::vector<ColorSpinorField*> p0(p.begin(), p.begin()+p.size()/2);
100  std::vector<std::vector<double> > coeff0(coeff.begin(), coeff.begin()+coeff.size()/2);
101  for (unsigned int i=0; i<coeff0.size(); i++) {
102  coeff0[i].reserve(2); coeff0[i][0] = coeff[i][0]; coeff0[i][1] = coeff[i][1];
103  }
104  computeCloverSigmaOprod(oprod, x0, p0, coeff0);
105 
106  std::vector<ColorSpinorField*> x1(x.begin()+x.size()/2, x.end());
107  std::vector<ColorSpinorField*> p1(p.begin()+p.size()/2, p.end());
108  std::vector<std::vector<double> > coeff1(coeff.begin()+coeff.size()/2, coeff.end());
109  for (unsigned int i=0; i<coeff1.size(); i++) {
110  coeff1[i].reserve(2); coeff1[i][0] = coeff[coeff.size()/2 + i][0]; coeff1[i][1] = coeff[coeff.size()/2 + i][1];
111  }
112  computeCloverSigmaOprod(oprod, x1, p1, coeff1);
113 
114  return;
115  }
116 
117  if (oprod.Order() != QUDA_FLOAT2_GAUGE_ORDER) errorQuda("Unsupported output ordering: %d\n", oprod.Order());
118 
119  if (checkPrecision(*x[0], *p[0], oprod) == QUDA_DOUBLE_PRECISION) {
120  computeCloverSigmaOprod<double>(oprod, x, p, coeff, x.size());
121  } else {
122  errorQuda("Unsupported precision: %d\n", oprod.Precision());
123  }
124 #else // GPU_CLOVER_DIRAC not defined
125  errorQuda("Clover Dirac operator has not been built!");
126 #endif
127 
128  } // computeCloverForce
129 
130 } // namespace quda