QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
clover_sigma_outer_product.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 
4 #include <tune_quda.h>
5 #include <gauge_field.h>
6 #include <color_spinor_field.h>
7 #include <dslash_quda.h>
8 
9 #include <jitify_helper.cuh>
11 
12 namespace quda {
13 
14 #ifdef GPU_CLOVER_DIRAC
15 
16  template <typename Float, typename Arg> class CloverSigmaOprod : public TunableVectorYZ
17  {
18 
19 private:
20  Arg &arg;
21  const GaugeField &meta;
22 
23  unsigned int sharedBytesPerThread() const { return 0; }
24  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
25 
26  unsigned int minThreads() const { return arg.length; }
27  bool tuneGridDim() const { return false; }
28 
29  public:
30  CloverSigmaOprod(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2, 6), arg(arg), meta(meta)
31  {
32  writeAuxString("prec=%lu,stride=%d,nvector=%d", sizeof(Float), arg.inA[0].Stride(), arg.nvector);
33  // this sets the communications pattern for the packing kernel
34 #ifdef JITIFY
35  create_jitify_program("kernels/clover_sigma_outer_product.cuh");
36 #endif
37  }
38 
39  virtual ~CloverSigmaOprod() {}
40 
41  void apply(const cudaStream_t &stream)
42  {
43  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
44  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
45 #ifdef JITIFY
46  using namespace jitify::reflection;
47  jitify_error = program->kernel("quda::sigmaOprodKernel")
48  .instantiate(arg.nvector, Type<Float>(), Type<Arg>())
49  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
50  .launch(arg);
51 #else
52  switch (arg.nvector) {
53  case 1: sigmaOprodKernel<1, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
54  case 2: sigmaOprodKernel<2, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
55  case 3: sigmaOprodKernel<3, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
56  case 4: sigmaOprodKernel<4, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
57  case 5: sigmaOprodKernel<5, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
58  case 6: sigmaOprodKernel<6, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
59  case 7: sigmaOprodKernel<7, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
60  case 8: sigmaOprodKernel<8, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
61  case 9: sigmaOprodKernel<9, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg); break;
62  }
63 #endif
64  } else { // run the CPU code
65  errorQuda("No CPU support for staggered outer-product calculation\n");
66  }
67  } // apply
68 
69  void preTune() { this->arg.oprod.save(); }
70  void postTune() { this->arg.oprod.load(); }
71 
72  long long flops() const
73  {
74  return (2 * (long long)arg.length) * 6
75  * ((0 + 144 + 18) * arg.nvector + 18); // spin_mu_nu + spin trace + multiply-add
76  }
77  long long bytes() const
78  {
79  return (2 * (long long)arg.length) * 6
80  * ((arg.inA[0].Bytes() + arg.inB[0].Bytes()) * arg.nvector + 2 * arg.oprod.Bytes());
81  }
82 
83  TuneKey tuneKey() const { return TuneKey(meta.VolString(), "CloverSigmaOprod", aux); }
84  }; // CloverSigmaOprod
85 
86  template<typename Float, typename Output, typename InputA, typename InputB>
87  void computeCloverSigmaOprod(Output oprod, const GaugeField& out, InputA *inA, InputB *inB,
88  std::vector<std::vector<double> > &coeff, int nvector) {
89  // Create the arguments
90  typedef CloverSigmaOprodArg<Float, Output, InputA, InputB> Arg;
91  Arg arg(oprod, inA, inB, coeff, out, nvector);
92  CloverSigmaOprod<Float, Arg> sigma_oprod(arg, out);
93  sigma_oprod.apply(0);
94  } // computeCloverSigmaOprod
95 
96 #endif // GPU_CLOVER_FORCE
97 
99  std::vector<ColorSpinorField*> &x,
100  std::vector<ColorSpinorField*> &p,
101  std::vector<std::vector<double> > &coeff)
102  {
103 
104 #ifdef GPU_CLOVER_DIRAC
105  if (x.size() > MAX_NVECTOR) {
106  // divide and conquer
107  std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
108  std::vector<ColorSpinorField*> p0(p.begin(), p.begin()+p.size()/2);
109  std::vector<std::vector<double> > coeff0(coeff.begin(), coeff.begin()+coeff.size()/2);
110  for (unsigned int i=0; i<coeff0.size(); i++) {
111  coeff0[i].reserve(2); coeff0[i][0] = coeff[i][0]; coeff0[i][1] = coeff[i][1];
112  }
113  computeCloverSigmaOprod(oprod, x0, p0, coeff0);
114 
115  std::vector<ColorSpinorField*> x1(x.begin()+x.size()/2, x.end());
116  std::vector<ColorSpinorField*> p1(p.begin()+p.size()/2, p.end());
117  std::vector<std::vector<double> > coeff1(coeff.begin()+coeff.size()/2, coeff.end());
118  for (unsigned int i=0; i<coeff1.size(); i++) {
119  coeff1[i].reserve(2); coeff1[i][0] = coeff[coeff.size()/2 + i][0]; coeff1[i][1] = coeff[coeff.size()/2 + i][1];
120  }
121  computeCloverSigmaOprod(oprod, x1, p1, coeff1);
122 
123  return;
124  }
125 
126  if (oprod.Order() != QUDA_FLOAT2_GAUGE_ORDER) errorQuda("Unsupported output ordering: %d\n", oprod.Order());
127 
128  if(x[0]->Precision() != oprod.Precision())
129  errorQuda("Mixed precision not supported: %d %d\n", x[0]->Precision(), oprod.Precision());
130 
131  if(oprod.Precision() == QUDA_DOUBLE_PRECISION){
132 
135 
136  for (unsigned int i=0; i<x.size(); i++) {
137  spinorA[i].set(*dynamic_cast<cudaColorSpinorField*>(x[i]));
138  spinorB[i].set(*dynamic_cast<cudaColorSpinorField*>(p[i]));
139  }
140 
141  computeCloverSigmaOprod<double>(gauge::FloatNOrder<double, 18, 2, 18>(oprod),
142  oprod, spinorA, spinorB, coeff, x.size());
143 
144  } else {
145  errorQuda("Unsupported precision: %d\n", oprod.Precision());
146  }
147 #else // GPU_CLOVER_DIRAC not defined
148  errorQuda("Clover Dirac operator has not been built!");
149 #endif
150 
151  checkCudaError();
152  return;
153  } // computeCloverForce
154 
155 } // namespace quda
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
Helper file when using jitify run-time compilation. This file should be included in source code...
cudaStream_t * stream
void set(const cudaColorSpinorField &x)
Definition: texture.h:321
#define MAX_NVECTOR
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:251
#define checkCudaError()
Definition: util_quda.h:161
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const
unsigned long long bytes
Definition: blas_quda.cu:23