QUDA  0.9.0
clover_sigma_outer_product.cu
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 
4 #include <tune_quda.h>
5 #include <quda_internal.h>
6 #include <gauge_field_order.h>
7 #include <quda_matrix.h>
8 #include <color_spinor.h>
9 #include <dslash_quda.h>
10 
11 namespace quda {
12 
13 #ifdef GPU_CLOVER_DIRAC
14 
15  namespace { // anonymous
16 #include <texture.h>
17  }
18 
19  // This is the maximum number of color spinors we can process in a single kernel
20 #if (CUDA_VERSION < 8000)
21 #define MAX_NVECTOR 1 // multi-vector code doesn't seem to work well with CUDA 7.x
22 #else
23 #define MAX_NVECTOR 9
24 #endif
25 
26  template<typename Float, typename Output, typename InputA, typename InputB>
27  struct CloverSigmaOprodArg {
28  Output oprod;
29  InputA inA[MAX_NVECTOR];
30  InputB inB[MAX_NVECTOR];
31  Float coeff[MAX_NVECTOR][2];
32  unsigned int length;
33  int nvector;
34 
35  CloverSigmaOprodArg(Output &oprod, InputA *inA_, InputB *inB_,
36  const std::vector<std::vector<double> > &coeff_,
37  const GaugeField &meta, int nvector)
38  : oprod(oprod), length(meta.VolumeCB()), nvector(nvector)
39  {
40  for (int i=0; i<nvector; i++) {
41  inA[i] = inA_[i];
42  inB[i] = inB_[i];
43  coeff[i][0] = coeff_[i][0];
44  coeff[i][1] = coeff_[i][1];
45  }
46  }
47  };
48 
49  template <typename real, int nvector, int mu, int nu, int parity, typename Arg>
50  inline __device__ void sigmaOprod(Arg &arg, int idx) {
51  typedef complex<real> Complex;
52  Matrix<Complex,3> result;
53 
54 #pragma unroll
55  for (int i=0; i<nvector; i++) {
56  ColorSpinor<real,3,4> A, B;
57 
58  arg.inA[i].load(static_cast<Complex*>(A.data), idx, parity);
59  arg.inB[i].load(static_cast<Complex*>(B.data), idx, parity);
60 
61  // multiply by sigma_mu_nu
62  ColorSpinor<real,3,4> C = A.sigma(nu,mu);
63  result += arg.coeff[i][parity] * outerProdSpinTrace(C,B);
64  }
65 
66  result -= conj(result);
67 
68  Matrix<Complex,3> temp;
69  arg.oprod.load(reinterpret_cast<real*>(temp.data), idx, (mu-1)*mu/2 + nu, parity);
70  temp = result + temp;
71  arg.oprod.save(reinterpret_cast<real*>(temp.data), idx, (mu-1)*mu/2 + nu, parity);
72  }
73 
74  template<int nvector, typename real, typename Output, typename InputA, typename InputB>
75  __global__ void sigmaOprodKernel(CloverSigmaOprodArg<real, Output, InputA, InputB> arg) {
76  typedef complex<real> Complex;
77  int idx = blockIdx.x*blockDim.x + threadIdx.x;
78  int parity = blockIdx.y*blockDim.y + threadIdx.y;
79  int mu_nu = blockIdx.z*blockDim.z + threadIdx.z;
80 
81  if (idx >= arg.length) return;
82  if (mu_nu >= 6) return;
83 
84  switch(parity) {
85  case 0:
86  switch(mu_nu) {
87  case 0: sigmaOprod<real, nvector, 1, 0, 0>(arg, idx); break;
88  case 1: sigmaOprod<real, nvector, 2, 0, 0>(arg, idx); break;
89  case 2: sigmaOprod<real, nvector, 2, 1, 0>(arg, idx); break;
90  case 3: sigmaOprod<real, nvector, 3, 0, 0>(arg, idx); break;
91  case 4: sigmaOprod<real, nvector, 3, 1, 0>(arg, idx); break;
92  case 5: sigmaOprod<real, nvector, 3, 2, 0>(arg, idx); break;
93  }
94  break;
95  case 1:
96  switch(mu_nu) {
97  case 0: sigmaOprod<real, nvector, 1, 0, 1>(arg, idx); break;
98  case 1: sigmaOprod<real, nvector, 2, 0, 1>(arg, idx); break;
99  case 2: sigmaOprod<real, nvector, 2, 1, 1>(arg, idx); break;
100  case 3: sigmaOprod<real, nvector, 3, 0, 1>(arg, idx); break;
101  case 4: sigmaOprod<real, nvector, 3, 1, 1>(arg, idx); break;
102  case 5: sigmaOprod<real, nvector, 3, 2, 1>(arg, idx); break;
103  }
104  break;
105  }
106 
107  return;
108  } // sigmaOprodKernel
109 
110 
111  template<typename Float, typename Output, typename InputA, typename InputB>
112  class CloverSigmaOprod : public TunableVectorYZ {
113 
114  private:
115  CloverSigmaOprodArg<Float,Output,InputA,InputB> &arg;
116  const GaugeField &meta;
117 
118  unsigned int sharedBytesPerThread() const { return 0; }
119  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
120 
121  unsigned int minThreads() const { return arg.length; }
122  bool tuneGridDim() const { return false; }
123 
124  public:
125  CloverSigmaOprod(CloverSigmaOprodArg<Float,Output,InputA,InputB> &arg, const GaugeField &meta)
126  : TunableVectorYZ(2,6), arg(arg), meta(meta) {
127  writeAuxString("prec=%lu,stride=%d,nvector=%d", sizeof(Float), arg.inA[0].Stride(), arg.nvector);
128  // this sets the communications pattern for the packing kernel
129  }
130 
131  virtual ~CloverSigmaOprod() {}
132 
133  void apply(const cudaStream_t &stream){
134  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
135  TuneParam tp = tuneLaunch(*this,getTuning(),getVerbosity());
136  switch(arg.nvector) {
137  case 1: sigmaOprodKernel< 1><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
138  case 2: sigmaOprodKernel< 2><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
139  case 3: sigmaOprodKernel< 3><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
140  case 4: sigmaOprodKernel< 4><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
141  case 5: sigmaOprodKernel< 5><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
142  case 6: sigmaOprodKernel< 6><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
143  case 7: sigmaOprodKernel< 7><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
144  case 8: sigmaOprodKernel< 8><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
145  case 9: sigmaOprodKernel< 9><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
146  }
147  } else { // run the CPU code
148  errorQuda("No CPU support for staggered outer-product calculation\n");
149  }
150  } // apply
151 
152  void preTune() { this->arg.oprod.save(); }
153  void postTune() { this->arg.oprod.load(); }
154 
155  long long flops() const {
156  return (2*(long long)arg.length)*6*((0 + 144 + 18)*arg.nvector + 18); // spin_mu_nu + spin trace + multiply-add
157  }
158  long long bytes() const {
159  return (2*(long long)arg.length)*6*((arg.inA[0].Bytes() + arg.inB[0].Bytes())*arg.nvector + 2*arg.oprod.Bytes());
160  }
161 
162  TuneKey tuneKey() const {
163  return TuneKey(meta.VolString(), "CloverSigmaOprod", aux);
164  }
165  }; // CloverSigmaOprod
166 
167  template<typename Float, typename Output, typename InputA, typename InputB>
168  void computeCloverSigmaOprod(Output oprod, const GaugeField& out, InputA *inA, InputB *inB,
169  std::vector<std::vector<double> > &coeff, int nvector) {
170  // Create the arguments
171  CloverSigmaOprodArg<Float,Output,InputA,InputB> arg(oprod, inA, inB, coeff, out, nvector);
172  CloverSigmaOprod<Float,Output,InputA,InputB> sigma_oprod(arg, out);
173  sigma_oprod.apply(0);
174  } // computeCloverSigmaOprod
175 
176 #endif // GPU_CLOVER_FORCE
177 
179  std::vector<ColorSpinorField*> &x,
180  std::vector<ColorSpinorField*> &p,
181  std::vector<std::vector<double> > &coeff)
182  {
183 
184 #ifdef GPU_CLOVER_DIRAC
185  if (x.size() > MAX_NVECTOR) {
186  // divide and conquer
187  std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
188  std::vector<ColorSpinorField*> p0(p.begin(), p.begin()+p.size()/2);
189  std::vector<std::vector<double> > coeff0(coeff.begin(), coeff.begin()+coeff.size()/2);
190  for (unsigned int i=0; i<coeff0.size(); i++) {
191  coeff0[i].reserve(2); coeff0[i][0] = coeff[i][0]; coeff0[i][1] = coeff[i][1];
192  }
193  computeCloverSigmaOprod(oprod, x0, p0, coeff0);
194 
195  std::vector<ColorSpinorField*> x1(x.begin()+x.size()/2, x.end());
196  std::vector<ColorSpinorField*> p1(p.begin()+p.size()/2, p.end());
197  std::vector<std::vector<double> > coeff1(coeff.begin()+coeff.size()/2, coeff.end());
198  for (unsigned int i=0; i<coeff1.size(); i++) {
199  coeff1[i].reserve(2); coeff1[i][0] = coeff[coeff.size()/2 + i][0]; coeff1[i][1] = coeff[coeff.size()/2 + i][1];
200  }
201  computeCloverSigmaOprod(oprod, x1, p1, coeff1);
202 
203  return;
204  }
205 
206  if(oprod.Order() != QUDA_FLOAT2_GAUGE_ORDER)
207  errorQuda("Unsupported output ordering: %d\n", oprod.Order());
208 
209  if(x[0]->Precision() != oprod.Precision())
210  errorQuda("Mixed precision not supported: %d %d\n", x[0]->Precision(), oprod.Precision());
211 
212  if(oprod.Precision() == QUDA_DOUBLE_PRECISION){
213 
214  Spinor<double2, double2, 12, 0, 0> spinorA[MAX_NVECTOR];
215  Spinor<double2, double2, 12, 0, 1> spinorB[MAX_NVECTOR];
216 
217  for (unsigned int i=0; i<x.size(); i++) {
218  spinorA[i].set(*dynamic_cast<cudaColorSpinorField*>(x[i]));
219  spinorB[i].set(*dynamic_cast<cudaColorSpinorField*>(p[i]));
220  }
221 
222  computeCloverSigmaOprod<double>(gauge::FloatNOrder<double, 18, 2, 18>(oprod),
223  oprod, spinorA, spinorB, coeff, x.size());
224 
225  } else {
226  errorQuda("Unsupported precision: %d\n", oprod.Precision());
227  }
228 #else // GPU_CLOVER_DIRAC not defined
229  errorQuda("Clover Dirac operator has not been built!");
230 #endif
231 
232  checkCudaError();
233  return;
234  } // computeCloverForce
235 
236 } // namespace quda
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
void set(const cudaColorSpinorField &x)
Definition: texture.h:572
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
__device__ __host__ Matrix< complex< Float >, Nc > outerProdSpinTrace(const ColorSpinor< Float, Nc, Ns > &a, const ColorSpinor< Float, Nc, Ns > &b)
Definition: color_spinor.h:849
static __inline__ size_t p
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43