QUDA  0.9.0
blas_cpu.cpp
Go to the documentation of this file.
1 #include <color_spinor_field.h>
2 #include <blas_quda.h>
3 
4 namespace quda {
5 
6  namespace blas {
7 
8  template <typename Float>
9  void axpby(const Float &a, const Float *x, const Float &b, Float *y, const int N) {
10  for (int i=0; i<N; i++) y[i] = a*x[i] + b*y[i];
11  }
12 
13  void axpbyCpu(const double &a, const cpuColorSpinorField &x,
14  const double &b, cpuColorSpinorField &y) {
15  if (x.Precision() == QUDA_DOUBLE_PRECISION)
16  axpby(a, (double*)x.V(), b, (double*)y.V(), x.Length());
17  else if (x.Precision() == QUDA_SINGLE_PRECISION)
18  axpby((float)a, (float*)x.V(), (float)b, (float*)y.V(), x.Length());
19  else
20  errorQuda("Precision type %d not implemented", x.Precision());
21  }
22 
24  if (x.Precision() == QUDA_DOUBLE_PRECISION)
25  axpby(1.0, (double*)x.V(), 1.0, (double*)y.V(), x.Length());
26  else if (x.Precision() == QUDA_SINGLE_PRECISION)
27  axpby(1.0f, (float*)x.V(), 1.0f, (float*)y.V(), x.Length());
28  else
29  errorQuda("Precision type %d not implemented", x.Precision());
30  }
31 
32  void axpyCpu(const double &a, const cpuColorSpinorField &x,
34  if (x.Precision() == QUDA_DOUBLE_PRECISION)
35  axpby(a, (double*)x.V(), 1.0, (double*)y.V(), x.Length());
36  else if (x.Precision() == QUDA_SINGLE_PRECISION)
37  axpby((float)a, (float*)x.V(), 1.0f, (float*)y.V(), x.Length());
38  else
39  errorQuda("Precision type %d not implemented", x.Precision());
40  }
41 
42  void xpayCpu(const cpuColorSpinorField &x, const double &a,
44  if (x.Precision() == QUDA_DOUBLE_PRECISION)
45  axpby(1.0, (double*)x.V(), a, (double*)y.V(), x.Length());
46  else if (x.Precision() == QUDA_SINGLE_PRECISION)
47  axpby(1.0f, (float*)x.V(), (float)a, (float*)y.V(), x.Length());
48  else
49  errorQuda("Precision type %d not implemented", x.Precision());
50  }
51 
53  if (x.Precision() == QUDA_DOUBLE_PRECISION)
54  axpby(-1.0, (double*)x.V(), 1.0, (double*)y.V(), x.Length());
55  else if (x.Precision() == QUDA_SINGLE_PRECISION)
56  axpby(-1.0f, (float*)x.V(), 1.0f, (float*)y.V(), x.Length());
57  else
58  errorQuda("Precision type %d not implemented", x.Precision());
59  }
60 
61  void axCpu(const double &a, cpuColorSpinorField &x) {
62  if (x.Precision() == QUDA_DOUBLE_PRECISION)
63  axpby(0.0, (double*)x.V(), a, (double*)x.V(), x.Length());
64  else if (x.Precision() == QUDA_SINGLE_PRECISION)
65  axpby(0.0f, (float*)x.V(), (float)a, (float*)x.V(), x.Length());
66  else
67  errorQuda("Precision type %d not implemented", x.Precision());
68  }
69 
70  template <typename Float>
71  void caxpby(const std::complex<Float> &a, const std::complex<Float> *x,
72  const std::complex<Float> &b, std::complex<Float> *y, int N) {
73 
74  for (int i=0; i<N; i++) {
75  y[i] = a*x[i] + b*y[i];
76  }
77 
78  }
79 
80  void caxpyCpu(const Complex &a, const cpuColorSpinorField &x,
82 
83  if ( x.Precision() == QUDA_DOUBLE_PRECISION)
84  caxpby(a, (Complex*)x.V(), Complex(1.0),
85  (Complex*)y.V(), x.Length()/2);
86  else if (x.Precision() == QUDA_SINGLE_PRECISION)
87  caxpby((std::complex<float>)a, (std::complex<float>*)x.V(), std::complex<float>(1.0),
88  (std::complex<float>*)y.V(), x.Length()/2);
89  else
90  errorQuda("Precision type %d not implemented", x.Precision());
91  }
92 
93  void caxpbyCpu(const Complex &a, const cpuColorSpinorField &x,
94  const Complex &b, cpuColorSpinorField &y) {
95 
96  if ( x.Precision() == QUDA_DOUBLE_PRECISION)
97  caxpby(a, (Complex*)x.V(), b, (Complex*)y.V(), x.Length()/2);
98  else if (x.Precision() == QUDA_SINGLE_PRECISION)
99  caxpby((std::complex<float>)a, (std::complex<float>*)x.V(), (std::complex<float>)b,
100  (std::complex<float>*)y.V(), x.Length()/2);
101  else
102  errorQuda("Precision type %d not implemented", x.Precision());
103  }
104 
105  template <typename Float>
106  void caxpbypcz(const std::complex<Float> &a, const std::complex<Float> *x,
107  const std::complex<Float> &b, const std::complex<Float> *y,
108  const std::complex<Float> &c, std::complex<Float> *z, int N) {
109 
110  for (int i=0; i<N; i++) {
111  z[i] = a*x[i] + b*y[i] + c*z[i];
112  }
113 
114  }
115 
116  void cxpaypbzCpu(const cpuColorSpinorField &x, const Complex &a,
117  const cpuColorSpinorField &y, const Complex &b,
119 
120  if (x.Precision() == QUDA_DOUBLE_PRECISION)
121  caxpbypcz(Complex(1, 0), (Complex*)x.V(), a, (Complex*)y.V(),
122  b, (Complex*)z.V(), x.Length()/2);
123  else if (x.Precision() == QUDA_SINGLE_PRECISION)
124  caxpbypcz(std::complex<float>(1, 0), (std::complex<float>*)x.V(), (std::complex<float>)a, (std::complex<float>*)y.V(),
125  (std::complex<float>)b, (std::complex<float>*)z.V(), x.Length()/2);
126  else
127  errorQuda("Precision type %d not implemented", x.Precision());
128  }
129 
131  const double &b, const cpuColorSpinorField& z, const double &c) {
132  axpyCpu(a, x, y);
133  axpbyCpu(b, z, c, x);
134  }
135 
136  // performs the operations: {y[i] = a*x[i] + y[i]; x[i] = z[i] + b*x[i]}
138  const cpuColorSpinorField &z, const double &b) {
139  axpyCpu(a, x, y);
140  xpayCpu(z, b, x);
141  }
142 
143  // performs the operation z[i] = a*x[i] + b*y[i] + z[i] and y[i] -= b*w[i]
144  void caxpbypzYmbwCpu(const Complex &a, const cpuColorSpinorField &x, const Complex &b,
146 
147  if (x.Precision() == QUDA_DOUBLE_PRECISION)
148  caxpbypcz(a, (Complex*)x.V(), b, (Complex*)y.V(),
149  Complex(1, 0), (Complex*)z.V(), x.Length()/2);
150  else if (x.Precision() == QUDA_SINGLE_PRECISION)
151  caxpbypcz((std::complex<float>)a, (std::complex<float>*)x.V(),
152  (std::complex<float>)b, (std::complex<float>*)y.V(),
153  (std::complex<float>)(1.0f), (std::complex<float>*)z.V(), x.Length()/2);
154  else
155  errorQuda("Precision type %d not implemented", x.Precision());
156 
157  caxpyCpu(-b, w, y);
158  }
159 
160  template <typename Float>
161  double norm(const Float *a, const int N) {
162  double norm2 = 0;
163  for (int i=0; i<N; i++) norm2 += a[i]*a[i];
164  return norm2;
165  }
166 
167  double normCpu(const cpuColorSpinorField &a) {
168  double norm2 = 0.0;
169  if (a.Precision() == QUDA_DOUBLE_PRECISION)
170  norm2 = norm((double*)a.V(), a.Length());
171  else if (a.Precision() == QUDA_SINGLE_PRECISION)
172  norm2 = norm((float*)a.V(), a.Length());
173  else
174  errorQuda("Precision type %d not implemented", a.Precision());
176  return norm2;
177  }
178 
179  double axpyNormCpu(const double &a, const cpuColorSpinorField &x,
181  axpyCpu(a, x, y);
182  return normCpu(y);
183  }
184 
185  template <typename Float>
186  double reDotProduct(const Float *a, const Float *b, const int N) {
187  double dot = 0;
188  for (int i=0; i<N; i++) dot += a[i]*b[i];
189  return dot;
190  }
191 
193  double dot = 0.0;
194  if (a.Precision() == QUDA_DOUBLE_PRECISION)
195  dot = reDotProduct((double*)a.V(), (double*)b.V(), a.Length());
196  else if (a.Precision() == QUDA_SINGLE_PRECISION)
197  dot = reDotProduct((float*)a.V(), (float*)b.V(), a.Length());
198  else
199  errorQuda("Precision type %d not implemented", a.Precision());
200  reduceDouble(dot);
201  return dot;
202  }
203 
204  // First performs the operation y[i] = x[i] - y[i]
205  // Second returns the norm of y
207  xpayCpu(x, -1, y);
208  return normCpu(y);
209  }
210 
211  template <typename Float>
212  Complex cDotProduct(const std::complex<Float> *a, const std::complex<Float> *b, const int N) {
213  quda::Complex dot = 0;
214  for (int i=0; i<N; i++) dot += conj(a[i])*b[i];
215  return dot;
216  }
217 
219  Complex dot = 0.0;
220  if (a.Precision() == QUDA_DOUBLE_PRECISION)
221  dot = cDotProduct((Complex*)a.V(), (Complex*)b.V(), a.Length()/2);
222  else if (a.Precision() == QUDA_SINGLE_PRECISION)
223  dot = cDotProduct((std::complex<float>*)a.V(), (std::complex<float>*)b.V(), a.Length()/2);
224  else
225  errorQuda("Precision type %d not implemented", a.Precision());
226  reduceDoubleArray((double*)&dot, 2);
227  return dot;
228  }
229 
230  // First performs the operation y = x + a*y
231  // Second returns complex dot product (z,y)
232  Complex xpaycDotzyCpu(const cpuColorSpinorField &x, const double &a,
234  xpayCpu(x, a, y);
235  return cDotProductCpu(z,y);
236  }
237 
240  double norm = normCpu(a);
241  return make_double3(real(dot), imag(dot), norm);
242  }
243 
246  double norm = normCpu(b);
247  return make_double3(real(dot), imag(dot), norm);
248  }
249 
250  // This convoluted kernel does the following: z += a*x + b*y, y -= b*w, norm = (y,y), dot = (u, y)
252  const Complex &b, cpuColorSpinorField &y,
254  const cpuColorSpinorField &u) {
255 
256  caxpbypzYmbwCpu(a, x, b, y, z, w);
257  return cDotProductNormBCpu(u, y);
258  }
259 
260  void cabxpyAxCpu(const double &a, const Complex &b, cpuColorSpinorField &x, cpuColorSpinorField &y) {
261  axCpu(a, x);
262  caxpyCpu(b, x, y);
263  }
264 
267  caxpyCpu(a, x, y);
268  return norm2(y);
269  }
270 
273  caxpyCpu(a, x, y);
274  caxpyCpu(-a, z, x);
275  return norm2(x);
276  }
277 
280  caxpyCpu(a, x, y);
281  caxpyCpu(-a, z, x);
282  }
283 
284  double cabxpyAxNormCpu(const double &a, const Complex &b, cpuColorSpinorField &x, cpuColorSpinorField &y) {
285  axCpu(a, x);
286  caxpyCpu(b, x, y);
287  return norm2(y);
288  }
289 
292  caxpyCpu(a, x, z);
293  caxpyCpu(b, y, z);
294  }
295 
298  caxpyCpu(a, x, w);
299  caxpyCpu(b, y, w);
300  caxpyCpu(c, z, w);
301 
302  }
303 
306  caxpyCpu(a, x, y);
307  return cDotProductCpu(z, y);
308  }
309 
310  template <typename Float>
311  double3 HeavyQuarkResidualNorm(const Float *x, const Float *r, const int volume, const int Nint) {
312 
313  double3 sum = make_double3(0.0, 0.0, 0.0);
314  for (int i = 0; i<volume; i++) {
315  double x2 = 0;
316  double r2 = 0;
317 
318  for (int j=0; j<Nint; j++) { // loop over internal degrees of freedom
319  int k = i*Nint + j;
320  x2 += x[k]*x[k];
321  r2 += r[k]*r[k];
322  }
323 
324  sum.x += x2;
325  sum.y += r2;
326  sum.z += (x2 > 0.0) ? (r2 / x2) : 1.0;
327  }
328  return sum;
329  }
330 
331 
333  double3 rtn;
334  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
335  rtn = HeavyQuarkResidualNorm<double>((const double*)(x.V()), (const double*)(r.V()),
336  x.Volume(), 2*x.Ncolor()*x.Nspin());
337  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
338  rtn = HeavyQuarkResidualNorm<float>((const float*)(x.V()), (const float*)(r.V()),
339  x.Volume(), 2*x.Ncolor()*x.Nspin());
340  } else {
341  errorQuda("Precision type %d not implemented", x.Precision());
342  }
343 #ifdef MULTI_GPU
344  rtn.z /= (x.Volume()*comm_size());
345 #else
346  rtn.z /= x.Volume();
347 #endif
348  reduceDoubleArray((double*)&rtn, 3);
349 
350  return rtn;
351  }
352 
355  xpyCpu(y, tmp);
356  return HeavyQuarkResidualNormCpu(tmp, r);
357  }
358 
359  } // namespace blas
360 } // namespace quda
361 
double xmyNormCpu(const cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:206
void mxpyCpu(const cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:52
Complex xpaycDotzyCpu(const cpuColorSpinorField &x, const double &a, cpuColorSpinorField &y, const cpuColorSpinorField &z)
Definition: blas_cpu.cpp:232
void xpayCpu(const cpuColorSpinorField &x, const double &a, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:42
void caxpbypzCpu(const Complex &a, cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y, cpuColorSpinorField &z)
Definition: blas_cpu.cpp:290
Complex cDotProductCpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
Definition: blas_cpu.cpp:218
double axpyNormCpu(const double &a, const cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:179
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
void xpyCpu(const cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:23
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:500
std::complex< double > Complex
Definition: eig_variables.h:13
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
void axpyBzpcxCpu(const double &a, cpuColorSpinorField &x, cpuColorSpinorField &y, const double &b, const cpuColorSpinorField &z, const double &c)
Definition: blas_cpu.cpp:130
void reduceDoubleArray(double *, const int len)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:277
#define b
double caxpyXmazNormXCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y, cpuColorSpinorField &z)
Definition: blas_cpu.cpp:271
double cabxpyAxNormCpu(const double &a, const Complex &b, cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:284
void axpbyCpu(const double &a, const cpuColorSpinorField &x, const double &b, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:13
int comm_size(void)
Definition: comm_mpi.cpp:126
void axCpu(const double &a, cpuColorSpinorField &x)
Definition: blas_cpu.cpp:61
__host__ __device__ void sum(double &a, double &b)
void caxpyCpu(const Complex &a, const cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:80
int int int w
double3 cDotProductNormACpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
Definition: blas_cpu.cpp:238
int int int enum cudaChannelFormatKind f
void axpyZpbxCpu(const double &a, cpuColorSpinorField &x, cpuColorSpinorField &y, const cpuColorSpinorField &z, const double &b)
Definition: blas_cpu.cpp:137
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:703
void caxpbypcz(const std::complex< Float > &a, const std::complex< Float > *x, const std::complex< Float > &b, const std::complex< Float > *y, const std::complex< Float > &c, std::complex< Float > *z, int N)
Definition: blas_cpu.cpp:106
double norm(const Float *a, const int N)
Definition: blas_cpu.cpp:161
void caxpbyCpu(const Complex &a, const cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:93
double3 HeavyQuarkResidualNormCpu(cpuColorSpinorField &x, cpuColorSpinorField &r)
Definition: blas_cpu.cpp:332
void caxpyXmazCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y, cpuColorSpinorField &z)
Definition: blas_cpu.cpp:278
void axpby(const double &a, ColorSpinorField &x, const double &b, ColorSpinorField &y)
Definition: blas_quda.cu:106
Complex caxpyDotzyCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y, cpuColorSpinorField &z)
Definition: blas_cpu.cpp:304
double reDotProductCpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
Definition: blas_cpu.cpp:192
void cxpaypbzCpu(const cpuColorSpinorField &x, const Complex &a, const cpuColorSpinorField &y, const Complex &b, cpuColorSpinorField &z)
Definition: blas_cpu.cpp:116
double normCpu(const cpuColorSpinorField &a)
Definition: blas_cpu.cpp:167
void caxpbypczpwCpu(const Complex &a, cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y, const Complex &c, cpuColorSpinorField &z, cpuColorSpinorField &w)
Definition: blas_cpu.cpp:296
void caxpbypzYmbwCpu(const Complex &a, const cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y, cpuColorSpinorField &z, const cpuColorSpinorField &w)
Definition: blas_cpu.cpp:144
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
Definition: blas_quda.cu:292
double caxpyNormCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:265
const void * c
void cabxpyAxCpu(const double &a, const Complex &b, cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:260
void axpyCpu(const double &a, const cpuColorSpinorField &x, cpuColorSpinorField &y)
Definition: blas_cpu.cpp:32
void reduceDouble(double &)
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
double3 caxpbypzYmbwcDotProductUYNormYCpu(const Complex &a, const cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y, cpuColorSpinorField &z, const cpuColorSpinorField &w, const cpuColorSpinorField &u)
Definition: blas_cpu.cpp:251
#define a
double3 cDotProductNormBCpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
Definition: blas_cpu.cpp:244
static void dot(sFloat *res, gFloat *a, sFloat *b)
Definition: dslash_util.h:56