QUDA v1.1.0
A library for QCD on GPUs
blas_reference.cpp
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <complex>
#include <inttypes.h>

#include <util_quda.h>
#include <host_utils.h>
#include <command_line_params.h>
#include "misc.h"

using namespace std;

#include <Eigen/Dense>
using namespace Eigen;

// Copy a rows x cols matrix from a flat host array into an Eigen matrix.
// The array is read row by row starting at element `offset`, with leading
// dimension `ld`, so (ld - cols) elements are skipped at the end of each row.
void fillEigenArray(MatrixXcd &EigenArr, complex<double> *arr, int rows, int cols, int ld, int offset)
{
  int counter = offset;
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      EigenArr(i, j) = arr[counter];
      counter++;
    }
    counter += (ld - cols);
  }
}
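
// Worked illustration (added note, not in the original file): with rows = 2,
// cols = 2, ld = 3 and offset = 1, fillEigenArray reads
//   arr[1] -> (0,0), arr[2] -> (0,1), skip arr[3],
//   arr[4] -> (1,0), arr[5] -> (1,1),
// i.e. the (ld - cols) skipped entries per row are leading-dimension padding.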

double blasGEMMEigenVerify(void *A_data, void *B_data, void *C_data_copy, void *C_data, uint64_t refA_size,
                           uint64_t refB_size, uint64_t refC_size, QudaBLASParam *blas_param)
{
  // Sanity checks on parameters
  //-------------------------------------------------------------------------
  // If the user passes a non-positive M, N, or K, we error out
  int min_dim = std::min(blas_param->m, std::min(blas_param->n, blas_param->k));
  if (min_dim <= 0) {
    errorQuda("BLAS dims must be positive: m=%d, n=%d, k=%d", blas_param->m, blas_param->n, blas_param->k);
  }

  // If the user passes a negative stride, we error out as this has no meaning.
  int min_stride = std::min(std::min(blas_param->a_stride, blas_param->b_stride), blas_param->c_stride);
  if (min_stride < 0) {
    errorQuda("BLAS strides must be positive or zero: a_stride=%d, b_stride=%d, c_stride=%d", blas_param->a_stride,
              blas_param->b_stride, blas_param->c_stride);
  }

  // If the user passes a negative offset, we error out as this has no meaning.
  int min_offset = std::min(std::min(blas_param->a_offset, blas_param->b_offset), blas_param->c_offset);
  if (min_offset < 0) {
    errorQuda("BLAS offsets must be positive or zero: a_offset=%d, b_offset=%d, c_offset=%d", blas_param->a_offset,
              blas_param->b_offset, blas_param->c_offset);
  }

  // If the batch value is non-positive, we error out
  if (blas_param->batch_count <= 0) { errorQuda("Batches must be positive: batches=%d", blas_param->batch_count); }

  // Leading dims are dependent on the matrix op type.
  if (blas_param->data_order == QUDA_BLAS_DATAORDER_COL) {
    if (blas_param->trans_a == QUDA_BLAS_OP_N) {
      if (blas_param->lda < std::max(1, blas_param->m))
        errorQuda("lda=%d must be >= max(1,m=%d)", blas_param->lda, blas_param->m);
    } else {
      if (blas_param->lda < std::max(1, blas_param->k))
        errorQuda("lda=%d must be >= max(1,k=%d)", blas_param->lda, blas_param->k);
    }

    if (blas_param->trans_b == QUDA_BLAS_OP_N) {
      if (blas_param->ldb < std::max(1, blas_param->k))
        errorQuda("ldb=%d must be >= max(1,k=%d)", blas_param->ldb, blas_param->k);
    } else {
      if (blas_param->ldb < std::max(1, blas_param->n))
        errorQuda("ldb=%d must be >= max(1,n=%d)", blas_param->ldb, blas_param->n);
    }
    if (blas_param->ldc < std::max(1, blas_param->m))
      errorQuda("ldc=%d must be >= max(1,m=%d)", blas_param->ldc, blas_param->m);
  } else {
    if (blas_param->trans_a == QUDA_BLAS_OP_N) {
      if (blas_param->lda < std::max(1, blas_param->k))
        errorQuda("lda=%d must be >= max(1,k=%d)", blas_param->lda, blas_param->k);
    } else {
      if (blas_param->lda < std::max(1, blas_param->m))
        errorQuda("lda=%d must be >= max(1,m=%d)", blas_param->lda, blas_param->m);
    }
    if (blas_param->trans_b == QUDA_BLAS_OP_N) {
      if (blas_param->ldb < std::max(1, blas_param->n))
        errorQuda("ldb=%d must be >= max(1,n=%d)", blas_param->ldb, blas_param->n);
    } else {
      if (blas_param->ldb < std::max(1, blas_param->k))
        errorQuda("ldb=%d must be >= max(1,k=%d)", blas_param->ldb, blas_param->k);
    }
    if (blas_param->ldc < std::max(1, blas_param->n))
      errorQuda("ldc=%d must be >= max(1,n=%d)", blas_param->ldc, blas_param->n);
  }
  //-------------------------------------------------------------------------

  // Parse parameters for Eigen
  //-------------------------------------------------------------------------
  // Swap A and B if the data is in column-major order
  if (blas_param->data_order == QUDA_BLAS_DATAORDER_COL) {
    std::swap(blas_param->m, blas_param->n);
    std::swap(blas_param->lda, blas_param->ldb);
    std::swap(blas_param->trans_a, blas_param->trans_b);
    std::swap(blas_param->a_offset, blas_param->b_offset);
    std::swap(blas_param->a_stride, blas_param->b_stride);
    std::swap(A_data, B_data);
  }

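  // Added note (not in the original file): the Eigen matrices below are filled
  // row by row, so column-major input read this way appears transposed. By the
  // identity (A * B)^T = B^T * A^T, swapping A with B (and m with n, lda with
  // ldb, etc.) lets the row-major computation produce the column-major result.
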
  // Problem parameters
  int m = blas_param->m;
  int n = blas_param->n;
  int k = blas_param->k;

  int lda = blas_param->lda;
  int ldb = blas_param->ldb;
  int ldc = blas_param->ldc;

  int a_stride = blas_param->a_stride;
  int b_stride = blas_param->b_stride;
  int c_stride = blas_param->c_stride;

  int a_offset = blas_param->a_offset;
  int b_offset = blas_param->b_offset;
  int c_offset = blas_param->c_offset;

  int batches = blas_param->batch_count;

  complex<double> alpha = blas_param->alpha;
  complex<double> beta = blas_param->beta;

  // Eigen objects to store data
  MatrixXcd A = MatrixXcd::Zero(m, k);
  MatrixXcd B = MatrixXcd::Zero(k, n);
  MatrixXcd C_eigen = MatrixXcd::Zero(m, n);
  MatrixXcd C_gpu = MatrixXcd::Zero(m, n);
  MatrixXcd C_resid = MatrixXcd::Zero(m, n);

  // Pointers to data
  complex<double> *A_ptr = (complex<double> *)A_data;
  complex<double> *B_ptr = (complex<double> *)B_data;
  complex<double> *C_ptr = (complex<double> *)C_data;
  complex<double> *Ccopy_ptr = (complex<double> *)C_data_copy;

  // Get maximum stride length to deduce the number of batches in the
  // computation
  int max_stride = std::max(std::max(a_stride, b_stride), c_stride);

  // If the user gives strides of 0 for all arrays, we are essentially performing
  // a GEMM on the first matrices in the array N_{batch} times.
  // Give them what they ask for, YMMV...
  // If the strides have not been set, we use a stride of 1.
  if (max_stride <= 0) max_stride = 1;

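  // Added note (not in the original file): the loop below steps `batch` by
  // max_stride, while each array's offset advances by ref*_size times its own
  // stride, so an array with stride 0 reuses its first matrix in every pass.
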
  printfQuda("Computing Eigen matrix operation a * A_{%lu,%lu} * B_{%lu,%lu} + b * C_{%lu,%lu} = C_{%lu,%lu}\n",
             A.rows(), A.cols(), B.rows(), B.cols(), C_eigen.rows(), C_eigen.cols(), C_eigen.rows(), C_eigen.cols());

  double max_relative_deviation = 0.0;
  for (int batch = 0; batch < batches; batch += max_stride) {

    // Populate Eigen objects
    fillEigenArray(A, A_ptr, m, k, lda, a_offset);
    fillEigenArray(B, B_ptr, k, n, ldb, b_offset);
    fillEigenArray(C_eigen, Ccopy_ptr, m, n, ldc, c_offset);
    fillEigenArray(C_gpu, C_ptr, m, n, ldc, c_offset);

    // Apply op(A) and op(B)
    switch (blas_param->trans_a) {
    case QUDA_BLAS_OP_T: A.transposeInPlace(); break;
    case QUDA_BLAS_OP_C: A.adjointInPlace(); break;
    case QUDA_BLAS_OP_N: break;
    default: errorQuda("Unknown blas op type %d", blas_param->trans_a);
    }

    switch (blas_param->trans_b) {
    case QUDA_BLAS_OP_T: B.transposeInPlace(); break;
    case QUDA_BLAS_OP_C: B.adjointInPlace(); break;
    case QUDA_BLAS_OP_N: break;
    default: errorQuda("Unknown blas op type %d", blas_param->trans_b);
    }

    // Perform GEMM using Eigen
    C_eigen = alpha * A * B + beta * C_eigen;

    // Check Eigen result against blas
    C_resid = C_gpu - C_eigen;
    double deviation = C_resid.norm();
    double relative_deviation = deviation / C_eigen.norm();
    max_relative_deviation = std::max(max_relative_deviation, relative_deviation);

    printfQuda("batch %d: (C_host - C_gpu) Frobenius norm = %e. Relative deviation = %e\n", batch, deviation,
               relative_deviation);

    a_offset += refA_size * a_stride;
    b_offset += refB_size * b_stride;
    c_offset += refC_size * c_stride;
  }

  // Restore the blas parameters to their original values
  if (blas_param->data_order == QUDA_BLAS_DATAORDER_COL) {
    std::swap(blas_param->m, blas_param->n);
    std::swap(blas_param->lda, blas_param->ldb);
    std::swap(blas_param->trans_a, blas_param->trans_b);
    std::swap(blas_param->a_offset, blas_param->b_offset);
    std::swap(blas_param->a_stride, blas_param->b_stride);
    std::swap(A_data, B_data);
  }

  return max_relative_deviation;
}

double blasGEMMQudaVerify(void *arrayA, void *arrayB, void *arrayC, void *arrayCcopy, uint64_t refA_size,
                          uint64_t refB_size, uint64_t refC_size, QudaBLASParam *blas_param)
{
  // Reference data is always in complex double
  size_t data_size = sizeof(double);
  int re_im = 2;
  data_size *= re_im;

  // If the user passes non-zero offsets, add one extra
  // matrix to the test data.
  int batches_extra = 0;
  if (blas_param->a_offset + blas_param->b_offset + blas_param->c_offset > 0) { batches_extra++; }
  int batches = blas_param->batch_count + batches_extra;

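  // Added note (not in the original file): a non-zero offset shifts every read
  // deeper into the source arrays, so one extra matrix of storage is allocated
  // to keep the offset reads within bounds.
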
  // Copy data from the problem-sized arrays to the reference-sized arrays.
  // Include A and B to ensure no data corruption occurred.
  void *checkA = pinned_malloc(refA_size * data_size * batches);
  void *checkB = pinned_malloc(refB_size * data_size * batches);
  void *checkC = pinned_malloc(refC_size * data_size * batches);
  void *checkCcopy = pinned_malloc(refC_size * data_size * batches);

  memset(checkA, 0, batches * refA_size * data_size);
  memset(checkB, 0, batches * refB_size * data_size);
  memset(checkC, 0, batches * refC_size * data_size);
  memset(checkCcopy, 0, batches * refC_size * data_size);

  switch (blas_param->data_type) {
  case QUDA_BLAS_DATATYPE_S: // single-precision real: promote to complex double, imaginary parts stay zero
    for (uint64_t i = 0; i < 2 * refA_size * batches; i += 2) { ((double *)checkA)[i] = ((float *)arrayA)[i / 2]; }
    for (uint64_t i = 0; i < 2 * refB_size * batches; i += 2) { ((double *)checkB)[i] = ((float *)arrayB)[i / 2]; }
    for (uint64_t i = 0; i < 2 * refC_size * batches; i += 2) {
      ((double *)checkC)[i] = ((float *)arrayC)[i / 2];
      ((double *)checkCcopy)[i] = ((float *)arrayCcopy)[i / 2];
    }
    break;
  case QUDA_BLAS_DATATYPE_D: // double-precision real: only the real (even) elements are written
    for (uint64_t i = 0; i < 2 * refA_size * batches; i += 2) { ((double *)checkA)[i] = ((double *)arrayA)[i / 2]; }
    for (uint64_t i = 0; i < 2 * refB_size * batches; i += 2) { ((double *)checkB)[i] = ((double *)arrayB)[i / 2]; }
    for (uint64_t i = 0; i < 2 * refC_size * batches; i += 2) {
      ((double *)checkC)[i] = ((double *)arrayC)[i / 2];
      ((double *)checkCcopy)[i] = ((double *)arrayCcopy)[i / 2];
    }
    break;
  case QUDA_BLAS_DATATYPE_C: // single-precision complex: copy every element
    for (uint64_t i = 0; i < 2 * refA_size * batches; i++) { ((double *)checkA)[i] = ((float *)arrayA)[i]; }
    for (uint64_t i = 0; i < 2 * refB_size * batches; i++) { ((double *)checkB)[i] = ((float *)arrayB)[i]; }
    for (uint64_t i = 0; i < 2 * refC_size * batches; i++) {
      ((double *)checkC)[i] = ((float *)arrayC)[i];
      ((double *)checkCcopy)[i] = ((float *)arrayCcopy)[i];
    }
    break;
  case QUDA_BLAS_DATATYPE_Z: // double-precision complex: straight copy
    for (uint64_t i = 0; i < 2 * refA_size * batches; i++) { ((double *)checkA)[i] = ((double *)arrayA)[i]; }
    for (uint64_t i = 0; i < 2 * refB_size * batches; i++) { ((double *)checkB)[i] = ((double *)arrayB)[i]; }
    for (uint64_t i = 0; i < 2 * refC_size * batches; i++) {
      ((double *)checkC)[i] = ((double *)arrayC)[i];
      ((double *)checkCcopy)[i] = ((double *)arrayCcopy)[i];
    }
    break;
  default: errorQuda("Unrecognised data type %d\n", blas_param->data_type);
  }

  auto deviation = blasGEMMEigenVerify(checkA, checkB, checkCcopy, checkC, refA_size, refB_size, refC_size, blas_param);

  host_free(checkA);
  host_free(checkB);
  host_free(checkC);
  host_free(checkCcopy);

  return deviation;
}
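
To make the verification pattern above concrete, here is a minimal, self-contained sketch (an illustration only; it is not part of blas_reference.cpp or the QUDA API, and every name in it is local to the example). It forms the reference result alpha * A * B + beta * C with Eigen and reports the same relative Frobenius-norm deviation that blasGEMMEigenVerify computes, using a slightly perturbed copy as a stand-in for the GPU result.

#include <complex>
#include <cstdio>
#include <Eigen/Dense>

int main()
{
  using Eigen::MatrixXcd;
  const int m = 4, n = 3, k = 5;
  const std::complex<double> alpha(2.0, 1.0), beta(0.5, 0.0);

  MatrixXcd A = MatrixXcd::Random(m, k);
  MatrixXcd B = MatrixXcd::Random(k, n);
  MatrixXcd C = MatrixXcd::Random(m, n);

  // Reference GEMM, as in blasGEMMEigenVerify
  MatrixXcd C_ref = alpha * A * B + beta * C;

  // Stand-in for the GPU result: the reference plus a tiny perturbation
  MatrixXcd C_gpu = C_ref + std::complex<double>(1e-13, 0.0) * MatrixXcd::Random(m, n);

  // Frobenius norm of the residual, relative to the reference norm
  double deviation = (C_gpu - C_ref).norm();
  double relative_deviation = deviation / C_ref.norm();
  std::printf("Frobenius norm = %e, relative deviation = %e\n", deviation, relative_deviation);
  return 0;
}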