QUDA  v1.1.0
A library for QCD on GPUs
eigensolve_quda.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <iostream>
5 #include <vector>
6 #include <algorithm>
7 
8 #include <quda_internal.h>
9 #include <eigensolve_quda.h>
10 #include <qio_field.h>
11 #include <color_spinor_field.h>
12 #include <blas_quda.h>
13 #include <util_quda.h>
14 #include <vector_io.h>
15 #include <eigen_helper.h>
16 
17 namespace quda
18 {
19 
20  // Eigensolver class
21  //-----------------------------------------------------------------------------
23  mat(mat),
24  eig_param(eig_param),
25  profile(profile),
26  tmp1(nullptr),
27  tmp2(nullptr)
28  {
29  bool profile_running = profile.isRunning(QUDA_PROFILE_INIT);
30  if (!profile_running) profile.TPSTART(QUDA_PROFILE_INIT);
31 
33 
34  // Problem parameters
35  n_ev = eig_param->n_ev;
36  n_kr = eig_param->n_kr;
39  tol = eig_param->tol;
40  reverse = false;
41 
42  // Algorithm variables
43  converged = false;
44  restart_iter = 0;
49  iter = 0;
50  iter_converged = 0;
51  iter_locked = 0;
52  iter_keep = 0;
53  num_converged = 0;
54  num_locked = 0;
55  num_keep = 0;
56 
58 
59  // Sanity checks
60  if (n_kr <= n_ev) errorQuda("n_kr = %d is less than or equal to n_ev = %d", n_kr, n_ev);
61  if (n_ev < n_conv) errorQuda("n_conv=%d is greater than n_ev=%d", n_conv, n_ev);
62  if (n_ev == 0) errorQuda("n_ev=0 passed to Eigensolver");
63  if (n_kr == 0) errorQuda("n_kr=0 passed to Eigensolver");
64  if (n_conv == 0) errorQuda("n_conv=0 passed to Eigensolver");
65  if (n_ev_deflate > n_conv) errorQuda("deflation vecs = %d is greater than n_conv = %d", n_ev_deflate, n_conv);
66 
67  residua.reserve(n_kr);
68  for (int i = 0; i < n_kr; i++) residua.push_back(0.0);
69 
70  // Part of the spectrum to be computed.
71  switch (eig_param->spectrum) {
72  case QUDA_SPECTRUM_LM_EIG: strcpy(spectrum, "LM"); break;
73  case QUDA_SPECTRUM_SM_EIG: strcpy(spectrum, "SM"); break;
74  case QUDA_SPECTRUM_LR_EIG: strcpy(spectrum, "LR"); break;
75  case QUDA_SPECTRUM_SR_EIG: strcpy(spectrum, "SR"); break;
76  case QUDA_SPECTRUM_LI_EIG: strcpy(spectrum, "LI"); break;
77  case QUDA_SPECTRUM_SI_EIG: strcpy(spectrum, "SI"); break;
78  default: errorQuda("Unexpected spectrum type %d", eig_param->spectrum);
79  }
80 
81  // Deduce whether to reverse the sorting
82  if (strncmp("L", spectrum, 1) == 0 && !eig_param->use_poly_acc) {
83  reverse = true;
84  } else if (strncmp("S", spectrum, 1) == 0 && eig_param->use_poly_acc) {
85  reverse = true;
86  spectrum[0] = 'L';
87  } else if (strncmp("L", spectrum, 1) == 0 && eig_param->use_poly_acc) {
88  reverse = true;
89  spectrum[0] = 'S';
90  }
91 
92  if (!profile_running) profile.TPSTOP(QUDA_PROFILE_INIT);
93  }
94 
95  // We bake the matrix operator 'mat' and the eigensolver parameters into the
96  // eigensolver.
98  {
99  EigenSolver *eig_solver = nullptr;
100 
101  switch (eig_param->eig_type) {
102  case QUDA_EIG_IR_ARNOLDI:
103  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating IR Arnoldi eigensolver\n");
104  eig_solver = new IRAM(mat, eig_param, profile);
105  break;
106  case QUDA_EIG_BLK_IR_ARNOLDI: errorQuda("Block IR Arnoldi not implemented");
107  case QUDA_EIG_TR_LANCZOS:
108  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating TR Lanczos eigensolver\n");
109  eig_solver = new TRLM(mat, eig_param, profile);
110  break;
112  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating Block TR Lanczos eigensolver\n");
113  eig_solver = new BLKTRLM(mat, eig_param, profile);
114  break;
115  default: errorQuda("Invalid eig solver type");
116  }
117 
118  // Sanity checks
119  //--------------------------------------------------------------------------
120  if (!mat.hermitian() && eig_solver->hermitian())
121  errorQuda("Cannot solve non-Hermitian system with strictly Hermitian eigensolver %d, %d", (int)!mat.hermitian(),
122  (int)eig_solver->hermitian());
123 
124  // Support for Chebyshev only in strictly Hermitian solvers
125  if (eig_param->use_poly_acc) {
126  if (!mat.hermitian()) errorQuda("Cannot use polynomial acceleration with non-Hermitian operator");
127  if (!eig_solver->hermitian()) errorQuda("Polynomial acceleration not supported with non-Hermitian solver");
128  }
129 
131  errorQuda("The imaginary spectrum of a Hermitian operator cannot be computed");
132  //--------------------------------------------------------------------------
133 
134  return eig_solver;
135  }
136 
137  // Utilities and functions common to all Eigensolver instances
138  //------------------------------------------------------------------------------
139  void EigenSolver::prepareInitialGuess(std::vector<ColorSpinorField *> &kSpace)
140  {
141  if (kSpace[0]->Location() == QUDA_CPU_FIELD_LOCATION) {
142  for (int b = 0; b < block_size; b++) {
143  if (sqrt(blas::norm2(*kSpace[b])) == 0.0) { kSpace[b]->Source(QUDA_RANDOM_SOURCE); }
144  }
145  } else {
146  RNG *rng = new RNG(*kSpace[0], 1234);
147  rng->Init();
148  for (int b = 0; b < block_size; b++) {
149  if (sqrt(blas::norm2(*kSpace[b])) == 0.0) { spinorNoise(*kSpace[b], *rng, QUDA_NOISE_UNIFORM); }
150  }
151  rng->Release();
152  delete rng;
153  }
154  bool orthed = false;
155  int k = 0, kmax = 5;
156  while (!orthed && k < kmax) {
157  orthonormalizeMGS(kSpace, block_size);
158  if (getVerbosity() >= QUDA_SUMMARIZE) {
159  if (block_size > 1)
160  printfQuda("Orthonormalising initial guesses with Modified Gram Schmidt, iter k=%d/5\n", (k + 1));
161  else
162  printfQuda("Orthonormalising initial guess\n");
163  }
164  orthed = orthoCheck(kSpace, block_size);
165  k++;
166  }
167  if (!orthed) errorQuda("Failed to orthonormalise initial guesses");
168  }
169 
170  void EigenSolver::checkChebyOpMax(const DiracMatrix &mat, std::vector<ColorSpinorField *> &kSpace)
171  {
172  if (eig_param->use_poly_acc && eig_param->a_max <= 0.0) {
173  // Use part of the kSpace as temps
174  eig_param->a_max = estimateChebyOpMax(mat, *kSpace[block_size + 2], *kSpace[block_size + 1]);
175  if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Chebyshev maximum estimate: %e.\n", eig_param->a_max);
176  }
177  }
178 
179  void EigenSolver::prepareKrylovSpace(std::vector<ColorSpinorField *> &kSpace, std::vector<Complex> &evals)
180  {
181  ColorSpinorParam csParamClone(*kSpace[0]);
182  // Increase Krylov space to n_kr+block_size vectors, create residual
183  kSpace.reserve(n_kr + block_size);
184  csParamClone.create = QUDA_ZERO_FIELD_CREATE;
185  for (int i = n_conv; i < n_kr + block_size; i++) kSpace.push_back(ColorSpinorField::Create(csParamClone));
186  for (int b = 0; b < block_size; b++) { r.push_back(ColorSpinorField::Create(csParamClone)); }
187  // Increase evals space to n_ev
188  evals.reserve(n_kr);
189  for (int i = n_conv; i < n_kr; i++) evals.push_back(0.0);
190  }
191 
193  {
194  if (getVerbosity() >= QUDA_SUMMARIZE) {
195  printfQuda("********************************\n");
196  printfQuda("**** START QUDA EIGENSOLVER ****\n");
197  printfQuda("********************************\n");
198  }
199 
200  if (getVerbosity() >= QUDA_VERBOSE) {
201  printfQuda("spectrum %s\n", spectrum);
202  printfQuda("tol %.4e\n", tol);
203  printfQuda("n_conv %d\n", n_conv);
204  printfQuda("n_ev %d\n", n_ev);
205  printfQuda("n_kr %d\n", n_kr);
206  if (block_size > 1) printfQuda("block size %d\n", block_size);
207  if (eig_param->use_poly_acc) {
208  printfQuda("polyDeg %d\n", eig_param->poly_deg);
209  printfQuda("a-min %f\n", eig_param->a_min);
210  printfQuda("a-max %f\n", eig_param->a_max);
211  }
212  }
213  }
214 
216  {
217  double eps = 0.0;
218  switch (prec) {
219  case QUDA_DOUBLE_PRECISION: eps = DBL_EPSILON; break;
220  case QUDA_SINGLE_PRECISION: eps = FLT_EPSILON; break;
221  case QUDA_HALF_PRECISION: eps = 2e-3; break;
222  case QUDA_QUARTER_PRECISION: eps = 5e-2; break;
223  default: errorQuda("Invalid precision %d", prec);
224  }
225  return eps;
226  }
227 
229  {
230  switch (prec) {
231  case QUDA_DOUBLE_PRECISION: printfQuda("Running eigensolver in double precision\n"); break;
232  case QUDA_SINGLE_PRECISION: printfQuda("Running eigensolver in single precision\n"); break;
233  case QUDA_HALF_PRECISION: printfQuda("Running eigensolver in half precision\n"); break;
234  case QUDA_QUARTER_PRECISION: printfQuda("Running eigensolver in quarter precision\n"); break;
235  default: errorQuda("Invalid precision %d", prec);
236  }
237  }
238 
239  void EigenSolver::cleanUpEigensolver(std::vector<ColorSpinorField *> &kSpace, std::vector<Complex> &evals)
240  {
241  for (int b = 0; b < block_size; b++) delete r[b];
242  r.resize(0);
243 
244  // Resize Krylov Space
245  for (unsigned int i = n_conv; i < kSpace.size(); i++) { delete kSpace[i]; }
246  kSpace.resize(n_conv);
247  evals.resize(n_conv);
248 
249  // Only save if outfile is defined
250  if (strcmp(eig_param->vec_outfile, "") != 0) {
251  if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("saving eigenvectors\n");
252  // Make an array of size n_conv
253  std::vector<ColorSpinorField *> vecs_ptr;
254  vecs_ptr.reserve(n_conv);
255  const QudaParity mat_parity = impliedParityFromMatPC(mat.getMatPCType());
256  // We may wish to compute vectors in high prec, but use in a lower
257  // prec. This allows the user to down copy the data for later use.
258  QudaPrecision prec = kSpace[0]->Precision();
259  if (save_prec < prec) {
260  ColorSpinorParam csParamClone(*kSpace[0]);
261  csParamClone.create = QUDA_REFERENCE_FIELD_CREATE;
262  csParamClone.setPrecision(save_prec);
263  for (unsigned int i = 0; i < kSpace.size(); i++) {
264  kSpace[i]->setSuggestedParity(mat_parity);
265  vecs_ptr.push_back(kSpace[i]->CreateAlias(csParamClone));
266  }
267  if (getVerbosity() >= QUDA_SUMMARIZE) {
268  printfQuda("kSpace successfully down copied from prec %d to prec %d\n", kSpace[0]->Precision(),
269  vecs_ptr[0]->Precision());
270  }
271  } else {
272  for (int i = 0; i < n_conv; i++) {
273  kSpace[i]->setSuggestedParity(mat_parity);
274  vecs_ptr.push_back(kSpace[i]);
275  }
276  }
277  // save the vectors
279  io.save(vecs_ptr);
280  for (unsigned int i = 0; i < kSpace.size() && save_prec < prec; i++) delete vecs_ptr[i];
281  }
282 
283  // Save TRLM tuning
284  saveTuneCache();
285 
286  mat.flops();
287 
288  if (getVerbosity() >= QUDA_SUMMARIZE) {
289  printfQuda("********************************\n");
290  printfQuda("***** END QUDA EIGENSOLVER *****\n");
291  printfQuda("********************************\n");
292  }
293  }
294 
296  {
297  if (!tmp1 || !tmp2) {
301  }
302  mat(out, in, *tmp1, *tmp2);
303 
304  // Save matrix * vector tuning
305  saveTuneCache();
306  }
307 
309  {
310  // Just do a simple matVec if no poly acc is requested
311  if (!eig_param->use_poly_acc) {
312  matVec(mat, out, in);
313  return;
314  }
315 
316  if (eig_param->poly_deg == 0) { errorQuda("Polynomial acceleration requested with zero polynomial degree"); }
317 
318  // Compute the polynomial accelerated operator.
319  double a = eig_param->a_min;
320  double b = eig_param->a_max;
321  double delta = (b - a) / 2.0;
322  double theta = (b + a) / 2.0;
323  double sigma1 = -delta / theta;
324  double sigma;
325  double d1 = sigma1 / delta;
326  double d2 = 1.0;
327  double d3;
328 
329  // out = d2 * in + d1 * out
330  // C_1(x) = x
331  matVec(mat, out, in);
332  blas::caxpby(d2, const_cast<ColorSpinorField &>(in), d1, out);
333  if (eig_param->poly_deg == 1) return;
334 
335  // C_0 is the current 'in' vector.
336  // C_1 is the current 'out' vector.
337 
338  // Clone 'in' to two temporary vectors.
341 
342  blas::copy(*tmp1, in);
343  blas::copy(*tmp2, out);
344 
345  // Using Chebyshev polynomial recursion relation,
346  // C_{m+1}(x) = 2*x*C_{m} - C_{m-1}
347 
348  double sigma_old = sigma1;
349 
350  // construct C_{m+1}(x)
351  for (int i = 2; i < eig_param->poly_deg; i++) {
352  sigma = 1.0 / (2.0 / sigma1 - sigma_old);
353 
354  d1 = 2.0 * sigma / delta;
355  d2 = -d1 * theta;
356  d3 = -sigma * sigma_old;
357 
358  // FIXME - we could introduce a fused matVec + blas kernel here, eliminating one temporary
359  // mat*C_{m}(x)
360  matVec(mat, out, *tmp2);
361 
362  Complex d1c(d1, 0.0);
363  Complex d2c(d2, 0.0);
364  Complex d3c(d3, 0.0);
365  blas::caxpbypczw(d3c, *tmp1, d2c, *tmp2, d1c, out, *tmp1);
366  std::swap(tmp1, tmp2);
367 
368  sigma_old = sigma;
369  }
370  blas::copy(out, *tmp2);
371 
372  delete tmp1;
373  delete tmp2;
374 
375  // Save Chebyshev tuning
376  saveTuneCache();
377  }
378 
380  {
381 
382  if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
384  } else {
385  RNG *rng = new RNG(in, 1234);
386  rng->Init();
387  spinorNoise(in, *rng, QUDA_NOISE_UNIFORM);
388  rng->Release();
389  delete rng;
390  }
391 
392  ColorSpinorField *in_ptr = &in;
393  ColorSpinorField *out_ptr = &out;
394 
395  // Power iteration
396  double norm = 0.0;
397  for (int i = 0; i < 100; i++) {
398  if ((i + 1) % 10 == 0) {
399  norm = sqrt(blas::norm2(*in_ptr));
400  blas::ax(1.0 / norm, *in_ptr);
401  }
402  matVec(mat, *out_ptr, *in_ptr);
403  std::swap(out_ptr, in_ptr);
404  }
405 
406  // Compute spectral radius estimate
407  double result = blas::reDotProduct(*out_ptr, *in_ptr);
408 
409  // Increase final result by 10% for safety
410  return result * 1.10;
411 
412  // Save Chebyshev Max tuning
413  saveTuneCache();
414  }
415 
416  bool EigenSolver::orthoCheck(std::vector<ColorSpinorField *> vecs, int size)
417  {
418  bool orthed = true;
419  const Complex Unit(1.0, 0.0);
420  std::vector<ColorSpinorField *> vecs_ptr;
421  vecs_ptr.reserve(size);
422  for (int i = 0; i < size; i++) vecs_ptr.push_back(vecs[i]);
423 
424  std::vector<Complex> H(size * size);
425  blas::hDotProduct(H.data(), vecs_ptr, vecs_ptr);
426 
427  double epsilon = setEpsilon(vecs[0]->Precision());
428 
429  for (int i = 0; i < size; i++) {
430  for (int j = 0; j < size; j++) {
431  auto cnorm = H[i * size + j];
432  if (j != i) {
433  if (abs(cnorm) > 5.0 * epsilon) {
434  if (getVerbosity() >= QUDA_SUMMARIZE)
435  printfQuda("Norm <%d|%d>^2 = ||(%e,%e)|| = %e\n", i, j, cnorm.real(), cnorm.imag(), abs(cnorm));
436  orthed = false;
437  }
438  } else {
439  if (abs(Unit - cnorm) > 5.0 * epsilon) {
440  if (getVerbosity() >= QUDA_SUMMARIZE)
441  printfQuda("1 - Norm <%d|%d>^2 = 1 - ||(%e,%e)|| = %e\n", i, j, cnorm.real(), cnorm.imag(),
442  abs(Unit - cnorm));
443  orthed = false;
444  }
445  }
446  }
447  }
448 
449  return orthed;
450  }
451 
452  void EigenSolver::orthonormalizeMGS(std::vector<ColorSpinorField *> &vecs, int size)
453  {
454  std::vector<ColorSpinorField *> vecs_ptr;
455  vecs_ptr.reserve(size);
456  for (int i = 0; i < size; i++) vecs_ptr.push_back(vecs[i]);
457 
458  for (int i = 0; i < size; i++) {
459  for (int j = 0; j < i; j++) {
460  Complex cnorm = blas::cDotProduct(*vecs_ptr[j], *vecs_ptr[i]); // <j|i> with i normalised.
461  blas::caxpy(-cnorm, *vecs_ptr[j], *vecs_ptr[i]); // i = i - proj_{j}(i) = i - <j|i> * i
462  }
463  double norm = sqrt(blas::norm2(*vecs_ptr[i]));
464  blas::ax(1.0 / norm, *vecs_ptr[i]); // i/<i|i>
465  }
466  }
467 
468  // Orthogonalise r[0:] against V_[0:j]
469  void EigenSolver::blockOrthogonalize(std::vector<ColorSpinorField *> vecs, std::vector<ColorSpinorField *> &rvecs, int j)
470  {
471  int vecs_size = j;
472  int r_size = (int)rvecs.size();
473  int array_size = vecs_size * r_size;
474  std::vector<Complex> s(array_size);
475 
476  std::vector<ColorSpinorField *> vecs_ptr;
477  vecs_ptr.reserve(vecs_size);
478  for (int i = 0; i < vecs_size; i++) { vecs_ptr.push_back(vecs[i]); }
479 
480  // Block dot products stored in s.
481  blas::cDotProduct(s.data(), vecs_ptr, rvecs);
482 
483  // Block orthogonalise
484  for (int i = 0; i < array_size; i++) s[i] *= -1.0;
485  blas::caxpy(s.data(), vecs_ptr, rvecs);
486 
487  // Save orthonormalisation tuning
488  saveTuneCache();
489  }
490 
491  void EigenSolver::permuteVecs(std::vector<ColorSpinorField *> &kSpace, int *mat, int size)
492  {
493  std::vector<int> pivots(size);
494  for (int i = 0; i < size; i++) {
495  for (int j = 0; j < size; j++) {
496  if (mat[j * size + i] == 1) { pivots[j] = i; }
497  }
498  }
499 
500  // Identify cycles in the permutation array.
501  // We shall use the sign bit as a marker. If the
502  // sign is negative, the vector has already been
503  // swapped into the correct place. A positive
504  // value indicates the start of a new cycle.
505 
506  for (int i = 0; i < size; i++) {
507  // First cycle always starts at 0, hence OR statement
508  if (pivots[i] > 0 || i == 0) {
509  int k = i;
510  // Identify vector to be placed at i
511  int j = pivots[i];
512  pivots[i] = -pivots[i];
513  while (j > i) {
514  std::swap(kSpace[k + num_locked], kSpace[j + num_locked]);
515  pivots[j] = -pivots[j];
516  k = j;
517  j = -pivots[j];
518  }
519  }
520  }
521  // Sanity check
522  for (int i = 0; i < size; i++) {
523  if (pivots[i] > 0) errorQuda("Error at pivot %d", i);
524  }
525  }
526 
527  void EigenSolver::blockRotate(std::vector<ColorSpinorField *> &kSpace, double *array, int rank, const range &i_range,
528  const range &j_range, blockType b_type)
529  {
530  int block_i_rank = i_range.second - i_range.first;
531  int block_j_rank = j_range.second - j_range.first;
532 
533  // Quick return if no op.
534  if (block_i_rank == 0 || block_j_rank == 0) return;
535 
536  // Pointers to the relevant vectors
537  std::vector<ColorSpinorField *> vecs_ptr;
538  std::vector<ColorSpinorField *> kSpace_ptr;
539 
540  // Alias the vectors we wish to keep
541  vecs_ptr.reserve(block_i_rank);
542  for (int i = i_range.first; i < i_range.second; i++) { vecs_ptr.push_back(kSpace[num_locked + i]); }
543  // Alias the extra space vectors
544  kSpace_ptr.reserve(block_j_rank);
545  for (int j = j_range.first; j < j_range.second; j++) {
546  int k = n_kr + 1 + j - j_range.first;
547  kSpace_ptr.push_back(kSpace[k]);
548  }
549 
550  double *batch_array = (double *)safe_malloc((block_i_rank * block_j_rank) * sizeof(double));
551  // Populate batch array (COLUMN major -> ROW major)
552  for (int j = j_range.first; j < j_range.second; j++) {
553  for (int i = i_range.first; i < i_range.second; i++) {
554  int j_arr = j - j_range.first;
555  int i_arr = i - i_range.first;
556  batch_array[i_arr * block_j_rank + j_arr] = array[j * rank + i];
557  }
558  }
559  switch (b_type) {
560  case PENCIL: blas::axpy(batch_array, vecs_ptr, kSpace_ptr); break;
561  case LOWER_TRI: blas::axpy_L(batch_array, vecs_ptr, kSpace_ptr); break;
562  case UPPER_TRI: blas::axpy_U(batch_array, vecs_ptr, kSpace_ptr); break;
563  default: errorQuda("Undefined MultiBLAS type in blockRotate");
564  }
565  host_free(batch_array);
566 
567  // Save Krylov block rotation tuning
568  saveTuneCache();
569  }
570 
571  void EigenSolver::blockReset(std::vector<ColorSpinorField *> &kSpace, int j_start, int j_end, int offset)
572  {
573  // copy back to correct position, zero out the workspace
574  for (int j = j_start; j < j_end; j++) {
575  int k = offset + j - j_start;
576  std::swap(kSpace[j + num_locked], kSpace[k]);
577  blas::zero(*kSpace[k]);
578  }
579  }
580 
581  void EigenSolver::blockRotateComplex(std::vector<ColorSpinorField *> &kSpace, Complex *array, int rank,
582  const range &i_range, const range &j_range, blockType b_type, int offset)
583  {
584  int block_i_rank = i_range.second - i_range.first;
585  int block_j_rank = j_range.second - j_range.first;
586 
587  // Quick return if no op.
588  if (block_i_rank == 0 || block_j_rank == 0) return;
589 
590  // Pointers to the relevant vectors
591  std::vector<ColorSpinorField *> vecs_ptr;
592  std::vector<ColorSpinorField *> kSpace_ptr;
593 
594  // Alias the vectors we wish to keep
595  vecs_ptr.reserve(block_i_rank);
596  for (int i = i_range.first; i < i_range.second; i++) { vecs_ptr.push_back(kSpace[num_locked + i]); }
597  // Alias the extra space vectors
598  kSpace_ptr.reserve(block_j_rank);
599  for (int j = j_range.first; j < j_range.second; j++) {
600  int k = offset + j - j_range.first;
601  kSpace_ptr.push_back(kSpace[k]);
602  }
603 
604  Complex *batch_array = (Complex *)safe_malloc((block_i_rank * block_j_rank) * sizeof(Complex));
605  // Populate batch array (COLUM major -> ROW major)
606  for (int j = j_range.first; j < j_range.second; j++) {
607  for (int i = i_range.first; i < i_range.second; i++) {
608  int j_arr = j - j_range.first;
609  int i_arr = i - i_range.first;
610  batch_array[i_arr * block_j_rank + j_arr] = array[j * rank + i];
611  }
612  }
613  switch (b_type) {
614  case PENCIL: blas::caxpy(batch_array, vecs_ptr, kSpace_ptr); break;
615  case LOWER_TRI: blas::caxpy_L(batch_array, vecs_ptr, kSpace_ptr); break;
616  case UPPER_TRI: blas::caxpy_U(batch_array, vecs_ptr, kSpace_ptr); break;
617  default: errorQuda("Undefined MultiBLAS type in blockRotate");
618  }
619  host_free(batch_array);
620 
621  // Save Krylov block rotation tuning
622  saveTuneCache();
623  }
624 
625  void EigenSolver::computeSVD(const DiracMatrix &mat, std::vector<ColorSpinorField *> &evecs, std::vector<Complex> &evals)
626  {
627  if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Computing SVD of M\n");
628 
629  int n_conv = eig_param->n_conv;
630  if (evecs.size() != (unsigned int)(2 * n_conv))
631  errorQuda("Incorrect deflation space sized %d passed to computeSVD, expected %d", (int)(evecs.size()), 2 * n_conv);
632 
633  std::vector<double> sigma_tmp(n_conv);
634 
635  for (int i = 0; i < n_conv; i++) {
636 
637  // This function assumes that you have computed the eigenvectors
638  // of MdagM(MMdag), ie, the right(left) SVD of M. The ith eigen vector in the
639  // array corresponds to the ith right(left) singular vector. We place the
640  // computed left(right) singular vectors in the second half of the array. We
641  // assume that right vectors are given and we compute the left.
642  //
643  // As a cross check, we recompute the singular values from mat vecs rather
644  // than make the direct relation (sigma_i)^2 = |lambda_i|
645  //--------------------------------------------------------------------------
646 
647  // Lambda already contains the square root of the eigenvalue of the norm op.
648  Complex lambda = evals[i];
649 
650  // M*Rev_i = M*Rsv_i = sigma_i Lsv_i
651  mat.Expose()->M(*evecs[n_conv + i], *evecs[i]);
652 
653  // sigma_i = sqrt(sigma_i (Lsv_i)^dag * sigma_i * Lsv_i )
654  sigma_tmp[i] = sqrt(blas::norm2(*evecs[n_conv + i]));
655 
656  // Normalise the Lsv: sigma_i Lsv_i -> Lsv_i
657  blas::ax(1.0 / sigma_tmp[i], *evecs[n_conv + i]);
658 
659  if (getVerbosity() >= QUDA_SUMMARIZE)
660  printfQuda("Sval[%04d] = %+.16e sigma - sqrt(|lambda|) = %+.16e\n", i, sigma_tmp[i],
661  sigma_tmp[i] - sqrt(abs(lambda.real())));
662 
663  evals[i] = sigma_tmp[i];
664  //--------------------------------------------------------------------------
665  }
666 
667  // Save SVD tuning
668  saveTuneCache();
669  }
670 
671  // Deflate vec, place result in vec_defl
672  void EigenSolver::deflateSVD(std::vector<ColorSpinorField *> &sol, const std::vector<ColorSpinorField *> &src,
673  const std::vector<ColorSpinorField *> &evecs, const std::vector<Complex> &evals,
674  bool accumulate) const
675  {
676  // number of evecs
677  if (n_ev_deflate == 0) {
678  warningQuda("deflateSVD called with n_ev_deflate = 0");
679  return;
680  }
681 
682  int n_defl = n_ev_deflate;
683  if (evecs.size() != (unsigned int)(2 * eig_param->n_conv))
684  errorQuda("Incorrect deflation space sized %d passed to deflateSVD, expected %d", (int)(evecs.size()),
685  2 * eig_param->n_conv);
686 
687  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Deflating %d left and right singular vectors\n", n_defl);
688 
689  // Perform Sum_i R_i * (\sigma_i)^{-1} * L_i^dag * vec = vec_defl
690  // for all i computed eigenvectors and values.
691 
692  // 1. Take block inner product: L_i^dag * vec = A_i
693  std::vector<ColorSpinorField *> left_vecs;
694  left_vecs.reserve(n_defl);
695  for (int i = eig_param->n_conv; i < eig_param->n_conv + n_defl; i++) left_vecs.push_back(evecs[i]);
696 
697  std::vector<Complex> s(n_defl * src.size());
698  std::vector<ColorSpinorField *> src_ = const_cast<decltype(src) &>(src);
699  blas::cDotProduct(s.data(), left_vecs, src_);
700 
701  // 2. Perform block caxpy
702  // A_i -> (\sigma_i)^{-1} * A_i
703  // vec_defl = Sum_i (R_i)^{-1} * A_i
704  if (!accumulate)
705  for (auto &x : sol) blas::zero(*x);
706  std::vector<ColorSpinorField *> right_vecs;
707  right_vecs.reserve(n_defl);
708  for (int i = 0; i < n_defl; i++) {
709  right_vecs.push_back(evecs[i]);
710  s[i] /= evals[i].real();
711  }
712  blas::caxpy(s.data(), right_vecs, sol);
713 
714  // Save SVD deflation tuning
715  saveTuneCache();
716  }
717 
718  void EigenSolver::computeEvals(const DiracMatrix &mat, std::vector<ColorSpinorField *> &evecs,
719  std::vector<Complex> &evals, int size)
720  {
721  if (size > (int)evecs.size())
722  errorQuda("Requesting %d eigenvectors with only storage allocated for %lu", size, evecs.size());
723  if (size > (int)evals.size())
724  errorQuda("Requesting %d eigenvalues with only storage allocated for %lu", size, evals.size());
725 
726  ColorSpinorParam csParamClone(*evecs[0]);
727  std::vector<ColorSpinorField *> temp;
728  temp.push_back(ColorSpinorField::Create(csParamClone));
729 
730  for (int i = 0; i < size; i++) {
731  // r = A * v_i
732  matVec(mat, *temp[0], *evecs[i]);
733 
734  // lambda_i = v_i^dag A v_i / (v_i^dag * v_i)
735  evals[i] = blas::cDotProduct(*evecs[i], *temp[0]) / sqrt(blas::norm2(*evecs[i]));
736  // Measure ||lambda_i*v_i - A*v_i||
737  Complex n_unit(-1.0, 0.0);
738  blas::caxpby(evals[i], *evecs[i], n_unit, *temp[0]);
739  residua[i] = sqrt(blas::norm2(*temp[0]));
740 
741  // If size = n_conv, this routine is called post sort
742  if (getVerbosity() >= QUDA_SUMMARIZE && size == n_conv)
743  printfQuda("Eval[%04d] = (%+.16e,%+.16e) residual = %+.16e\n", i, evals[i].real(), evals[i].imag(), residua[i]);
744  }
745  delete temp[0];
746 
747  // Save Eval tuning
748  saveTuneCache();
749  }
750 
751  // Deflate vec, place result in vec_defl
752  void EigenSolver::deflate(std::vector<ColorSpinorField *> &sol, const std::vector<ColorSpinorField *> &src,
753  const std::vector<ColorSpinorField *> &evecs, const std::vector<Complex> &evals,
754  bool accumulate) const
755  {
756  // number of evecs
757  if (n_ev_deflate == 0) {
758  warningQuda("deflate called with n_ev_deflate = 0");
759  return;
760  }
761 
762  int n_defl = n_ev_deflate;
763 
764  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Deflating %d vectors\n", n_defl);
765 
766  // Perform Sum_i V_i * (L_i)^{-1} * (V_i)^dag * vec = vec_defl
767  // for all i computed eigenvectors and values.
768 
769  // Pointers to the required Krylov space vectors,
770  // no extra memory is allocated.
771  std::vector<ColorSpinorField *> eig_vecs;
772  eig_vecs.reserve(n_defl);
773  for (int i = 0; i < n_defl; i++) eig_vecs.push_back(evecs[i]);
774 
775  // 1. Take block inner product: (V_i)^dag * vec = A_i
776  std::vector<Complex> s(n_defl * src.size());
777  std::vector<ColorSpinorField *> src_ = const_cast<decltype(src) &>(src);
778  blas::cDotProduct(s.data(), eig_vecs, src_);
779 
780  // 2. Perform block caxpy: V_i * (L_i)^{-1} * A_i
781  for (int i = 0; i < n_defl; i++) { s[i] /= evals[i].real(); }
782 
783  // 3. Accumulate sum vec_defl = Sum_i V_i * (L_i)^{-1} * A_i
784  if (!accumulate)
785  for (auto &x : sol) blas::zero(*x);
786  blas::caxpy(s.data(), eig_vecs, sol);
787 
788  // Save Deflation tuning
789  saveTuneCache();
790  }
791 
792  void EigenSolver::loadFromFile(const DiracMatrix &mat, std::vector<ColorSpinorField *> &kSpace,
793  std::vector<Complex> &evals)
794  {
795  // Set suggested parity of fields
796  const QudaParity mat_parity = impliedParityFromMatPC(mat.getMatPCType());
797  for (int i = 0; i < n_conv; i++) { kSpace[i]->setSuggestedParity(mat_parity); }
798 
799  // Make an array of size n_conv
800  std::vector<ColorSpinorField *> vecs_ptr;
801  vecs_ptr.reserve(n_conv);
802  for (int i = 0; i < n_conv; i++) { vecs_ptr.push_back(kSpace[i]); }
803 
804  {
805  // load the vectors
807  io.load(vecs_ptr);
808  }
809 
810  // Create the device side residual vector by cloning
811  // the kSpace passed to the function.
812  ColorSpinorParam csParam(*kSpace[0]);
814  r.push_back(ColorSpinorField::Create(csParam));
815 
816  // Error estimates (residua) given by ||A*vec - lambda*vec||
817  computeEvals(mat, kSpace, evals);
818  delete r[0];
819  }
820 
821  void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector<Complex> &x, std::vector<Complex> &y)
822  {
823 
824  // 'LM' -> sort into increasing order of magnitude.
825  // 'SM' -> sort into decreasing order of magnitude.
826  // 'LR' -> sort with real(x) in increasing algebraic order
827  // 'SR' -> sort with real(x) in decreasing algebraic order
828  // 'LI' -> sort with imag(x) in increasing algebraic order
829  // 'SI' -> sort with imag(x) in decreasing algebraic order
830 
831  std::vector<std::pair<Complex, Complex>> array(n);
832  for (int i = 0; i < n; i++) array[i] = std::make_pair(x[i], y[i]);
833 
834  switch (spec_type) {
836  std::sort(array.begin(), array.begin() + n,
837  [](const std::pair<Complex, Complex> &a, const std::pair<Complex, Complex> &b) {
838  return (abs(a.first) < abs(b.first));
839  });
840  break;
842  std::sort(array.begin(), array.begin() + n,
843  [](const std::pair<Complex, Complex> &a, const std::pair<Complex, Complex> &b) {
844  return (abs(a.first) > abs(b.first));
845  });
846  break;
848  std::sort(array.begin(), array.begin() + n,
849  [](const std::pair<Complex, Complex> &a, const std::pair<Complex, Complex> &b) {
850  return (a.first).real() < (b.first).real();
851  });
852  break;
854  std::sort(array.begin(), array.begin() + n,
855  [](const std::pair<Complex, Complex> &a, const std::pair<Complex, Complex> &b) {
856  return (a.first).real() > (b.first).real();
857  });
858  break;
860  std::sort(array.begin(), array.begin() + n,
861  [](const std::pair<Complex, Complex> &a, const std::pair<Complex, Complex> &b) {
862  return (a.first).imag() < (b.first).imag();
863  });
864  break;
866  std::sort(array.begin(), array.begin() + n,
867  [](const std::pair<Complex, Complex> &a, const std::pair<Complex, Complex> &b) {
868  return (a.first).imag() > (b.first).imag();
869  });
870  break;
871  default: errorQuda("Undefined spectrum type %d given", spec_type);
872  }
873 
874  // Repopulate x and y arrays with sorted elements
875  for (int i = 0; i < n; i++) {
876  x[i] = array[i].first;
877  y[i] = array[i].second;
878  }
879  }
880 
881  // Overloaded version of sortArrays to deal with real y array.
882  void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector<Complex> &x, std::vector<double> &y)
883  {
884 
885  std::vector<Complex> y_tmp(n, 0.0);
886  for (int i = 0; i < n; i++) y_tmp[i].real(y[i]);
887  sortArrays(spec_type, n, x, y_tmp);
888  for (int i = 0; i < n; i++) y[i] = y_tmp[i].real();
889  }
890 
891  // Overloaded version of sortArrays to deal with real x array.
892  void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector<double> &x, std::vector<Complex> &y)
893  {
894 
895  std::vector<Complex> x_tmp(n, 0.0);
896  for (int i = 0; i < n; i++) x_tmp[i].real(x[i]);
897  sortArrays(spec_type, n, x_tmp, y);
898  for (int i = 0; i < n; i++) x[i] = x_tmp[i].real();
899  }
900 
901  // Overloaded version of sortArrays to deal with real x and y array.
902  void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector<double> &x, std::vector<double> &y)
903  {
904 
905  std::vector<Complex> x_tmp(n, 0.0);
906  std::vector<Complex> y_tmp(n, 0.0);
907  for (int i = 0; i < n; i++) {
908  x_tmp[i].real(x[i]);
909  y_tmp[i].real(y[i]);
910  }
911  sortArrays(spec_type, n, x_tmp, y_tmp);
912  for (int i = 0; i < n; i++) {
913  x[i] = x_tmp[i].real();
914  y[i] = y_tmp[i].real();
915  }
916  }
917 
918  void EigenSolver::rotateVecsComplex(std::vector<ColorSpinorField *> &kSpace, const Complex *rot_array, const int offset,
919  const int dim, const int keep, const int locked, TimeProfile &profile)
920  {
921  // If we have memory availible, do the entire rotation
922  if (batched_rotate <= 0 || batched_rotate >= keep) {
923  if ((int)kSpace.size() < offset + keep) {
924  ColorSpinorParam csParamClone(*kSpace[0]);
925  csParamClone.create = QUDA_ZERO_FIELD_CREATE;
926  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Resizing kSpace to %d vectors\n", n_kr + keep);
927  kSpace.reserve(offset + keep);
928  for (int i = kSpace.size(); i < offset + keep; i++) {
929  kSpace.push_back(ColorSpinorField::Create(csParamClone));
930  }
931  }
932 
933  // Pointers to the relevant vectors
934  std::vector<ColorSpinorField *> vecs_ptr;
935  std::vector<ColorSpinorField *> kSpace_ptr;
936 
937  // Alias the extra space vectors, zero the workspace
938  kSpace_ptr.reserve(keep);
939  for (int i = 0; i < keep; i++) {
940  kSpace_ptr.push_back(kSpace[offset + i]);
941  blas::zero(*kSpace_ptr[i]);
942  }
943 
944  // Alias the vectors we wish to compress.
945  vecs_ptr.reserve(dim);
946  for (int j = 0; j < dim; j++) vecs_ptr.push_back(kSpace[locked + j]);
947 
948  // multiBLAS caxpy
949  profile.TPSTART(QUDA_PROFILE_COMPUTE);
950  blas::caxpy(rot_array, vecs_ptr, kSpace_ptr);
952 
953  // Copy compressed Krylov
954  for (int i = 0; i < keep; i++) std::swap(kSpace[locked + i], kSpace[offset + i]);
955 
956  } else {
957 
958  // Do batched rotation to save on memory
959  int batch_size = batched_rotate;
960  int full_batches = keep / batch_size;
961  int batch_size_r = keep % batch_size;
962  bool do_batch_remainder = (batch_size_r != 0 ? true : false);
963 
964  if ((int)kSpace.size() < offset + batch_size) {
965  ColorSpinorParam csParamClone(*kSpace[0]);
966  csParamClone.create = QUDA_ZERO_FIELD_CREATE;
967  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Resizing kSpace to %d vectors\n", offset + batch_size);
968  kSpace.reserve(offset + batch_size);
969  for (int i = kSpace.size(); i < offset + batch_size; i++) {
970  kSpace.push_back(ColorSpinorField::Create(csParamClone));
971  }
972  }
973 
974  profile.TPSTART(QUDA_PROFILE_EIGENLU);
975  MatrixXcd mat = MatrixXcd::Zero(dim, keep);
976  for (int j = 0; j < keep; j++)
977  for (int i = 0; i < dim; i++) mat(i, j) = rot_array[i * keep + j];
978 
979  FullPivLU<MatrixXcd> matLU(mat);
980 
981  // Extract the upper triangular matrix
982  MatrixXcd matUpper = MatrixXcd::Zero(keep, keep);
983  matUpper = matLU.matrixLU().triangularView<Eigen::Upper>();
984  matUpper.conservativeResize(keep, keep);
985 
986  // Extract the lower triangular matrix
987  MatrixXcd matLower = MatrixXcd::Identity(dim, dim);
988  matLower.block(0, 0, dim, keep).triangularView<Eigen::StrictlyLower>() = matLU.matrixLU();
989  matLower.conservativeResize(dim, keep);
990 
991  // Extract the desired permutation matrices
992  MatrixXi matP = MatrixXi::Zero(dim, dim);
993  MatrixXi matQ = MatrixXi::Zero(keep, keep);
994  matP = matLU.permutationP().inverse();
995  matQ = matLU.permutationQ().inverse();
997 
998  profile.TPSTART(QUDA_PROFILE_COMPUTE);
999  // Compute V * A = V * PLUQ
1000 
1001  // Do P Permute
1002  //---------------------------------------------------------------------------
1003  permuteVecs(kSpace, matP.data(), dim);
1004 
1005  // Do L Multiply
1006  //---------------------------------------------------------------------------
1007  // Loop over full batches
1008  for (int b = 0; b < full_batches; b++) {
1009 
1010  // batch triangle
1011  blockRotateComplex(kSpace, matLower.data(), dim, {b * batch_size, (b + 1) * batch_size},
1012  {b * batch_size, (b + 1) * batch_size}, LOWER_TRI, offset);
1013  // batch pencil
1014  blockRotateComplex(kSpace, matLower.data(), dim, {(b + 1) * batch_size, dim},
1015  {b * batch_size, (b + 1) * batch_size}, PENCIL, offset);
1016  blockReset(kSpace, b * batch_size, (b + 1) * batch_size, offset);
1017  }
1018  if (do_batch_remainder) {
1019  // remainder triangle
1020  blockRotateComplex(kSpace, matLower.data(), dim, {full_batches * batch_size, keep},
1021  {full_batches * batch_size, keep}, LOWER_TRI, offset);
1022  // remainder pencil
1023  if (keep < dim) {
1024  blockRotateComplex(kSpace, matLower.data(), dim, {keep, dim}, {full_batches * batch_size, keep}, PENCIL,
1025  offset);
1026  }
1027  blockReset(kSpace, full_batches * batch_size, keep, offset);
1028  }
1029 
1030  // Do U Multiply
1031  //---------------------------------------------------------------------------
1032  if (do_batch_remainder) {
1033  // remainder triangle
1034  blockRotateComplex(kSpace, matUpper.data(), keep, {full_batches * batch_size, keep},
1035  {full_batches * batch_size, keep}, UPPER_TRI, offset);
1036  // remainder pencil
1037  blockRotateComplex(kSpace, matUpper.data(), keep, {0, full_batches * batch_size},
1038  {full_batches * batch_size, keep}, PENCIL, offset);
1039  blockReset(kSpace, full_batches * batch_size, keep, offset);
1040  }
1041 
1042  // Loop over full batches
1043  for (int b = full_batches - 1; b >= 0; b--) {
1044  // batch triangle
1045  blockRotateComplex(kSpace, matUpper.data(), keep, {b * batch_size, (b + 1) * batch_size},
1046  {b * batch_size, (b + 1) * batch_size}, UPPER_TRI, offset);
1047  if (b > 0) {
1048  // batch pencil
1049  blockRotateComplex(kSpace, matUpper.data(), keep, {0, b * batch_size}, {b * batch_size, (b + 1) * batch_size},
1050  PENCIL, offset);
1051  }
1052  blockReset(kSpace, b * batch_size, (b + 1) * batch_size, offset);
1053  }
1054 
1055  // Do Q Permute
1056  //---------------------------------------------------------------------------
1057  permuteVecs(kSpace, matQ.data(), keep);
1058  profile.TPSTOP(QUDA_PROFILE_COMPUTE);
1059  }
1060  }
1061 
1062  void EigenSolver::rotateVecs(std::vector<ColorSpinorField *> &kSpace, const double *rot_array, const int offset,
1063  const int dim, const int keep, const int locked, TimeProfile &profile)
1064  {
1065  // If we have memory availible, do the entire rotation
1066  if (batched_rotate <= 0 || batched_rotate >= keep) {
1067  if ((int)kSpace.size() < offset + keep) {
1068  ColorSpinorParam csParamClone(*kSpace[0]);
1069  csParamClone.create = QUDA_ZERO_FIELD_CREATE;
1070  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Resizing kSpace to %d vectors\n", offset + keep);
1071  kSpace.reserve(offset + keep);
1072  for (int i = kSpace.size(); i < offset + keep; i++) {
1073  kSpace.push_back(ColorSpinorField::Create(csParamClone));
1074  }
1075  }
1076 
1077  // Pointers to the relevant vectors
1078  std::vector<ColorSpinorField *> vecs_ptr;
1079  std::vector<ColorSpinorField *> kSpace_ptr;
1080 
1081  // Alias the extra space vectors, zero the workspace
1082  kSpace_ptr.reserve(keep);
1083  for (int i = 0; i < keep; i++) {
1084  kSpace_ptr.push_back(kSpace[offset + i]);
1085  blas::zero(*kSpace_ptr[i]);
1086  }
1087 
1088  // Alias the vectors we wish to keep.
1089  vecs_ptr.reserve(dim);
1090  for (int j = 0; j < dim; j++) vecs_ptr.push_back(kSpace[locked + j]);
1091 
1092  // multiBLAS axpy
1093  profile.TPSTART(QUDA_PROFILE_COMPUTE);
1094  blas::axpy(rot_array, vecs_ptr, kSpace_ptr);
1095  profile.TPSTOP(QUDA_PROFILE_COMPUTE);
1096 
1097  // Copy compressed Krylov
1098  for (int i = 0; i < keep; i++) std::swap(kSpace[locked + i], kSpace[offset + i]);
1099 
1100  } else {
1101 
1102  int batch_size = batched_rotate;
1103  int full_batches = keep / batch_size;
1104  int batch_size_r = keep % batch_size;
1105  bool do_batch_remainder = (batch_size_r != 0 ? true : false);
1106 
1107  if ((int)kSpace.size() < offset + batch_size) {
1108  ColorSpinorParam csParamClone(*kSpace[0]);
1109  csParamClone.create = QUDA_ZERO_FIELD_CREATE;
1110  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Resizing kSpace to %d vectors\n", offset + batch_size);
1111  kSpace.reserve(offset + batch_size);
1112  for (int i = kSpace.size(); i < offset + batch_size; i++) {
1113  kSpace.push_back(ColorSpinorField::Create(csParamClone));
1114  }
1115  }
1116 
1117  profile.TPSTART(QUDA_PROFILE_EIGEN);
1118  MatrixXd mat = MatrixXd::Zero(dim, keep);
1119  for (int j = 0; j < keep; j++)
1120  for (int i = 0; i < dim; i++) mat(i, j) = rot_array[i * keep + j];
1121 
1122  FullPivLU<MatrixXd> matLU(mat);
1123 
1124  // Extract the upper triangular matrix
1125  MatrixXd matUpper = MatrixXd::Zero(keep, keep);
1126  matUpper = matLU.matrixLU().triangularView<Eigen::Upper>();
1127  matUpper.conservativeResize(keep, keep);
1128 
1129  // Extract the lower triangular matrix
1130  MatrixXd matLower = MatrixXd::Identity(dim, dim);
1131  matLower.block(0, 0, dim, keep).triangularView<Eigen::StrictlyLower>() = matLU.matrixLU();
1132  matLower.conservativeResize(dim, keep);
1133 
1134  // Extract the desired permutation matrices
1135  MatrixXi matP = MatrixXi::Zero(dim, dim);
1136  MatrixXi matQ = MatrixXi::Zero(keep, keep);
1137  matP = matLU.permutationP().inverse();
1138  matQ = matLU.permutationQ().inverse();
1139  profile.TPSTOP(QUDA_PROFILE_EIGEN);
1140 
1141  profile.TPSTART(QUDA_PROFILE_COMPUTE);
1142  // Compute V * A = V * PLUQ
1143 
1144  // Do P Permute
1145  //---------------------------------------------------------------------------
1146  permuteVecs(kSpace, matP.data(), dim);
1147 
1148  // Do L Multiply
1149  //---------------------------------------------------------------------------
1150  // Loop over full batches
1151  for (int b = 0; b < full_batches; b++) {
1152 
1153  // batch triangle
1154  blockRotate(kSpace, matLower.data(), dim, {b * batch_size, (b + 1) * batch_size},
1155  {b * batch_size, (b + 1) * batch_size}, LOWER_TRI);
1156  // batch pencil
1157  blockRotate(kSpace, matLower.data(), dim, {(b + 1) * batch_size, dim}, {b * batch_size, (b + 1) * batch_size},
1158  PENCIL);
1159  blockReset(kSpace, b * batch_size, (b + 1) * batch_size, offset);
1160  }
1161  if (do_batch_remainder) {
1162  // remainder triangle
1163  blockRotate(kSpace, matLower.data(), dim, {full_batches * batch_size, keep}, {full_batches * batch_size, keep},
1164  LOWER_TRI);
1165  // remainder pencil
1166  if (keep < dim) {
1167  blockRotate(kSpace, matLower.data(), dim, {keep, dim}, {full_batches * batch_size, keep}, PENCIL);
1168  }
1169  blockReset(kSpace, full_batches * batch_size, keep, offset);
1170  }
1171 
1172  // Do U Multiply
1173  //---------------------------------------------------------------------------
1174  if (do_batch_remainder) {
1175  // remainder triangle
1176  blockRotate(kSpace, matUpper.data(), keep, {full_batches * batch_size, keep}, {full_batches * batch_size, keep},
1177  UPPER_TRI);
1178  // remainder pencil
1179  blockRotate(kSpace, matUpper.data(), keep, {0, full_batches * batch_size}, {full_batches * batch_size, keep},
1180  PENCIL);
1181  blockReset(kSpace, full_batches * batch_size, keep, offset);
1182  }
1183 
1184  // Loop over full batches
1185  for (int b = full_batches - 1; b >= 0; b--) {
1186  // batch triangle
1187  blockRotate(kSpace, matUpper.data(), keep, {b * batch_size, (b + 1) * batch_size},
1188  {b * batch_size, (b + 1) * batch_size}, UPPER_TRI);
1189  if (b > 0) {
1190  // batch pencil
1191  blockRotate(kSpace, matUpper.data(), keep, {0, b * batch_size}, {b * batch_size, (b + 1) * batch_size}, PENCIL);
1192  }
1193  blockReset(kSpace, b * batch_size, (b + 1) * batch_size, offset);
1194  }
1195 
1196  // Do Q Permute
1197  //---------------------------------------------------------------------------
1198  permuteVecs(kSpace, matQ.data(), keep);
1199  profile.TPSTOP(QUDA_PROFILE_COMPUTE);
1200  }
1201  }
1202 
1204  {
1205  if (tmp1) delete tmp1;
1206  if (tmp2) delete tmp2;
1207  }
1208 } // namespace quda
Block Thick Restarted Lanczos Method.
static ColorSpinorField * Create(const ColorSpinorParam &param)
virtual void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)=0
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
Apply M for the dirac op. E.g. the Schur Complement operator.
const Dirac * Expose() const
Definition: dirac_quda.h:1964
virtual bool hermitian() const
Definition: dirac_quda.h:1962
unsigned long long flops() const
Definition: dirac_quda.h:1909
QudaMatPCType getMatPCType() const
Definition: dirac_quda.h:1911
void blockOrthogonalize(std::vector< ColorSpinorField * > v, std::vector< ColorSpinorField * > &r, int j)
Orthogonalise input vectors r against vector space v using block-BLAS.
void blockReset(std::vector< ColorSpinorField * > &kSpace, int js, int je, int offset)
Copy temp part of kSpace, zero out for next use.
void matVec(const DiracMatrix &mat, ColorSpinorField &out, const ColorSpinorField &in)
Applies the specified matVec operation: M, Mdag, MMdag, MdagM.
void blockRotate(std::vector< ColorSpinorField * > &kSpace, double *array, int rank, const range &i, const range &j, blockType b_type)
Rotate part of kSpace.
bool orthoCheck(std::vector< ColorSpinorField * > v, int j)
Check orthonormality of input vector space v.
void rotateVecs(std::vector< ColorSpinorField * > &kSpace, const double *rot_array, const int offset, const int dim, const int keep, const int locked, TimeProfile &profile)
Rotate the Krylov space.
void deflate(std::vector< ColorSpinorField * > &sol, const std::vector< ColorSpinorField * > &src, const std::vector< ColorSpinorField * > &evecs, const std::vector< Complex > &evals, bool accumulate=false) const
Deflate a set of source vectors with a given eigenspace.
void prepareInitialGuess(std::vector< ColorSpinorField * > &kSpace)
Check for an initial guess. If none present, populate with rands, then orthonormalise.
TimeProfile & profile
void deflateSVD(std::vector< ColorSpinorField * > &sol, const std::vector< ColorSpinorField * > &vec, const std::vector< ColorSpinorField * > &evecs, const std::vector< Complex > &evals, bool accumulate=false) const
Deflate a set of source vectors with a set of left and right singular vectors.
void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector< Complex > &x, std::vector< Complex > &y)
Sort array the first n elements of x according to spec_type, y comes along for the ride.
EigenSolver(const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile)
Constructor for base Eigensolver class.
void orthonormalizeMGS(std::vector< ColorSpinorField * > &v, int j)
Orthonormalise input vector space v using Modified Gram-Schmidt.
const DiracMatrix & mat
virtual bool hermitian()=0
std::vector< ColorSpinorField * > r
void checkChebyOpMax(const DiracMatrix &mat, std::vector< ColorSpinorField * > &kSpace)
Check for a maximum of the Chebyshev operator.
double setEpsilon(const QudaPrecision prec)
Set the epsilon parameter.
QudaPrecision save_prec
void computeEvals(const DiracMatrix &mat, std::vector< ColorSpinorField * > &evecs, std::vector< Complex > &evals, int size)
Compute eigenvalues and their residiua.
void computeSVD(const DiracMatrix &mat, std::vector< ColorSpinorField * > &evecs, std::vector< Complex > &evals)
Computes Left/Right SVD from pre computed Right/Left.
ColorSpinorField * tmp2
void rotateVecsComplex(std::vector< ColorSpinorField * > &kSpace, const Complex *rot_array, const int offset, const int dim, const int keep, const int locked, TimeProfile &profile)
Rotate the Krylov space.
void permuteVecs(std::vector< ColorSpinorField * > &kSpace, int *mat, int size)
Permute the vector space using the permutation matrix.
double estimateChebyOpMax(const DiracMatrix &mat, ColorSpinorField &out, ColorSpinorField &in)
Estimate the spectral radius of the operator for the max value of the Chebyshev polynomial.
QudaEigParam * eig_param
void queryPrec(const QudaPrecision prec)
Query the eigensolver precision to stdout.
static EigenSolver * create(QudaEigParam *eig_param, const DiracMatrix &mat, TimeProfile &profile)
Creates the eigensolver using the parameters given and the matrix.
void prepareKrylovSpace(std::vector< ColorSpinorField * > &kSpace, std::vector< Complex > &evals)
Extend the Krylov space.
void loadFromFile(const DiracMatrix &mat, std::vector< ColorSpinorField * > &eig_vecs, std::vector< Complex > &evals)
Load and check eigenpairs from file.
void cleanUpEigensolver(std::vector< ColorSpinorField * > &kSpace, std::vector< Complex > &evals)
Release memory, save eigenvectors, resize the Krylov space to its original dimension.
void blockRotateComplex(std::vector< ColorSpinorField * > &kSpace, Complex *array, int rank, const range &i, const range &j, blockType b_type, int offset)
Rotate part of kSpace.
ColorSpinorField * tmp1
void printEigensolverSetup()
Dump the eigensolver parameters to stdout.
void chebyOp(const DiracMatrix &mat, ColorSpinorField &out, const ColorSpinorField &in)
Promoted the specified matVec operation: M, Mdag, MMdag, MdagM to a Chebyshev polynomial.
std::vector< double > residua
Implicitly Restarted Arnoldi Method.
QudaFieldLocation Location() const
Class declaration to initialize and hold CURAND RNG states.
Definition: random_quda.h:23
void Release()
void Init()
Thick Restarted Lanczos Method.
bool isRunning(QudaProfileType idx)
Definition: timer.h:260
VectorIO is a simple wrapper class for loading and saving sets of vector fields using QIO.
Definition: vector_io.h:13
void load(std::vector< ColorSpinorField * > &vecs)
Load vectors from filename.
Definition: vector_io.cpp:20
void save(const std::vector< ColorSpinorField * > &vecs)
Save vectors to filename.
Definition: vector_io.cpp:119
std::array< int, 4 > dim
double epsilon
QudaPrecision prec
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
enum QudaPrecision_s QudaPrecision
@ QUDA_NOISE_UNIFORM
Definition: enum_quda.h:385
@ QUDA_RANDOM_SOURCE
Definition: enum_quda.h:376
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_DEBUG_VERBOSE
Definition: enum_quda.h:268
@ QUDA_SUMMARIZE
Definition: enum_quda.h:266
@ QUDA_VERBOSE
Definition: enum_quda.h:267
@ QUDA_BOOLEAN_TRUE
Definition: enum_quda.h:461
enum QudaEigSpectrumType_s QudaEigSpectrumType
@ QUDA_EIG_BLK_IR_ARNOLDI
Definition: enum_quda.h:140
@ QUDA_EIG_IR_ARNOLDI
Definition: enum_quda.h:139
@ QUDA_EIG_BLK_TR_LANCZOS
Definition: enum_quda.h:138
@ QUDA_EIG_TR_LANCZOS
Definition: enum_quda.h:137
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_QUARTER_PRECISION
Definition: enum_quda.h:62
@ QUDA_HALF_PRECISION
Definition: enum_quda.h:63
@ QUDA_SPECTRUM_LM_EIG
Definition: enum_quda.h:147
@ QUDA_SPECTRUM_SM_EIG
Definition: enum_quda.h:148
@ QUDA_SPECTRUM_LR_EIG
Definition: enum_quda.h:149
@ QUDA_SPECTRUM_SR_EIG
Definition: enum_quda.h:150
@ QUDA_SPECTRUM_SI_EIG
Definition: enum_quda.h:152
@ QUDA_SPECTRUM_LI_EIG
Definition: enum_quda.h:151
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
enum QudaParity_s QudaParity
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define host_free(ptr)
Definition: malloc_quda.h:115
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
void caxpbypczw(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y, const Complex &c, ColorSpinorField &z, ColorSpinorField &w)
void hDotProduct(Complex *result, std::vector< ColorSpinorField * > &a, std::vector< ColorSpinorField * > &b)
Computes the matrix of inner products between the vector set a and the vector set b....
void ax(double a, ColorSpinorField &x)
void axpy_L(const double *a, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &y)
Compute the block "axpy_L" with over the set of ColorSpinorFields. E.g., it computes.
void zero(ColorSpinorField &a)
double norm2(const ColorSpinorField &a)
void caxpy_U(const Complex *a, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &y)
Compute the block "caxpy_U" with over the set of ColorSpinorFields. E.g., it computes.
void caxpy_L(const Complex *a, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &y)
Compute the block "caxpy_L" with over the set of ColorSpinorFields. E.g., it computes.
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
void axpy_U(const double *a, std::vector< ColorSpinorField * > &x, std::vector< ColorSpinorField * > &y)
Compute the block "axpy_U" with over the set of ColorSpinorFields. E.g., it computes.
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.h:43
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: blas_quda.h:24
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void saveTuneCache(bool error=false)
Definition: tune.cpp:439
std::complex< double > Complex
Definition: quda_internal.h:86
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:120
@ QUDA_PROFILE_INIT
Definition: timer.h:106
@ QUDA_PROFILE_COMPUTE
Definition: timer.h:108
@ QUDA_PROFILE_EIGENLU
Definition: timer.h:115
@ QUDA_PROFILE_EIGEN
Definition: timer.h:114
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
constexpr QudaParity impliedParityFromMatPC(const QudaMatPCType &matpc_type)
Helper function for getting the implied spinor parity from a matrix preconditioning type.
void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type)
Generate a random noise spinor. This variant allows the user to manage the RNG state.
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:125
ColorSpinorParam csParam
Definition: pack_test.cpp:25
QudaGaugeParam param
Definition: pack_test.cpp:18
void printQudaEigParam(QudaEigParam *param)
Definition: check_params.h:158
QudaEigSpectrumType spectrum
Definition: quda.h:466
QudaPrecision save_prec
Definition: quda.h:528
QudaEigType eig_type
Definition: quda.h:416
int check_interval
Definition: quda.h:483
int n_ev_deflate
Definition: quda.h:477
QudaBoolean io_parity_inflate
Definition: quda.h:533
int batched_rotate
Definition: quda.h:487
QudaBoolean use_poly_acc
Definition: quda.h:419
char vec_outfile[256]
Definition: quda.h:525
char vec_infile[256]
Definition: quda.h:522
int poly_deg
Definition: quda.h:422
double a_min
Definition: quda.h:425
double tol
Definition: quda.h:479
double a_max
Definition: quda.h:426
int block_size
Definition: quda.h:489
int n_ev
Definition: quda.h:469
int max_restarts
Definition: quda.h:485
int n_kr
Definition: quda.h:471
int n_conv
Definition: quda.h:475
DEVICEHOST void swap(Real &a, Real &b)
Definition: svd_quda.h:134
#define printfQuda(...)
Definition: util_quda.h:114
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define warningQuda(...)
Definition: util_quda.h:132
#define errorQuda(...)
Definition: util_quda.h:120