11 #define MAX(a, b) (a > b) ? a : b; 14 #define MAGMA_17 //default version version of the MAGMA library 35 #define _cU MagmaUpper 37 #define _cR MagmaRight 40 #define _cC MagmaConjTrans 41 #define _cN MagmaNoTrans 43 #define _cNV MagmaNoVec 57 magma_int_t
err = magma_init();
59 if(
err != MAGMA_SUCCESS)
errorQuda(
"\nError: cannot initialize MAGMA library\n");
61 int major, minor, micro;
63 magma_version( &major, &minor, µ);
64 printfQuda(
"\nMAGMA library version: %d.%d\n\n", major, minor);
66 errorQuda(
"\nError: MAGMA library was not compiled, check your compilation options...\n");
75 if(magma_finalize() != MAGMA_SUCCESS)
errorQuda(
"\nError: cannot close MAGMA library\n");
77 errorQuda(
"\nError: MAGMA library was not compiled, check your compilation options...\n");
83 #define FMULS_GETRF(m_, n_) ( ((m_) < (n_)) \ 84 ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) - 1. ) + (n_)) + (2. / 3.) * (m_)) \ 85 : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) - 1. ) + (m_)) + (2. / 3.) * (n_)) ) 86 #define FADDS_GETRF(m_, n_) ( ((m_) < (n_)) \ 87 ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) ) - (n_)) + (1. / 6.) * (m_)) \ 88 : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) ) - (m_)) + (1. / 6.) * (n_)) ) 90 #define FLOPS_ZGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) ) 91 #define FLOPS_CGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) ) 93 #define FMULS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) + 0.5)) ) 94 #define FADDS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) - 1.5)) ) 96 #define FLOPS_ZGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) ) 97 #define FLOPS_CGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) ) 102 printfQuda(
"%s with n=%d and batch=%d\n", __func__,
n, batch);
104 magma_queue_t queue = 0;
111 magma_int_t **dipiv_array =
static_cast<magma_int_t**
>(
device_malloc(batch*
sizeof(magma_int_t*)));
112 magma_int_t *dipiv_tmp =
static_cast<magma_int_t*
>(
device_malloc(batch*
n*
sizeof(magma_int_t)));
113 set_ipointer(dipiv_array, dipiv_tmp, 1, 0, 0,
n, batch, queue);
115 magma_int_t *no_piv_array =
static_cast<magma_int_t*
>(
safe_malloc(batch*
n*
sizeof(magma_int_t)));
117 for (
int i=0;
i<batch;
i++) {
118 for (
int j=0; j<
n; j++) {
119 no_piv_array[
i*
n + j] = j+1;
122 qudaMemcpy(dipiv_tmp, no_piv_array, batch*
n*
sizeof(magma_int_t), cudaMemcpyHostToDevice);
126 magma_int_t *dinfo_array =
static_cast<magma_int_t*
>(
device_malloc(batch*
sizeof(magma_int_t)));
127 magma_int_t *info_array =
static_cast<magma_int_t*
>(
safe_malloc(batch*
sizeof(magma_int_t)));
132 magmaFloatComplex **A_array =
static_cast<magmaFloatComplex**
>(
device_malloc(batch*
sizeof(magmaFloatComplex*)));
133 magmaFloatComplex **Ainv_array =
static_cast<magmaFloatComplex**
>(
device_malloc(batch*
sizeof(magmaFloatComplex*)));
135 cset_pointer(A_array, static_cast<magmaFloatComplex*>(A_d),
n, 0, 0,
n*
n, batch, queue);
136 cset_pointer(Ainv_array, static_cast<magmaFloatComplex*>(Ainv_d),
n, 0, 0,
n*
n, batch, queue);
138 double magma_time = magma_sync_wtime(queue);
140 err = magma_cgetrf_nopiv_batched(
n,
n, A_array,
n, dinfo_array, batch, queue);
141 magma_time = magma_sync_wtime(queue) - magma_time;
142 printfQuda(
"LU factorization completed in %f seconds with GFLOPS = %f\n",
145 if(
err != 0)
errorQuda(
"\nError in LU decomposition (magma_cgetrf), error code = %d\n",
err);
147 qudaMemcpy(info_array, dinfo_array, batch*
sizeof(magma_int_t), cudaMemcpyDeviceToHost);
148 for (
int i=0;
i<batch;
i++) {
149 if (info_array[
i] < 0) {
150 errorQuda(
"%d argument had an illegal value or another error occured, such as memory allocation failed",
i);
151 }
else if (info_array[
i] > 0) {
152 errorQuda(
"%d factorization completed but the factor U is exactly singular",
i);
156 magma_time = magma_sync_wtime(queue);
157 err = magma_cgetri_outofplace_batched(
n, A_array,
n, dipiv_array, Ainv_array,
n, dinfo_array, batch, queue);
158 magma_time = magma_sync_wtime(queue) - magma_time;
159 printfQuda(
"Matrix inversion completed in %f seconds with GFLOPS = %f\n",
162 if(
err != 0)
errorQuda(
"\nError in matrix inversion (magma_cgetri), error code = %d\n",
err);
164 qudaMemcpy(info_array, dinfo_array, batch*
sizeof(magma_int_t), cudaMemcpyDeviceToHost);
166 for (
int i=0;
i<batch;
i++) {
167 if (info_array[
i] < 0) {
168 errorQuda(
"%d argument had an illegal value or another error occured, such as memory allocation failed",
i);
169 }
else if (info_array[
i] > 0) {
170 errorQuda(
"%d factorization completed but the factor U is exactly singular",
i);
176 }
else if (
prec == 8) {
177 magmaDoubleComplex **A_array =
static_cast<magmaDoubleComplex**
>(
device_malloc(batch*
sizeof(magmaDoubleComplex*)));
178 zset_pointer(A_array, static_cast<magmaDoubleComplex*>(A_d),
n, 0, 0,
n*
n, batch, queue);
180 magmaDoubleComplex **Ainv_array =
static_cast<magmaDoubleComplex**
>(
device_malloc(batch*
sizeof(magmaDoubleComplex*)));
181 zset_pointer(Ainv_array, static_cast<magmaDoubleComplex*>(Ainv_d),
n, 0, 0,
n*
n, batch, queue);
183 double magma_time = magma_sync_wtime(queue);
184 err = magma_zgetrf_batched(
n,
n, A_array,
n, dipiv_array, dinfo_array, batch, queue);
185 magma_time = magma_sync_wtime(queue) - magma_time;
186 printfQuda(
"LU factorization completed in %f seconds with GFLOPS = %f\n",
189 if(
err != 0)
errorQuda(
"\nError in LU decomposition (magma_zgetrf), error code = %d\n",
err);
191 qudaMemcpy(info_array, dinfo_array, batch*
sizeof(magma_int_t), cudaMemcpyDeviceToHost);
192 for (
int i=0;
i<batch;
i++) {
193 if (info_array[
i] < 0) {
194 errorQuda(
"%d argument had an illegal value or another error occured, such as memory allocation failed",
i);
195 }
else if (info_array[
i] > 0) {
196 errorQuda(
"%d factorization completed but the factor U is exactly singular",
i);
200 magma_time = magma_sync_wtime(queue);
201 err = magma_zgetri_outofplace_batched(
n, A_array,
n, dipiv_array, Ainv_array,
n, dinfo_array, batch, queue);
202 magma_time = magma_sync_wtime(queue) - magma_time;
203 printfQuda(
"Matrix inversion completed in %f seconds with GFLOPS = %f\n",
206 if(
err != 0)
errorQuda(
"\nError in matrix inversion (magma_cgetri), error code = %d\n",
err);
208 qudaMemcpy(info_array, dinfo_array, batch*
sizeof(magma_int_t), cudaMemcpyDeviceToHost);
210 for (
int i=0;
i<batch;
i++) {
211 if (info_array[
i] < 0) {
212 errorQuda(
"%d argument had an illegal value or another error occured, such as memory allocation failed",
i);
213 }
else if (info_array[
i] > 0) {
214 errorQuda(
"%d factorization completed but the factor U is exactly singular",
i);
221 errorQuda(
"%s not implemented for precision=%d", __func__,
prec);
#define qudaMemcpy(dst, src, count, kind)
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
#define safe_malloc(size)
#define FLOPS_CGETRF(m_, n_)
#define FLOPS_ZGETRF(m_, n_)
#define device_malloc(size)