4 #define FMULS_GETRF(m_, n_) ( ((m_) < (n_)) \ 5 ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) - 1. ) + (n_)) + (2. / 3.) * (m_)) \ 6 : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) - 1. ) + (m_)) + (2. / 3.) * (n_)) ) 7 #define FADDS_GETRF(m_, n_) ( ((m_) < (n_)) \ 8 ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) ) - (n_)) + (1. / 6.) * (m_)) \ 9 : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) ) - (m_)) + (1. / 6.) * (n_)) ) 11 #define FLOPS_ZGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) ) 12 #define FLOPS_CGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) ) 14 #define FMULS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) + 0.5)) ) 15 #define FADDS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) - 1.5)) ) 17 #define FLOPS_ZGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) ) 18 #define FLOPS_CGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) ) 26 __global__
void set_pointer(T **output_array_a, T *input_a, T **output_array_b, T *input_b,
int batch_offset)
28 output_array_a[blockIdx.x] = input_a + blockIdx.x * batch_offset;
29 output_array_b[blockIdx.x] = input_b + blockIdx.x * batch_offset;
38 gettimeofday(&
start, NULL);
51 cublasStatus_t error = cublasCreate(&
handle);
52 if (error != CUBLAS_STATUS_SUCCESS)
errorQuda(
"cublasCreate failed");
55 typedef cuFloatComplex C;
59 set_pointer<C><<<batch,1>>>(A_array, (C*)A_d, Ainv_array, (C*)Ainv_d,
n*
n);
61 error = cublasCgetrfBatched(
handle,
n, A_array,
n, dipiv, dinfo_array, batch);
64 if (error != CUBLAS_STATUS_SUCCESS)
65 errorQuda(
"\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);
67 qudaMemcpy(info_array, dinfo_array, batch*
sizeof(
int), cudaMemcpyDeviceToHost);
68 for (
int i=0;
i<batch;
i++) {
69 if (info_array[
i] < 0) {
70 errorQuda(
"%d argument had an illegal value or another error occured, such as memory allocation failed",
i);
71 }
else if (info_array[
i] > 0) {
72 warningQuda(
"%d factorization completed but the factor U is exactly singular",
i);
76 error = cublasCgetriBatched(
handle,
n, (
const C**)A_array,
n, dipiv,
77 Ainv_array,
n, dinfo_array, batch);
80 if (error != CUBLAS_STATUS_SUCCESS)
81 errorQuda(
"\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);
83 qudaMemcpy(info_array, dinfo_array, batch*
sizeof(
int), cudaMemcpyDeviceToHost);
85 for (
int i=0;
i<batch;
i++) {
86 if (info_array[
i] < 0) {
87 errorQuda(
"%d argument had an illegal value or another error occured, such as memory allocation failed",
i);
88 }
else if (info_array[
i] > 0) {
89 errorQuda(
"%d factorization completed but the factor U is exactly singular",
i);
99 errorQuda(
"%s not implemented for precision=%d", __func__,
prec);
102 error = cublasDestroy(
handle);
103 if (error != CUBLAS_STATUS_SUCCESS)
104 errorQuda(
"\nError indestroying cublas context, error code = %d\n", error);
117 gettimeofday(&stop, NULL);
120 double time = ds + 0.000001*dus;
122 printfQuda(
"Batched matrix inversion completed in %f seconds with GFLOPS = %f\n",
#define qudaMemcpy(dst, src, count, kind)
#define pool_pinned_free(ptr)
enum QudaPrecision_s QudaPrecision
__global__ void set_pointer(T **output_array_a, T *input_a, T **output_array_b, T *input_b, int batch_offset)
unsigned int unsigned long long handle
__darwin_suseconds_t tv_usec
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
#define pool_device_malloc(size)
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define FLOPS_CGETRF(m_, n_)
#define pool_pinned_malloc(size)
enum QudaFieldLocation_s QudaFieldLocation
#define pool_device_free(ptr)