31 extern void usage(
char** );
33 #if (__COMPUTE_CAPABILITY__ >= 200)
35 #else // exclude Heavy Quark Norm if on Tesla architecture
90 else param.
x[0] =
xdim;
197 cudaEvent_t start,
end;
198 cudaEventCreate(&start);
199 cudaEventCreate(&end);
200 cudaEventRecord(start, 0);
337 errorQuda(
"Undefined blas kernel %d\n", kernel);
341 cudaEventRecord(end, 0);
342 cudaEventSynchronize(end);
344 cudaEventElapsedTime(&runTime, start, end);
345 cudaEventDestroy(start);
346 cudaEventDestroy(end);
348 double secs = runTime / 1000;
352 #define ERROR(a) fabs(norm2(*a##D) - norm2(*a##H)) / norm2(*a##H)
356 double a = M_PI, b = M_PI*
exp(1.0), c =
sqrt(M_PI);
529 error =
ERROR(
y) + fabs(d-h)/fabs(h);}
537 error =
ERROR(
y) + fabs(d-h)/fabs(h);}
545 error =
ERROR(
y) + fabs(d-h)/fabs(h);}
597 error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
605 error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
616 error =
ERROR(z) +
ERROR(
y) + fabs(d.x - h.x) / fabs(h.x) +
617 fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
625 error = fabs(d.x - h.x) / fabs(h.x) +
626 fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
630 errorQuda(
"Undefined blas kernel %d\n", kernel);
637 #if (__COMPUTE_CAPABILITY__ >= 130)
643 const char *
prec_str[] = {
"half",
"single",
"double"};
676 "caxpbypzYmbwcDotProductWYNormY",
677 "HeavyQuarkResidualNorm"
680 int main(
int argc,
char** argv)
682 for (
int i = 1; i < argc; i++){
686 printfQuda(
"ERROR: Invalid option:%s\n", argv[i]);
704 for (
int kernel = 0; kernel <
Nkernels; kernel++) {
706 if ((Nprec < 3) && (kernel == 0))
continue;
720 printfQuda(
"%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n",
names[kernel], gflops, gbytes);
759 int kernel =
param.y;
760 double deviation =
test(kernel);
762 double tol = (prec == 2 ? 1e-12 : (prec == 1 ? 1e-5 : 1e-3));
763 tol = (kernel < 2) ? 1e-4 : tol;
764 EXPECT_LE(deviation, tol) <<
"CPU and CUDA implementations do not agree";
QudaDslashType dslash_type
int dimPartitioned(int dim)
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
void copy(const cpuColorSpinorField &)
void caxpbypzYmbwCpu(const Complex &, const cpuColorSpinorField &, const Complex &, cpuColorSpinorField &, cpuColorSpinorField &, const cpuColorSpinorField &)
void mxpyCpu(const cpuColorSpinorField &x, cpuColorSpinorField &y)
enum QudaPrecision_s QudaPrecision
Complex xpaycDotzyCpu(const cpuColorSpinorField &x, const double &a, cpuColorSpinorField &y, const cpuColorSpinorField &z)
void cabxpyAxCpu(const double &a, const Complex &b, cpuColorSpinorField &x, cpuColorSpinorField &y)
cudaColorSpinorField * hD
__host__ __device__ ValueType exp(ValueType x)
#define EXPECT_LE(val1, val2)
__host__ __device__ ValueType sqrt(ValueType x)
double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
unsigned long long blas_bytes
void xpayCpu(const cpuColorSpinorField &x, const double &a, cpuColorSpinorField &y)
std::complex< double > Complex
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y)
int process_command_line_option(int argc, char **argv, int *idx)
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
cudaColorSpinorField * xD
void xpyCpu(const cpuColorSpinorField &x, cpuColorSpinorField &y)
double cabxpyAxNormCuda(const double &a, const Complex &b, cudaColorSpinorField &x, cudaColorSpinorField &y)
double3 cDotProductNormBCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
QudaSiteSubset siteSubset
void axpyZpbxCpu(const double &a, cpuColorSpinorField &x, cpuColorSpinorField &y, const cpuColorSpinorField &z, const double &b)
void cabxpyAxCuda(const double &a, const Complex &b, cudaColorSpinorField &x, cudaColorSpinorField &y)
void setTuning(QudaTune tune)
void initQuda(int device)
void axpyBzpcxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, const double &b, cudaColorSpinorField &z, const double &c)
void caxpyXmazCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
double3 caxpbypzYmbwcDotProductUYNormYCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w, cudaColorSpinorField &u)
double benchmark(int kernel, const int niter)
void initFields(int prec)
void caxpbyCpu(const Complex &a, const cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y)
Complex cDotProductCuda(cudaColorSpinorField &, cudaColorSpinorField &)
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
QudaFieldOrder fieldOrder
double3 caxpbypzYmbwcDotProductUYNormYCpu(const Complex &a, const cpuColorSpinorField &x, const Complex &b, cpuColorSpinorField &y, cpuColorSpinorField &z, const cpuColorSpinorField &w, const cpuColorSpinorField &u)
double caxpyXmazNormXCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
Complex cDotProductCpu(const cpuColorSpinorField &, const cpuColorSpinorField &)
void setSpinorSiteSize(int n)
GTEST_API_ void InitGoogleTest(int *argc, char **argv)
QudaGammaBasis gammaBasis
cudaColorSpinorField * wD
void cxpaypbzCpu(const cpuColorSpinorField &x, const Complex &b, const cpuColorSpinorField &y, const Complex &c, cpuColorSpinorField &z)
void caxpbypzCpu(const Complex &, cpuColorSpinorField &, const Complex &, cpuColorSpinorField &, cpuColorSpinorField &)
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
double normCuda(const cudaColorSpinorField &b)
void caxpyCpu(const Complex &a, const cpuColorSpinorField &x, cpuColorSpinorField &y)
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Complex caxpyDotzyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
void axpbyCpu(const double &a, const cpuColorSpinorField &x, const double &b, cpuColorSpinorField &y)
void axpyBzpcxCpu(const double &a, cpuColorSpinorField &x, cpuColorSpinorField &y, const double &b, const cpuColorSpinorField &z, const double &c)
void caxpbypczpwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y, cudaColorSpinorField &z)
void axCpu(const double &a, cpuColorSpinorField &x)
unsigned long long blas_flops
cudaColorSpinorField * vD
double cabxpyAxNormCpu(const double &a, const Complex &b, cpuColorSpinorField &x, cpuColorSpinorField &y)
void caxpyXmazCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y, cpuColorSpinorField &z)
cudaColorSpinorField * zD
void caxpbypczpwCpu(const Complex &, cpuColorSpinorField &, const Complex &, cpuColorSpinorField &, const Complex &, cpuColorSpinorField &, cpuColorSpinorField &)
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
double caxpyNormCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y)
void cxpaypbzCuda(cudaColorSpinorField &, const Complex &b, cudaColorSpinorField &y, const Complex &c, cudaColorSpinorField &z)
virtual void NormalExit()
void setPrec(ColorSpinorParam ¶m, const QudaPrecision precision)
cudaColorSpinorField * lD
void caxpbypzCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
double normCpu(const cpuColorSpinorField &b)
double axpyNormCpu(const double &a, const cpuColorSpinorField &x, cpuColorSpinorField &y)
double3 cDotProductNormACuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_
enum QudaDslashType_s QudaDslashType
double reDotProductCpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
double3 cDotProductNormACpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
double caxpyXmazNormXCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y, cpuColorSpinorField &z)
__host__ __device__ ValueType abs(ValueType x)
void caxpbyCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y)
double3 HeavyQuarkResidualNormCpu(cpuColorSpinorField &x, cpuColorSpinorField &r)
cudaColorSpinorField * yD
double caxpyNormCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
double xmyNormCpu(const cpuColorSpinorField &a, cpuColorSpinorField &b)
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
double3 HeavyQuarkResidualNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &r)
INSTANTIATE_TEST_CASE_P(copyHS_half, BlasTest,::testing::Values(make_int2(0, 0)))
Complex caxpyDotzyCpu(const Complex &a, cpuColorSpinorField &x, cpuColorSpinorField &y, cpuColorSpinorField &z)
double3 cDotProductNormBCpu(const cpuColorSpinorField &a, const cpuColorSpinorField &b)
void axCuda(const double &a, cudaColorSpinorField &x)
double norm2(const ColorSpinorField &)
void axpyCpu(const double &a, const cpuColorSpinorField &x, cpuColorSpinorField &y)
double xmyNormCuda(cudaColorSpinorField &a, cudaColorSpinorField &b)
void initComms(int argc, char **argv, const int *commDims)
int gridsize_from_cmdline[]
void setVerbosity(const QudaVerbosity verbosity)
void caxpbypzYmbwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &, cudaColorSpinorField &)
int main(int argc, char **argv)
internal::ValueArray1< T1 > Values(T1 v1)