14 #include <gtest/gtest.h> 34 extern void usage(
char** );
41 ColorSpinorField *
xD, *
yD, *
zD, *
wD, *
vD, *
hD, *
mD, *
lD, *
xmD, *
ymD, *
zmD;
42 std::vector<cpuColorSpinorField*>
xmH;
43 std::vector<cpuColorSpinorField*>
ymH;
44 std::vector<cpuColorSpinorField*>
zmH;
61 printfQuda(
"S_dimension T_dimension Nspin Ncolor\n");
75 if ((QUDA_PRECISION &
getPrecision(precision)) == 0)
return true;
84 if (
Nspin == 2 && ( precision == 0 || precision ==1 ) ) {
87 }
else if (
Nspin == 2 && (kernel == 1 || kernel == 2)) {
90 }
else if (
Ncolor != 3 && (kernel == 31 || kernel == 32)) {
93 }
else if ((
Nprec < 4) && (kernel == 0)) {
124 else param.
x[0] =
xdim;
168 for(
int i=0; i<
Nsrc; i++){
171 for(
int i=0; i<
Msrc; i++){
206 errorQuda(
"Precision option not defined");
247 bool flag = !(param.
nSpin == 2 &&
294 for (
int i=0; i <
Nsrc; i++)
delete xmH[i];
295 for (
int i=0; i <
Msrc; i++)
delete ymH[i];
296 for (
int i=0; i <
Nsrc; i++)
delete zmH[i];
305 double a = 1.0, b = 2.0, c = 3.0;
312 cudaEvent_t start,
end;
313 cudaEventCreate(&start);
314 cudaEventCreate(&end);
315 cudaEventRecord(start, 0);
484 errorQuda(
"Undefined blas kernel %d\n", kernel);
488 cudaEventRecord(end, 0);
489 cudaEventSynchronize(end);
491 cudaEventElapsedTime(&runTime, start, end);
492 cudaEventDestroy(start);
493 cudaEventDestroy(end);
498 double secs = runTime / 1000;
502 #define ERROR(a) fabs(blas::norm2(*a##D) - blas::norm2(*a##H)) / blas::norm2(*a##H) 506 double a = M_PI, b = M_PI*
exp(1.0), c =
sqrt(M_PI);
515 A[i] = a2* (1.0*((i/
Nsrc) + i)) + b2 * (1.0*i) + c2 *(1.0*(
Nsrc*Msrc/2-i));
516 B[i] = a2* (1.0*((i/
Nsrc) + i)) - b2 * (M_PI*i) + c2 *(1.0*(
Nsrc*Msrc/2-i));
517 C[i] = a2* (1.0*((M_PI/
Nsrc) + i)) + b2 * (1.0*i) + c2 *(1.0*(
Nsrc*Msrc/2-i));
520 A2[i] = a2* (1.0*((i/
Nsrc) + i)) + b2 * (1.0*i) + c2 *(1.0*(Nsrc*Nsrc/2-i));
521 B2[i] = a2* (1.0*((i/
Nsrc) + i)) - b2 * (M_PI*i) + c2 *(1.0*(Nsrc*Nsrc/2-i));
686 error =
ERROR(y) + fabs(d-h)/fabs(h);}
694 error =
ERROR(y) + fabs(d-h)/fabs(h);}
702 error =
ERROR(y) + fabs(d-h)/fabs(h);}
711 error =
ERROR(y) +
ERROR(x) + fabs(d-h)/fabs(h);}
719 error =
ERROR(x) +
ERROR(y) + fabs(d-h)/fabs(h);}
742 error =
abs(
Complex(d.x - h.x, d.y - h.y)) /
abs(
Complex(h.x, h.y)) + fabs(d.z - h.z) / fabs(h.z);
751 error =
abs(
Complex(d.x - h.x, d.y - h.y)) /
abs(
Complex(h.x, h.y)) + fabs(d.z - h.z) / fabs(h.z);
764 + fabs(d.z - h.z) / fabs(h.z);
773 error = fabs(d.x - h.x) / fabs(h.x) +
774 fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
783 error =
ERROR(y) + fabs(d.x - h.x) / fabs(h.x) +
784 fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
793 error = fabs(d.x - h.x) / fabs(h.x) +
794 fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z); }
812 error =
ERROR(y) + fabs(d-h)/fabs(h); }
820 for (
int i=0; i <
Nsrc; i++){
821 for(
int j=0; j <
Msrc; j++){
826 for (
int i=0; i <
Msrc; i++){
833 for (
int i=0; i <
Nsrc; i++) {
841 for (
int i=0; i<
Nsrc; i++) {
842 blas::axpyBzpcx(((
double*)A)[i], *xmH[i], *zmH[i], ((
double*)B)[i], *yH, ((
double*)C)[i]);
846 for (
int i=0; i <
Nsrc; i++){
875 for (
int i = 0; i <
Nsrc; i++) {
876 for (
int j = 0; j <
Nsrc; j++) {
889 for (
int i = 0; i <
Nsrc; i++) {
890 for (
int j = 0; j <
Msrc; j++) {
899 errorQuda(
"Undefined blas kernel %d\n", kernel);
909 const char *
prec_str[] = {
"quarter",
"half",
"single",
"double"};
943 "caxpbypzYmbwcDotProductUYNormY",
944 "HeavyQuarkResidualNorm",
945 "xpyHeavyQuarkResidualNorm",
953 "cDotProductNorm_block",
957 int main(
int argc,
char** argv)
960 ::testing::InitGoogleTest(&argc, argv);
966 for (
int i = 1; i < argc; i++){
970 printfQuda(
"ERROR: Invalid option:%s\n", argv[i]);
997 ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
998 if (
comm_rank() != 0) {
delete listeners.Release(listeners.default_result_printer()); }
999 result = RUN_ALL_TESTS();
1009 using ::testing::TestWithParam;
1010 using ::testing::Bool;
1011 using ::testing::Values;
1012 using ::testing::Range;
1013 using ::testing::Combine;
1015 class BlasTest :
public ::testing::TestWithParam<::testing::tuple<int, int>> {
1031 int prec = ::testing::get<0>(GetParam());
1032 int kernel = ::testing::get<1>(GetParam());
1037 double deviation =
test(kernel);
1039 double tol = (prec == 3 ? 1e-12 : (prec == 2 ? 1e-6 : (prec == 1 ? 1e-4 : 1e-2)));
1040 tol = (kernel < 4) ? 5e-2 : tol;
1041 EXPECT_LE(deviation, tol) <<
"CPU and CUDA implementations do not agree";
1045 int prec = ::testing::get<0>(GetParam());
1046 int kernel = ::testing::get<1>(GetParam());
1060 RecordProperty(
"Gflops", std::to_string(gflops));
1061 RecordProperty(
"GBs", std::to_string(gbytes));
1062 printfQuda(
"%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n",
names[kernel], gflops, gbytes);
1066 int prec = ::testing::get<0>(
param.param);
1067 int kernel = ::testing::get<1>(
param.param);
1068 std::string str(
names[kernel]);
1069 str += std::string(
"_");
1070 str += std::string(
prec_str[prec]);
QudaDslashType dslash_type
int dimPartitioned(int dim)
void ax(double a, ColorSpinorField &x)
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
enum QudaPrecision_s QudaPrecision
double3 cDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
void axpyZpbx(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, double b)
__host__ __device__ ValueType exp(ValueType x)
double norm2(const ColorSpinorField &a)
QudaInverterType inv_type
enum QudaSolveType_s QudaSolveType
__host__ __device__ ValueType sqrt(ValueType x)
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
int process_command_line_option(int argc, char **argv, int *idx)
::testing::tuple< int, int > param
void cabxpyAx(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y)
double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &r)
std::vector< cpuColorSpinorField * > ymH
CompositeColorSpinorField & Components()
bool skip_kernel(int precision, int kernel)
std::vector< cpuColorSpinorField * > xmH
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
ColorSpinorField & Component(const int idx) const
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
void caxpyBzpx(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
QudaSiteSubset siteSubset
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
bool is_composite
for deflation solvers:
void initQuda(int device)
QudaPrecision getPrecision(int i)
double benchmark(int kernel, const int niter)
void initFields(int prec)
QudaFieldOrder fieldOrder
void setSpinorSiteSize(int n)
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
int nvec[QUDA_MAX_MG_LEVEL]
void caxpbypzYmbw(const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, ColorSpinorField &, ColorSpinorField &)
QudaGammaBasis gammaBasis
void axpyBzpcx(double a, ColorSpinorField &x, ColorSpinorField &y, double b, ColorSpinorField &z, double c)
double cabxpyzAxNorm(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
std::complex< double > Complex
double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
void tripleCGUpdate(double alpha, double beta, ColorSpinorField &q, ColorSpinorField &r, ColorSpinorField &x, ColorSpinorField &p)
double axpyReDot(double a, ColorSpinorField &x, ColorSpinorField &y)
Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &u)
void caxpyXmaz(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
#define QUDA_MAX_MG_LEVEL
Maximum number of multi-grid levels. This number may be increased if needed.
void setPrec(ColorSpinorParam ¶m, const QudaPrecision precision)
void xpy(ColorSpinorField &x, ColorSpinorField &y)
std::string getblasname(testing::TestParamInfo<::testing::tuple< int, int >> param)
void axpby(double a, ColorSpinorField &x, double b, ColorSpinorField &y)
enum QudaDslashType_s QudaDslashType
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
double axpyNorm(double a, ColorSpinorField &x, ColorSpinorField &y)
void mxpy(ColorSpinorField &x, ColorSpinorField &y)
__host__ __device__ ValueType abs(ValueType x)
enum QudaVerbosity_s QudaVerbosity
void cxpaypbz(ColorSpinorField &, const Complex &b, ColorSpinorField &y, const Complex &c, ColorSpinorField &z)
INSTANTIATE_TEST_SUITE_P(QUDA, BlasTest, Combine(Range(0, 4), Range(0, Nkernels)), getblasname)
std::vector< cpuColorSpinorField * > zmH
void initComms(int argc, char **argv, int *const commDims)
double3 cDotProductNormB(ColorSpinorField &a, ColorSpinorField &b)
Return (a,b) and ||b||^2 - implemented using cDotProductNormA.
int gridsize_from_cmdline[]
void setVerbosity(QudaVerbosity verbosity)
enum QudaInverterType_s QudaInverterType
int main(int argc, char **argv)
double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)