47 std::vector<cpuColorSpinorField *>
xmH;
48 std::vector<cpuColorSpinorField *>
ymH;
49 std::vector<cpuColorSpinorField *>
zmH;
58 printfQuda(
"S_dimension T_dimension Nspin Ncolor\n");
175 auto &this_prec = pair.first;
176 auto &other_prec = pair.second;
178 if ((QUDA_PRECISION & this_prec) == 0)
return true;
179 if ((QUDA_PRECISION & other_prec) == 0)
return true;
272 param.setPrecision(prec_other, prec_other,
true);
280 param.is_composite =
true;
281 param.is_component =
false;
293 param.setPrecision(prec_other, prec_other,
true);
341 for (
int i = 0; i <
Nsrc; i++)
delete xmH[i];
342 for (
int i = 0; i <
Msrc; i++)
delete ymH[i];
343 for (
int i = 0; i <
Nsrc; i++)
delete zmH[i];
351 double a = 1.0, b = 2.0, c = 3.0;
357 double *Ar =
new double[
Nsrc *
Msrc];
360 cudaEventCreate(&
start);
361 cudaEventCreate(&
end);
362 cudaEventRecord(
start, 0);
496 for (
int i = 0; i <
niter; ++i)
526 cudaEventRecord(
end, 0);
527 cudaEventSynchronize(
end);
529 cudaEventElapsedTime(&runTime,
start,
end);
530 cudaEventDestroy(
start);
531 cudaEventDestroy(
end);
537 double secs = runTime / 1000;
541 #define ERROR(a) fabs(blas::norm2(*a##D) - blas::norm2(*a##H)) / blas::norm2(*a##H)
545 double a = M_PI, b = M_PI *
exp(1.0), c =
sqrt(M_PI);
553 double *Ar =
new double[
Nsrc *
Msrc];
555 for (
int i = 0; i <
Nsrc *
Msrc; i++) {
556 A[i] = a2 * (1.0 * ((i / (double)
Nsrc) + i)) + b2 * (1.0 * i) + c2 * (1.0 * (0.5 *
Nsrc *
Msrc - i));
557 B[i] = a2 * (1.0 * ((i / (double)
Nsrc) + i)) - b2 * (M_PI * i) + c2 * (1.0 * (0.5 *
Nsrc *
Msrc - i));
558 C[i] = a2 * (1.0 * ((M_PI / (double)
Nsrc) + i)) + b2 * (1.0 * i) + c2 * (1.0 * (0.5 *
Nsrc *
Msrc - i));
561 for (
int i = 0; i <
Nsrc *
Nsrc; i++) {
562 A2[i] = a2 * (1.0 * ((i / (double)
Nsrc) + i)) + b2 * (1.0 * i) + c2 * (1.0 * (0.5 *
Nsrc *
Nsrc - i));
563 B2[i] = a2 * (1.0 * ((i / (double)
Nsrc) + i)) - b2 * (M_PI * i) + c2 * (1.0 * (0.5 *
Nsrc *
Nsrc - i));
690 error =
ERROR(z) + fabs(d - h) / fabs(h);
700 error =
ERROR(yo) + fabs(d.real() - h.real()) / fabs(h.real()) + fabs(d.imag() - h.imag()) / fabs(h.imag());
710 error =
ERROR(y) + fabs(d - h) / fabs(h);
721 error =
ERROR(y) +
ERROR(x) + fabs(d - h) / fabs(h);
731 error =
ERROR(x) +
ERROR(y) + fabs(d - h) / fabs(h);
758 error =
abs(
Complex(d.x - h.x, d.y - h.y)) /
abs(
Complex(h.x, h.y)) + fabs(d.z - h.z) / fabs(h.z);
772 + fabs(d.z - h.z) / fabs(h.z);
782 error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z);
793 error =
ERROR(y) + fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z);
804 error = fabs(d.x - h.x) / fabs(h.x) + fabs(d.y - h.y) / fabs(h.y) + fabs(d.z - h.z) / fabs(h.z);
826 error =
ERROR(y) + fabs(d - h) / fabs(h);
857 for (
int i = 0; i <
Nsrc; i++) {
862 for (
int i = 0; i <
Msrc; i++) {
873 for (
int i = 0; i <
Nsrc; i++) {
877 for (
int i = 0; i <
Msrc; i++) {
884 for (
int i = 0; i <
Nsrc; i++) {
892 for (
int i = 0; i <
Nsrc; i++) {
897 for (
int i = 0; i <
Nsrc; i++) {
908 for (
int i = 0; i <
Nsrc; i++) {
909 for (
int j = 0; j <
Nsrc; j++) {
911 error +=
std::abs(((
double *)A2)[i *
Nsrc + j] - ((
double *)B2)[i *
Nsrc + j])
924 for (
int i = 0; i <
Nsrc; i++) {
925 for (
int j = 0; j <
Msrc; j++) {
938 for (
int i = 0; i <
Nsrc; i++) {
939 for (
int j = 0; j <
Nsrc; j++) {
953 for (
int i = 0; i <
Nsrc; i++) {
954 for (
int j = 0; j <
Msrc; j++) {
979 auto alpha =
Complex(Ar3.x, Ar3.y) / Ar3.z;
1018 app->parse(argc, argv);
1019 }
catch (
const CLI::ParseError &e) {
1020 return app->exit(e);
1056 using ::testing::TestWithParam;
1073 default:
errorQuda(
"Unexpect precision index %d", idx);
1105 double deviation =
test(kernel);
1115 double tol = std::max(tol_x, tol_y);
1117 EXPECT_LE(deviation,
tol) <<
"CPU and CUDA implementations do not agree";
1118 EXPECT_EQ(
false, std::isnan(deviation)) <<
"Nan has propagated into the result";
1139 RecordProperty(
"Gflops", std::to_string(gflops));
1140 RecordProperty(
"GBs", std::to_string(gbytes));
1141 printfQuda(
"%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n",
kernel_map.at(kernel).c_str(), gflops, gbytes);
1147 int kernel = ::testing::get<1>(
param.param);
const std::map< Kernel, std::string > kernel_map
int main(int argc, char **argv)
bool skip_kernel(prec_pair_t pair, Kernel kernel)
prec_pair_t prec_idx_map(int idx)
void setPrec(ColorSpinorParam &param, QudaPrecision precision)
INSTANTIATE_TEST_SUITE_P(QUDA, BlasTest, Combine(Range(0,(Nprec *(Nprec+1))/2), Range(0, Nkernels)), getblasname)
std::vector< cpuColorSpinorField * > ymH
bool is_multi(Kernel kernel)
std::string getblasname(testing::TestParamInfo<::testing::tuple< int, int >> param)
@ xpyHeavyQuarkResidualNorm
@ caxpbypzYmbwcDotProductUYNormY
double benchmark(Kernel kernel, const int niter)
bool is_site_unroll(Kernel kernel)
std::vector< cpuColorSpinorField * > zmH
double test(Kernel kernel)
void initFields(prec_pair_t prec_pair)
std::vector< cpuColorSpinorField * > xmH
bool is_copy(Kernel kernel)
const std::map< QudaPrecision, std::string > prec_map
std::pair< QudaPrecision, QudaPrecision > prec_pair_t
::testing::tuple< int, int > param
const prec_pair_t prec_pair
CompositeColorSpinorField & Components()
ColorSpinorField & Component(const int idx) const
TestEventListener * Release(TestEventListener *listener)
TestEventListener * default_result_printer() const
TestEventListeners & listeners()
static UnitTest * GetInstance()
void commAsyncReductionSet(bool global_reduce)
void commGlobalReductionSet(bool global_reduce)
QudaInverterType inv_type
std::shared_ptr< QUDAApp > make_app(std::string app_description, std::string app_name)
quda::mgarray< int > nvec
QudaDslashType dslash_type
std::array< int, 4 > gridsize_from_cmdline
QudaPrecision prec_sloppy
enum QudaPrecision_s QudaPrecision
@ QUDA_PARITY_SITE_SUBSET
@ QUDA_DEGRAND_ROSSI_GAMMA_BASIS
@ QUDA_EVEN_ODD_SITE_ORDER
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
#define EXPECT_EQ(val1, val2)
int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_
#define EXPECT_LE(val1, val2)
int dimPartitioned(int dim)
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
__host__ __device__ __forceinline__ T & get(array< T, m > &src)
double axpbyzNorm(double a, ColorSpinorField &x, double b, ColorSpinorField &y, ColorSpinorField &z)
double axpyReDot(double a, ColorSpinorField &x, ColorSpinorField &y)
Complex axpyCGNorm(double a, ColorSpinorField &x, ColorSpinorField &y)
Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
void caxpbypzYmbw(const Complex &, ColorSpinorField &, const Complex &, ColorSpinorField &, ColorSpinorField &, ColorSpinorField &)
void axpyZpbx(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, double b)
void cabxpyAx(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y)
void caxpby(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y)
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
void caxpyBzpx(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
void axpbyz(double a, ColorSpinorField &x, double b, ColorSpinorField &y, ColorSpinorField &z)
double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &u)
void axpyBzpcx(double a, ColorSpinorField &x, ColorSpinorField &y, double b, ColorSpinorField &z, double c)
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
double cabxpyzAxNorm(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &r)
void caxpyXmaz(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
void ax(double a, ColorSpinorField &x)
void caxpyXmazMR(const double &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
double norm2(const ColorSpinorField &a)
void tripleCGUpdate(double alpha, double beta, ColorSpinorField &q, ColorSpinorField &r, ColorSpinorField &x, ColorSpinorField &p)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
void axpy(double a, ColorSpinorField &x, ColorSpinorField &y)
double3 cDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
void cxpaypbz(ColorSpinorField &, const Complex &b, ColorSpinorField &y, const Complex &c, ColorSpinorField &z)
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void start()
Start profiling.
double norm2(const CloverField &a, bool inverse=false)
void ax(const double &a, GaugeField &u)
Scale the gauge field by the scalar a.
std::complex< double > Complex
__host__ __device__ ValueType sqrt(ValueType x)
__host__ __device__ ValueType exp(ValueType x)
__host__ __device__ ValueType abs(ValueType x)
internal::ParamGenerator< T > Range(T start, T end, IncrementT step)
internal::CartesianProductHolder< Generator... > Combine(const Generator &... g)
internal::ValueArray< T... > Values(T... v)
internal::ParamGenerator< bool > Bool()
GTEST_API_ void InitGoogleTest(int *argc, char **argv)
void initQuda(int device)
void setVerbosity(QudaVerbosity verbosity)