22 #include <gtest/gtest.h> 24 #define MAX(a,b) ((a)>(b)?(a):(b)) 78 extern void usage(
char**);
82 const char *
prec_str[] = {
"quarter",
"half",
"single",
"double"};
108 gauge_param.
X[0] =
xdim;
109 gauge_param.
X[1] =
ydim;
110 gauge_param.
X[2] =
zdim;
111 gauge_param.
X[3] =
tdim;
114 errorQuda(
"Asqtad not supported. Please try staggered_dslash_test instead");
138 inv_param.
kappa = 0.1;
149 for(
int k = 0; k <
Lsdim; k++)
153 inv_param.
b_5[k] = 1.50;
154 inv_param.
c_5[k] = 0.50;
169 errorQuda(
"Gauge and spinor CPU precisions must match");
176 #ifndef MULTI_GPU // free parameter for single GPU 178 #else // must be this one c/b face for multi gpu 179 int x_face_size = gauge_param.
X[1]*gauge_param.
X[2]*gauge_param.
X[3]/2;
180 int y_face_size = gauge_param.
X[0]*gauge_param.
X[2]*gauge_param.
X[3]/2;
181 int z_face_size = gauge_param.
X[0]*gauge_param.
X[1]*gauge_param.
X[3]/2;
182 int t_face_size = gauge_param.
X[0]*gauge_param.
X[1]*gauge_param.
X[2]/2;
183 int pad_size =
MAX(x_face_size, y_face_size);
184 pad_size =
MAX(pad_size, z_face_size);
185 pad_size =
MAX(pad_size, t_face_size);
186 gauge_param.
ga_pad = pad_size;
270 for (
int d=0; d<4; d++) csParam.
x[d] = gauge_param.
X[d];
287 csParam.
x[4] = inv_param.
Ls;
316 csParam.
x[0] = gauge_param.
X[0];
349 else printfQuda(
"Sending clover field to GPU\n");
412 dirac = (
Dirac*)dirac_4dpc;
416 dirac = (
Dirac*)dirac_mdwf;
450 for (
int dir = 0; dir < 4; dir++) free(
hostGauge[dir]);
465 DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
472 timeval tstart, tstop;
474 cudaEvent_t start,
end;
475 cudaEventCreate(&start);
476 cudaEventCreate(&end);
479 cudaEventRecord(start, 0);
481 for (
int i = 0; i <
niter; i++) {
483 gettimeofday(&tstart, NULL);
512 dirac_4dpc->
M(*cudaSpinorOut, *cudaSpinor);
519 dirac_4dpc->
MdagM(*cudaSpinorOut, *cudaSpinor);
558 dirac_mdwf->
M(*cudaSpinorOut, *cudaSpinor);
565 dirac_mdwf->
MdagM(*cudaSpinorOut, *cudaSpinor);
590 dirac->
M(*cudaSpinorOut, *cudaSpinor);
597 dirac->
M(*cudaSpinorOut, *cudaSpinor);
604 dirac->
MdagM(*cudaSpinorOut, *cudaSpinor);
611 dirac->
MdagM(*cudaSpinorOut, *cudaSpinor);
617 gettimeofday(&tstop, NULL);
618 long ds = tstop.tv_sec - tstart.tv_sec;
619 long dus = tstop.tv_usec - tstart.tv_usec;
620 double elapsed = ds + 0.000001*dus;
624 if (i>0 && i<niter) {
630 cudaEventRecord(end, 0);
631 cudaEventSynchronize(end);
633 cudaEventElapsedTime(&runTime, start, end);
634 cudaEventDestroy(start);
635 cudaEventDestroy(end);
640 cudaError_t stat = cudaGetLastError();
641 if (stat != cudaSuccess)
642 printfQuda(
"with ERROR: %s\n", cudaGetErrorString(stat));
710 tm_dslash(spinorRef->
V(),
hostGauge, spinor->
V(), inv_param.
kappa, inv_param.
mu, inv_param.
twist_flavor,
parity, inv_param.
matpc_type,
dagger, inv_param.
cpu_prec,
gauge_param);
713 int tm_offset = 12*spinorRef->
Volume();
715 void *ref1 = spinorRef->
V();
716 void *ref2 = (
char*)ref1 + tm_offset*
cpu_prec;
718 void *flv1 = spinor->
V();
719 void *flv2 = (
char*)flv1 + tm_offset*
cpu_prec;
730 int tm_offset = 12 * spinorRef->
Volume();
732 void *ref1 = spinorRef->
V();
733 void *ref2 = (
char *)ref1 + tm_offset *
cpu_prec;
735 void *flv1 = spinor->
V();
736 void *flv2 = (
char *)flv1 + tm_offset *
cpu_prec;
747 int tm_offset = 12 * spinorRef->
Volume();
749 void *evenOut = spinorRef->
V();
750 void *oddOut = (
char *)evenOut + tm_offset *
cpu_prec;
752 void *evenIn = spinor->
V();
753 void *oddIn = (
char *)evenIn + tm_offset *
cpu_prec;
766 int tm_offset = 12 * spinorRef->
Volume();
768 void *ref1 = spinorRef->
V();
769 void *ref2 = (
char *)ref1 + tm_offset *
cpu_prec;
771 void *flv1 = spinor->
V();
772 void *flv2 = (
char *)flv1 + tm_offset *
cpu_prec;
774 void *tmp1 = spinorTmp->
V();
790 int tm_offset = 12 * spinorRef->
Volume();
792 void *evenOut = spinorRef->
V();
793 void *oddOut = (
char *)evenOut + tm_offset *
cpu_prec;
795 void *evenIn = spinor->
V();
796 void *oddIn = (
char *)evenIn + tm_offset *
cpu_prec;
798 void *evenTmp = spinorTmp->
V();
799 void *oddTmp = (
char *)evenTmp + tm_offset *
cpu_prec;
807 default:
printfQuda(
"Test type not defined\n"); exit(-1);
813 tmc_dslash(spinorRef->
V(),
hostGauge, spinor->
V(),
hostClover,
hostCloverInv, inv_param.
kappa, inv_param.
mu, inv_param.
twist_flavor,
parity, inv_param.
matpc_type,
dagger, inv_param.
cpu_prec,
gauge_param);
873 printf(
"Test type not supported for domain wall\n");
877 double *kappa_5 = (
double*)malloc(
Ls*
sizeof(
double));
878 for(
int xs = 0; xs <
Ls ; xs++)
886 inv_param.
mass,
true);
890 inv_param.
mass, kappa_5);
903 printf(
"Test type not supported for domain wall\n");
908 double _Complex *kappa_b = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
909 double _Complex *kappa_c = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
910 double _Complex *kappa_5 = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
911 double _Complex *kappa_mdwf = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
912 for(
int xs = 0 ; xs <
Lsdim ; xs++)
914 kappa_b[xs] = 1.0/(2*(inv_param.
b_5[xs]*(4.0 + inv_param.
m5) + 1.0));
915 kappa_c[xs] = 1.0/(2*(inv_param.
c_5[xs]*(4.0 + inv_param.
m5) - 1.0));
916 kappa_5[xs] = 0.5*kappa_b[xs]/kappa_c[xs];
917 kappa_mdwf[xs] = -kappa_5[xs];
927 mdw_dslash_4_pre(spinorRef->
V(),
hostGauge, spinor->
V(),
parity,
dagger, gauge_param.
cpu_prec,
gauge_param, inv_param.
mass, inv_param.
b_5, inv_param.
c_5,
true);
931 inv_param.
mass, kappa_mdwf);
945 printf(
"Test type not supported for domain wall\n");
966 printfQuda(
"prec recon test_type matpc_type dagger S_dim T_dimension Ls_dimension dslash_type niter\n");
967 printfQuda(
"%6s %2s %d %12s %d %3d/%3d/%3d %3d %2d %14s %d\n",
983 using ::testing::TestWithParam;
984 using ::testing::Bool;
985 using ::testing::Values;
986 using ::testing::Range;
987 using ::testing::Combine;
989 class DslashTest :
public ::testing::TestWithParam<::testing::tuple<int, int, int>> {
991 ::testing::tuple<int, int, int>
param;
996 if ((QUDA_PRECISION &
getPrecision(::testing::get<0>(GetParam()))) == 0
1006 int prec = ::testing::get<0>(GetParam());
1009 if (skip()) GTEST_SKIP();
1011 int value = ::testing::get<2>(GetParam());
1012 for(
int j=0; j < 4;j++){
1013 if (value & (1 << j)){
1026 if (skip()) GTEST_SKIP();
1056 printfQuda(
"Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
1058 printfQuda(
"Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
1065 ASSERT_LE(deviation, tol) <<
"CPU and CUDA implementations do not agree";
1074 printfQuda(
"%fus per kernel call\n", 1e6 * dslash_time.event_time /
niter);
1076 unsigned long long flops = 0;
1078 double gflops = 1.0e-9 * flops / dslash_time.event_time;
1080 RecordProperty(
"Gflops", std::to_string(gflops));
1081 RecordProperty(
"Halo_bidirectitonal_BW_GPU", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() *
niter / dslash_time.event_time);
1082 RecordProperty(
"Halo_bidirectitonal_BW_CPU", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() *
niter / dslash_time.cpu_time);
1083 RecordProperty(
"Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() / dslash_time.cpu_max);
1084 RecordProperty(
"Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() / dslash_time.cpu_min);
1085 RecordProperty(
"Halo_message_size_bytes", 2 * cudaSpinor->
GhostBytes());
1087 printfQuda(
"Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate " 1088 "message size %lu bytes\n",
1089 1.0e-9 * 2 * cudaSpinor->
GhostBytes() *
niter / dslash_time.event_time,
1091 1.0e-9 * 2 * cudaSpinor->
GhostBytes() / dslash_time.cpu_max,
1098 ::testing::InitGoogleTest(&argc, argv);
1101 for (
int i = 1; i < argc; i++) {
1104 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
1110 ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
1111 if (
comm_rank() != 0) {
delete listeners.Release(listeners.default_result_printer()); }
1112 test_rc = RUN_ALL_TESTS();
1120 const int prec = ::testing::get<0>(
param.param);
1121 const int recon = ::testing::get<1>(
param.param);
1122 const int part = ::testing::get<2>(
param.param);
1123 std::stringstream ss;
1127 ss <<
"_r" << recon;
1128 ss <<
"_partition" << part;
1139 ::testing::Values(0)),
cudaColorSpinorField * tmp2
QudaDiracFieldOrder dirac_order
QudaReconstructType reconstruct_sloppy
void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
int main(int argc, char **argv)
cudaColorSpinorField * tmp1
void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_dslash_5_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, bool zero_initialize)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
int getReconstructNibble(QudaReconstructType recon)
int gridsize_from_cmdline[]
enum QudaPrecision_s QudaPrecision
DiracMobiusPC * dirac_mdwf
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
double_complex c_5[QUDA_MAX_DWF_LS]
void dw_setDims(int *X, const int L5)
cpuColorSpinorField * spinorRef
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_dslash(void *res, void **gaugeFull, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
double norm2(const ColorSpinorField &a)
QudaDslashType dslash_type
int return_clover_inverse
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam ¶m)
void commDimPartitionedSet(int dir)
int process_command_line_option(int argc, char **argv, int *idx)
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
void tm_ndeg_mat(void *evenOut, void *oddOut, void **gauge, void *evenIn, void *oddIn, double kappa, double mu, double epsilon, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
const char * get_matpc_str(QudaMatPCType type)
void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaGaugeParam gauge_param
QudaPrecision clover_cuda_prec_refinement_sloppy
QudaGaugeFieldOrder gauge_order
void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
const char * get_prec_str(QudaPrecision prec)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void M(ColorSpinorField &out, const ColorSpinorField &in) const
unsigned long long Flops() const
QudaTwistFlavorType twist_flavor
QudaDslashType dslash_type
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const =0
QudaSiteSubset siteSubset
QudaPrecision clover_cuda_prec_sloppy
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
QudaFieldLocation input_location
double_complex b_5[QUDA_MAX_DWF_LS]
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
QudaPrecision clover_cuda_prec
void mdw_dslash_4_pre(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5, bool zero_initialize)
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void initQuda(int device)
void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaPrecision getPrecision(int i)
QudaFieldLocation output_location
QudaPrecision clover_cuda_prec_precondition
double benchmark(int kernel, const int niter)
QudaFieldOrder fieldOrder
QudaReconstructType link_recon
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity, const double &kappa5) const
void mdw_dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *kappa)
void setSpinorSiteSize(int n)
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
QudaCloverFieldOrder clover_order
enum QudaMatPCType_s QudaMatPCType
QudaGammaBasis gammaBasis
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
QudaGammaBasis gamma_basis
QudaPrecision cuda_prec_sloppy
void Dslash4pre(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
const char * get_dslash_str(QudaDslashType type)
void display_test_info(int precision, QudaReconstructType link_recon)
std::string getdslashtestname(testing::TestParamInfo<::testing::tuple< int, int, int >> param)
void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
void Dslash4(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity, int dagger, QudaPrecision precision, QudaGaugeParam ¶m)
int compute_clover_inverse
::testing::tuple< int, int, int > param
void tm_ndeg_dslash(void *res1, void *res2, void **gauge, void *spinorField1, void *spinorField2, double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param)
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
DslashTime dslashCUDA(int niter)
void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *kappa, bool zero_initialize)
enum QudaReconstructType_s QudaReconstructType
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
Main header file for the QUDA library.
QudaPrecision Precision() const
void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
INSTANTIATE_TEST_SUITE_P(QUDA, DslashTest, Combine(Range(0, 4), ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8), ::testing::Values(0)), getdslashtestname)
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
QudaTwistFlavorType twist_flavor
void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaTwistFlavorType twistFlavor
void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
enum QudaDslashType_s QudaDslashType
static void SetUpTestCase()
QudaInvertParam inv_param
void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
static void TearDownTestCase()
enum QudaVerbosity_s QudaVerbosity
void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
cpuColorSpinorField * spinorTmp
static Dirac * create(const DiracParam ¶m)
void Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
cudaColorSpinorField * cudaSpinorOut
void initComms(int argc, char **argv, int *const commDims)
void Dslash5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
cudaColorSpinorField * cudaSpinor
size_t GhostBytes() const
cpuColorSpinorField * spinor
QudaPrecision clover_cpu_prec
void M(ColorSpinorField &out, const ColorSpinorField &in) const
void init(int precision, QudaReconstructType link_recon)
void setVerbosity(QudaVerbosity verbosity)
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
double getTolerance(QudaPrecision prec)
void dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
void updateR()
update the radius for halos.
QudaGaugeParam newQudaGaugeParam(void)
cpuColorSpinorField * spinorOut
enum QudaTwistFlavorType_s QudaTwistFlavorType
DiracDomainWall4DPC * dirac_4dpc
TEST_P(DslashTest, verify)