22 #include <gtest/gtest.h> 24 #define MAX(a,b) ((a)>(b)?(a):(b)) 94 void init(
int argc,
char **argv) {
101 gauge_param.
X[0] =
xdim;
102 gauge_param.
X[1] =
ydim;
103 gauge_param.
X[2] =
zdim;
104 gauge_param.
X[3] =
tdim;
107 errorQuda(
"Asqtad not supported. Please try staggered_dslash_test instead");
130 inv_param.
kappa = 0.1;
142 for(
int k = 0; k <
Lsdim; k++)
146 inv_param.
b_5[k] = 1.50;
147 inv_param.
c_5[k] = 0.50;
162 errorQuda(
"Gauge and spinor CPU precisions must match");
169 #ifndef MULTI_GPU // free parameter for single GPU 171 #else // must be this one c/b face for multi gpu 172 int x_face_size = gauge_param.
X[1]*gauge_param.
X[2]*gauge_param.
X[3]/2;
173 int y_face_size = gauge_param.
X[0]*gauge_param.
X[2]*gauge_param.
X[3]/2;
174 int z_face_size = gauge_param.
X[0]*gauge_param.
X[1]*gauge_param.
X[3]/2;
175 int t_face_size = gauge_param.
X[0]*gauge_param.
X[1]*gauge_param.
X[2]/2;
176 int pad_size =
MAX(x_face_size, y_face_size);
177 pad_size =
MAX(pad_size, z_face_size);
178 pad_size =
MAX(pad_size, t_face_size);
179 gauge_param.
ga_pad = pad_size;
260 for (
int d=0; d<4; d++) csParam.
x[d] = gauge_param.
X[d];
277 csParam.
x[4] = inv_param.
Ls;
300 csParam.
x[0] = gauge_param.
X[0];
339 else printfQuda(
"Sending clover field to GPU\n");
386 printfQuda(
"Source: CPU = %e, CUDA = %e\n", cpu_norm, cuda_norm);
428 for (
int dir = 0; dir < 4; dir++) free(
hostGauge[dir]);
443 DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
450 timeval tstart, tstop;
452 cudaEvent_t start,
end;
453 cudaEventCreate(&start);
454 cudaEventCreate(&end);
457 cudaEventRecord(start, 0);
459 for (
int i = 0; i <
niter; i++) {
461 gettimeofday(&tstart, NULL);
491 dirac->
M(*cudaSpinorOut, *cudaSpinor);
499 dirac->
MdagM(*cudaSpinorOut, *cudaSpinor);
538 dirac->
M(*cudaSpinorOut, *cudaSpinor);
546 dirac->
MdagM(*cudaSpinorOut, *cudaSpinor);
572 dirac->
M(*cudaSpinorOut, *cudaSpinor);
580 dirac->
MdagM(*cudaSpinorOut, *cudaSpinor);
586 gettimeofday(&tstop, NULL);
587 long ds = tstop.tv_sec - tstart.tv_sec;
588 long dus = tstop.tv_usec - tstart.tv_usec;
589 double elapsed = ds + 0.000001*dus;
593 if (i>0 && i<niter) {
599 cudaEventRecord(end, 0);
600 cudaEventSynchronize(end);
602 cudaEventElapsedTime(&runTime, start, end);
603 cudaEventDestroy(start);
604 cudaEventDestroy(end);
609 cudaError_t stat = cudaGetLastError();
610 if (stat != cudaSuccess)
611 printfQuda(
"with ERROR: %s\n", cudaGetErrorString(stat));
619 printfQuda(
"Calculating reference implementation...");
683 int tm_offset = 12*spinorRef->
Volume();
685 void *ref1 = spinorRef->
V();
686 void *ref2 = (
char*)ref1 + tm_offset*
cpu_prec;
688 void *flv1 = spinor->
V();
689 void *flv2 = (
char*)flv1 + tm_offset*
cpu_prec;
700 int tm_offset = 12*spinorRef->
Volume();
702 void *ref1 = spinorRef->
V();
703 void *ref2 = (
char*)ref1 + tm_offset*
cpu_prec;
705 void *flv1 = spinor->
V();
706 void *flv2 = (
char*)flv1 + tm_offset*
cpu_prec;
708 tm_ndeg_matpc(ref1, ref2,
hostGauge, flv1, flv2, inv_param.
kappa, inv_param.
mu, inv_param.
epsilon, inv_param.
matpc_type,
dagger, inv_param.
cpu_prec, gauge_param);
716 int tm_offset = 12*spinorRef->
Volume();
718 void *evenOut = spinorRef->
V();
719 void *oddOut = (
char*)evenOut + tm_offset*
cpu_prec;
721 void *evenIn = spinor->
V();
722 void *oddIn = (
char*)evenIn + tm_offset*
cpu_prec;
724 tm_ndeg_mat(evenOut, oddOut,
hostGauge, evenIn, oddIn, inv_param.
kappa, inv_param.
mu, inv_param.
epsilon,
dagger, inv_param.
cpu_prec, gauge_param);
736 int tm_offset = 12*spinorRef->
Volume();
738 void *ref1 = spinorRef->
V();
739 void *ref2 = (
char*)ref1 + tm_offset*
cpu_prec;
741 void *flv1 = spinor->
V();
742 void *flv2 = (
char*)flv1 + tm_offset*
cpu_prec;
744 void *tmp1 = spinorTmp->
V();
747 tm_ndeg_matpc(tmp1, tmp2,
hostGauge, flv1, flv2, inv_param.
kappa, inv_param.
mu, inv_param.
epsilon, inv_param.
matpc_type,
dagger, inv_param.
cpu_prec, gauge_param);
748 tm_ndeg_matpc(ref1, ref2,
hostGauge, tmp1, tmp2, inv_param.
kappa, inv_param.
mu, inv_param.
epsilon, inv_param.
matpc_type,
not_dagger, inv_param.
cpu_prec, gauge_param);
760 int tm_offset = 12*spinorRef->
Volume();
762 void *evenOut = spinorRef->
V();
763 void *oddOut = (
char*)evenOut + tm_offset*
cpu_prec;
765 void *evenIn = spinor->
V();
766 void *oddIn = (
char*)evenIn + tm_offset*
cpu_prec;
768 void *evenTmp = spinorTmp->
V();
769 void *oddTmp = (
char*)evenTmp + tm_offset*
cpu_prec;
771 tm_ndeg_mat(evenTmp, oddTmp,
hostGauge, evenIn, oddIn, inv_param.
kappa, inv_param.
mu, inv_param.
epsilon,
dagger, inv_param.
cpu_prec, gauge_param);
772 tm_ndeg_mat(evenOut, oddOut,
hostGauge, evenTmp, oddTmp, inv_param.
kappa, inv_param.
mu, inv_param.
epsilon,
not_dagger, inv_param.
cpu_prec, gauge_param);
783 tmc_dslash(spinorRef->
V(),
hostGauge, spinor->
V(),
hostClover,
hostCloverInv, inv_param.
kappa, inv_param.
mu, inv_param.
twist_flavor,
parity, inv_param.
matpc_type,
dagger, inv_param.
cpu_prec,
gauge_param);
789 tmc_matpc(spinorRef->
V(),
hostGauge, spinor->
V(),
hostClover,
hostCloverInv, inv_param.
kappa, inv_param.
mu, inv_param.
twist_flavor, inv_param.
matpc_type,
dagger, inv_param.
cpu_prec,
gauge_param);
838 printf(
"Test type not supported for domain wall\n");
842 double *kappa_5 = (
double*)malloc(
Ls*
sizeof(
double));
843 for(
int xs = 0; xs <
Ls ; xs++)
873 printf(
"Test type not supported for domain wall\n");
878 double _Complex *kappa_b = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
879 double _Complex *kappa_c = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
880 double _Complex *kappa_5 = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
881 double _Complex *kappa_mdwf = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
882 for(
int xs = 0 ; xs <
Lsdim ; xs++)
884 kappa_b[xs] = 1.0/(2*(inv_param.
b_5[xs]*(4.0 + inv_param.
m5) + 1.0));
885 kappa_c[xs] = 1.0/(2*(inv_param.
c_5[xs]*(4.0 + inv_param.
m5) - 1.0));
886 kappa_5[xs] = 0.5*kappa_b[xs]/kappa_c[xs];
887 kappa_mdwf[xs] = -kappa_5[xs];
897 mdw_dslash_4_pre(spinorRef->
V(),
hostGauge, spinor->
V(),
parity,
dagger, gauge_param.
cpu_prec,
gauge_param, inv_param.
mass, inv_param.
b_5, inv_param.
c_5,
true);
901 inv_param.
mass, kappa_mdwf);
904 mdw_matpc(spinorRef->
V(),
hostGauge, spinor->
V(), kappa_b, kappa_c, inv_param.
matpc_type,
dagger, gauge_param.
cpu_prec,
gauge_param, inv_param.
mass, inv_param.
b_5, inv_param.
c_5);
908 inv_param.
mass, inv_param.
b_5, inv_param.
c_5);
911 mdw_matpc(spinorTmp->
V(),
hostGauge, spinor->
V(), kappa_b, kappa_c, inv_param.
matpc_type,
dagger, gauge_param.
cpu_prec,
gauge_param, inv_param.
mass, inv_param.
b_5, inv_param.
c_5);
912 mdw_matpc(spinorRef->
V(),
hostGauge, spinorTmp->
V(), kappa_b, kappa_c, inv_param.
matpc_type,
not_dagger, gauge_param.
cpu_prec,
gauge_param, inv_param.
mass, inv_param.
b_5, inv_param.
c_5);
916 inv_param.
mass, inv_param.
b_5, inv_param.
c_5);
921 printf(
"Test type not supported for domain wall\n");
941 printfQuda(
"prec recon test_type matpc_type dagger S_dim T_dimension Ls_dimension dslash_type niter\n");
942 printfQuda(
"%6s %2s %d %12s %d %3d/%3d/%3d %3d %2d %14s %d\n",
957 extern void usage(
char**);
964 ASSERT_LE(deviation, tol) <<
"CPU and CUDA implementations do not agree";
967 int main(
int argc,
char **argv)
970 ::testing::InitGoogleTest(&argc, argv);
974 for (
int i =1;i < argc; i++) {
979 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
991 for (
int i=0; i<attempts; i++) {
1007 unsigned long long flops = 0;
1010 "%llu flops per kernel call, %llu flops per site\n", flops /
niter, (flops /
niter) / cudaSpinor->
Volume());
1013 printfQuda(
"Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate message size %lu bytes\n",
1022 printfQuda(
"Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
1024 printfQuda(
"Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
1028 ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
1029 if (
comm_rank() != 0) {
delete listeners.Release(listeners.default_result_printer()); }
1031 test_rc = RUN_ALL_TESTS();
QudaGaugeParam gauge_param
cudaColorSpinorField * cudaSpinorOut
int dimPartitioned(int dim)
QudaDiracFieldOrder dirac_order
double getTolerance(QudaPrecision prec)
void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_dslash_5_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, bool zero_initialize)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
cpuColorSpinorField * spinorRef
enum QudaPrecision_s QudaPrecision
double_complex c_5[QUDA_MAX_DWF_LS]
void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
void dw_setDims(int *X, const int L5)
cpuColorSpinorField * spinorTmp
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_dslash(void *res, void **gaugeFull, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
double norm2(const ColorSpinorField &a)
QudaDslashType dslash_type
int return_clover_inverse
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam ¶m)
int process_command_line_option(int argc, char **argv, int *idx)
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
cudaColorSpinorField * tmp1
void tm_ndeg_mat(void *evenOut, void *oddOut, void **gauge, void *evenIn, void *oddIn, double kappa, double mu, double epsilon, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
const char * get_matpc_str(QudaMatPCType type)
void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaGaugeFieldOrder gauge_order
void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
cpuColorSpinorField * spinor
void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
const char * get_prec_str(QudaPrecision prec)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
unsigned long long Flops() const
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const =0
QudaSiteSubset siteSubset
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
QudaFieldLocation input_location
int gridsize_from_cmdline[]
double_complex b_5[QUDA_MAX_DWF_LS]
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
int main(int argc, char **argv)
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
QudaPrecision clover_cuda_prec
void mdw_dslash_4_pre(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5, bool zero_initialize)
void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
DslashTime dslashCUDA(int niter)
void initQuda(int device)
void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
cudaColorSpinorField * tmp2
QudaFieldLocation output_location
QudaFieldOrder fieldOrder
void mdw_dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *kappa)
void setSpinorSiteSize(int n)
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
QudaCloverFieldOrder clover_order
void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
enum QudaMatPCType_s QudaMatPCType
QudaGammaBasis gammaBasis
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
QudaGammaBasis gamma_basis
QudaDslashType dslash_type
const char * get_dslash_str(QudaDslashType type)
void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
QudaTwistFlavorType twist_flavor
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity, int dagger, QudaPrecision precision, QudaGaugeParam ¶m)
int compute_clover_inverse
void tm_ndeg_dslash(void *res1, void *res2, void **gauge, void *spinorField1, void *spinorField2, double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param)
void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
QudaInvertParam inv_param
void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *kappa, bool zero_initialize)
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
cpuColorSpinorField * spinorOut
QudaPrecision Precision() const
void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
QudaTwistFlavorType twist_flavor
void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
QudaTwistFlavorType twistFlavor
enum QudaDslashType_s QudaDslashType
QudaReconstructType link_recon
void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void init(int argc, char **argv)
enum QudaVerbosity_s QudaVerbosity
void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
static Dirac * create(const DiracParam ¶m)
void initComms(int argc, char **argv, int *const commDims)
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
size_t GhostBytes() const
QudaPrecision clover_cpu_prec
cudaColorSpinorField * cudaSpinor
void setVerbosity(QudaVerbosity verbosity)
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int test_type)
void dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
QudaGaugeParam newQudaGaugeParam(void)
enum QudaTwistFlavorType_s QudaTwistFlavorType