// Default constructor: zeroes the accumulated event and CPU timers. cpu_min
// starts at DBL_MAX and cpu_max at 0.0 so that the first measured sample
// replaces both when they are updated by comparison against the elapsed time.
47 DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
74 void *hostGauge[4] = {
nullptr};
75 void *hostClover =
nullptr;
76 void *hostCloverInv =
nullptr;
90 const bool transfer =
false;
126 void init(
int argc,
char **argv)
129 test_split_grid = num_src > 1;
133 errorQuda(
"Asqtad not supported. Please try staggered_dslash_test instead");
159 default:
errorQuda(
"Test type %d not defined QUDA_DOMAIN_WALL_4D_DSLASH\n",
static_cast<int>(
dtest_type));
172 default:
errorQuda(
"Test type %d not defined on QUDA_MOBIUS_DWF_(EOFA_)DSLASH\n",
static_cast<int>(
dtest_type));
247 if (test_split_grid) {
250 for (
int n = 0; n < num_src; n++) {
257 if (test_split_grid) {
258 for (
int n = 0; n < num_src; n++) { *vp_spinor[n] = *
spinor; }
276 printfQuda(
"Computing clover field on GPU\n");
320 printfQuda(
"Source: CPU = %e, CUDA = %e\n", cpu_norm, cuda_norm);
327 diracParam.
tmp1 = tmp1;
328 diracParam.
tmp2 = tmp2;
349 if (
dirac !=
nullptr) {
365 for (
auto p : vp_spinor) {
delete p; }
366 for (
auto p : vp_spinorOut) {
delete p; }
367 for (
auto p : vp_spinorRef) {
delete p; }
370 vp_spinorOut.clear();
371 vp_spinorRef.clear();
373 for (
int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
385 printfQuda(
"Calculating reference implementation...");
409 default:
printfQuda(
"Test type not defined\n"); exit(-1);
437 default:
printfQuda(
"Test type not defined\n"); exit(-1);
476 default:
printfQuda(
"Test type not defined\n"); exit(-1);
488 void *ref2 = (
char *)ref1 + tm_offset *
cpu_prec;
491 void *flv2 = (
char *)flv1 + tm_offset *
cpu_prec;
505 void *ref2 = (
char *)ref1 + tm_offset *
cpu_prec;
508 void *flv2 = (
char *)flv1 + tm_offset *
cpu_prec;
522 void *oddOut = (
char *)evenOut + tm_offset *
cpu_prec;
525 void *oddIn = (
char *)evenIn + tm_offset *
cpu_prec;
541 void *ref2 = (
char *)ref1 + tm_offset *
cpu_prec;
544 void *flv2 = (
char *)flv1 + tm_offset *
cpu_prec;
546 void *tmp1 = spinorTmp->
V();
547 void *tmp2 = (
char *)tmp1 + tm_offset *
cpu_prec;
565 void *oddOut = (
char *)evenOut + tm_offset *
cpu_prec;
568 void *oddIn = (
char *)evenIn + tm_offset *
cpu_prec;
570 void *evenTmp = spinorTmp->
V();
571 void *oddTmp = (
char *)evenTmp + tm_offset *
cpu_prec;
579 default:
printfQuda(
"Test type not defined\n"); exit(-1);
622 default:
printfQuda(
"Test type not defined\n"); exit(-1);
647 default: printf(
"Test type not supported for domain wall\n"); exit(-1);
650 double *kappa_5 = (
double *)malloc(
Ls *
sizeof(
double));
651 for (
int xs = 0; xs <
Ls; xs++) kappa_5[xs] =
kappa5;
685 default: printf(
"Test type not supported for domain wall\n"); exit(-1);
689 double _Complex *kappa_b = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
690 double _Complex *kappa_c = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
691 double _Complex *kappa_5 = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
692 double _Complex *kappa_mdwf = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
693 for (
int xs = 0; xs <
Lsdim; xs++) {
696 kappa_5[xs] = 0.5 * kappa_b[xs] / kappa_c[xs];
697 kappa_mdwf[xs] = -kappa_5[xs];
741 default: printf(
"Test type not supported for Mobius domain wall\n"); exit(-1);
748 double _Complex *kappa_b = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
749 double _Complex *kappa_c = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
750 double _Complex *kappa_5 = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
751 double _Complex *kappa_mdwf = (
double _Complex *)malloc(
Lsdim *
sizeof(
double _Complex));
752 for (
int xs = 0; xs <
Lsdim; xs++) {
755 kappa_5[xs] = 0.5 * kappa_b[xs] / kappa_c[xs];
756 kappa_mdwf[xs] = -kappa_5[xs];
806 default: printf(
"Test type not supported for Mobius domain wall EOFA\n"); exit(-1);
825 timeval tstart, tstop;
828 cudaEventCreate(&
start);
829 cudaEventCreate(&
end);
832 cudaEventRecord(
start, 0);
834 if (test_split_grid) {
839 _hp_x[i] = vp_spinorOut[i]->V();
840 _hp_b[i] = vp_spinor[i]->V();
853 for (
int i = 0; i <
niter; i++) {
855 gettimeofday(&tstart, NULL);
947 errorQuda(
"(transfer == true) version NOT yet available!\n");
959 errorQuda(
"(transfer == true) version NOT yet available!\n");
966 errorQuda(
"(transfer == true) version NOT yet available!\n");
973 errorQuda(
"(transfer == true) version NOT yet available!\n");
980 errorQuda(
"(transfer == true) version NOT yet available!\n");
988 errorQuda(
"(transfer == true) version NOT yet available!\n");
997 errorQuda(
"(transfer == true) version NOT yet available!\n");
1005 errorQuda(
"(transfer == true) version NOT yet available!\n");
1050 gettimeofday(&tstop, NULL);
1051 long ds = tstop.tv_sec - tstart.tv_sec;
1052 long dus = tstop.tv_usec - tstart.tv_usec;
1053 double elapsed = ds + 0.000001 * dus;
1057 if (i > 0 && i <
niter) {
1058 if (elapsed < dslash_time.
cpu_min) dslash_time.
cpu_min = elapsed;
1059 if (elapsed > dslash_time.
cpu_max) dslash_time.
cpu_max = elapsed;
1064 cudaEventRecord(
end, 0);
1065 cudaEventSynchronize(
end);
1067 cudaEventElapsedTime(&runTime,
start,
end);
1068 cudaEventDestroy(
start);
1069 cudaEventDestroy(
end);
1087 if (!test_split_grid) {
1093 unsigned long long flops = 0;
1101 printfQuda(
"Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for "
1102 "aggregate message size %lu bytes\n",
1104 1.0e-9 * 2 * ghost_bytes *
niter / dslash_time.
cpu_time, 1.0e-9 * 2 * ghost_bytes / dslash_time.
cpu_max,
1105 1.0e-9 * 2 * ghost_bytes / dslash_time.
cpu_min, 2 * ghost_bytes);
1121 if (test_split_grid) {
1122 for (
int n = 0; n < num_src; n++) {
1124 double norm2_cpu_cuda =
blas::norm2(*vp_spinorOut[n]);
1125 printfQuda(
"Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
1133 printfQuda(
"Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
1135 printfQuda(
"Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);
size_t GhostBytes() const
QudaGammaBasis gammaBasis
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
QudaFieldOrder fieldOrder
QudaTwistFlavorType twistFlavor
virtual void MdagMLocal(ColorSpinorField &out, const ColorSpinorField &in) const
Apply the local MdagM operator: equivalent to applying zero Dirichlet boundary condition to MdagM on ...
unsigned long long Flops() const
returns and then zeroes flopcount
static Dirac * create(const DiracParam &param)
Creates a subclass from parameters.
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const
Apply M for the dirac op. E.g. the Schur Complement operator.
virtual void MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
Apply MdagM operator which may be optimized.
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
apply 'dslash' operator for the DiracOp. This may be e.g. AD
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
static void RecordProperty(const std::string &key, const std::string &value)
void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity, int dagger, QudaPrecision precision, QudaGaugeParam &param)
void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, int parity, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &param)
void cloverHasenbuschTwist_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
void cloverHasenbuchTwist_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, QudaMatPCType matpc_type)
void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
QudaReconstructType link_recon
std::array< int, 4 > grid_partition
QudaDslashType dslash_type
std::string get_string(CLI::TransformPairs< T > &map, T val)
double dslashCUDA(int niter, int mu)
cudaColorSpinorField * cudaSpinor
cudaColorSpinorField * cudaSpinorOut
cpuColorSpinorField * spinor
cpuColorSpinorField * spinorOut
cpuColorSpinorField * spinorRef
QudaGaugeParam gauge_param
QudaInvertParam inv_param
void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
void mdw_dslash_4_pre(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5, bool zero_initialize)
void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void dw_dslash_5_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, bool zero_initialize)
void mdw_dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *kappa)
void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift, QudaPrecision precision)
void dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift)
void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift, QudaPrecision precision)
void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *kappa, bool zero_initialize)
void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift)
void dslashQuda_mdwf(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type)
void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type)
CLI::TransformPairs< dslash_test_type > dtest_type_map
@ QUDA_TWISTED_CLOVER_DSLASH
@ QUDA_CLOVER_WILSON_DSLASH
@ QUDA_TWISTED_MASS_DSLASH
@ QUDA_DOMAIN_WALL_DSLASH
@ QUDA_MOBIUS_DWF_EOFA_DSLASH
@ QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH
@ QUDA_DOMAIN_WALL_4D_DSLASH
@ QUDA_PARITY_SITE_SUBSET
enum QudaDagType_s QudaDagType
@ QUDA_EVEN_ODD_SITE_ORDER
enum QudaReconstructType_s QudaReconstructType
@ QUDA_MATDAG_MAT_SOLUTION
@ QUDA_MATPCDAG_MATPC_SOLUTION
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
enum QudaParity_s QudaParity
void constructHostCloverField(void *clover, void *clover_inv, QudaInvertParam &inv_param)
QudaPrecision & cuda_prec
void dw_setDims(int *X, const int L5)
void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc, char **argv)
void setWilsonGaugeParam(QudaGaugeParam &gauge_param)
QudaPrecision getPrecision(int i)
void setInvertParam(QudaInvertParam &invertParam, QudaInvertArgs_t &inv_args, int external_precision, int quda_precision, double kappa, double reliable_delta)
void init()
Create the BLAS context.
double norm2(const ColorSpinorField &a)
void start()
Start profiling.
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
Main header file for the QUDA library.
QudaGaugeParam newQudaGaugeParam(void)
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv)
Really the same as @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers to dire...
void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, QudaGaugeParam *gauge_param)
Perform the solve like @dslashQuda but for multiple rhs by splitting the comm grid into sub-partitions...
void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
QudaInvertParam newQudaInvertParam(void)
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
dslash_test_type dtest_type
void init_test(int argc, char **argv)
void init(int argc, char **argv)
void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon)
std::vector< quda::cpuColorSpinorField * > vp_spinorOut
void run_test(int niter, bool print_metrics=false)
DslashTime dslashCUDA(int niter)
std::vector< quda::cpuColorSpinorField * > vp_spinor
QudaGaugeParam gauge_param
QudaInvertParam inv_param
std::vector< quda::cpuColorSpinorField * > vp_spinorRef
QudaReconstructType reconstruct
QudaPrecision cuda_prec_precondition
QudaPrecision cuda_prec_refinement_sloppy
QudaPrecision cuda_prec_sloppy
QudaReconstructType reconstruct_sloppy
QudaSolutionType solution_type
int split_grid[QUDA_MAX_DIM]
int compute_clover_inverse
QudaTwistFlavorType twist_flavor
QudaPrecision clover_cpu_prec
int return_clover_inverse
double_complex b_5[QUDA_MAX_DWF_LS]
int num_src_per_sub_partition
double_complex c_5[QUDA_MAX_DWF_LS]
QudaGammaBasis gamma_basis
QudaSiteSubset siteSubset
void setVerbosity(QudaVerbosity verbosity)
void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_dslash(void *res, void **gaugeFull, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor, int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_ndeg_mat(void *evenOut, void *oddOut, void **gauge, void *evenIn, void *oddIn, double kappa, double mu, double epsilon, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_ndeg_matpc(void *outEven1, void *outEven2, void **gauge, void *inEven1, void *inEven2, double kappa, double mu, double epsilon, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
void tm_ndeg_dslash(void *res1, void *res2, void **gauge, void *spinorField1, void *spinorField2, double kappa, double mu, double epsilon, int oddBit, int daggerBit, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param)