DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}

bool is_ctest = false;

void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
// ...
void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
// ...
// The two sets of backup arrays below declare the same cache; the [16][4] and
// [1][4] variants most likely come from an #ifdef/#else pair in the original
// source (e.g. multi-partition vs. single-partition builds).
void *qdp_fatlink_cpu_backup[16][4];
void *qdp_longlink_cpu_backup[16][4];
void *qdp_inlink_backup[16][4];
// ...
void *qdp_fatlink_cpu_backup[1][4];
void *qdp_longlink_cpu_backup[1][4];
void *qdp_inlink_backup[1][4];
printfQuda("Calculating reference implementation...");
// ...
default: errorQuda("Test type not defined");
static bool has_been_called = false;
if (has_been_called) { errorQuda("This function is not supposed to be called twice.\n"); }
// ...
for (int p = 0; p < pmax; p++) {
  for (int d = 0; d < 4; d++) {
    qdp_fatlink_cpu_backup[p][d] = nullptr;
    qdp_longlink_cpu_backup[p][d] = nullptr;
    qdp_inlink_backup[p][d] = nullptr;
  }
}
// ...
has_been_called = true;
static bool has_been_called = false;
if (has_been_called) { errorQuda("This function is not supposed to be called twice.\n"); }
// ...
for (int p = 0; p < pmax; p++) {
  for (int d = 0; d < 4; d++) {
    if (qdp_inlink_backup[p][d] != nullptr) {
      free(qdp_inlink_backup[p][d]);
      qdp_inlink_backup[p][d] = nullptr;
    }
    if (qdp_fatlink_cpu_backup[p][d] != nullptr) {
      free(qdp_fatlink_cpu_backup[p][d]);
      qdp_fatlink_cpu_backup[p][d] = nullptr;
    }
    if (qdp_longlink_cpu_backup[p][d] != nullptr) {
      free(qdp_longlink_cpu_backup[p][d]);
      qdp_longlink_cpu_backup[p][d] = nullptr;
    }
  }
}
// ...
has_been_called = true;
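// Both routines above use the same call-once pattern: a function-local static
// flag rejects a second call, and the cleanup path frees only cache slots that
// are non-null before resetting them.  A compact standalone sketch of that
// pattern follows; the names pmax_sketch / cache_sketch / cache_free_sketch
// are illustrative only and not taken from the original file.
#include <cstdio>
#include <cstdlib>

constexpr int pmax_sketch = 16;
static void *cache_sketch[pmax_sketch][4];

void cache_free_sketch()
{
  static bool has_been_called = false;
  if (has_been_called) { fprintf(stderr, "not supposed to be called twice\n"); abort(); }
  for (int p = 0; p < pmax_sketch; p++)
    for (int d = 0; d < 4; d++)
      if (cache_sketch[p][d] != nullptr) {
        free(cache_sketch[p][d]);     // release only slots that were populated
        cache_sketch[p][d] = nullptr; // avoid dangling pointers and double frees
      }
  has_been_called = true;
}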
test_split_grid = num_src > 1;
// ...
void *qdp_fatlink_gpu[4];
void *qdp_longlink_gpu[4];
// ...
for (int dir = 0; dir < 4; dir++) {
  // ...
  if (qdp_fatlink_gpu[dir] == NULL || qdp_longlink_gpu[dir] == NULL || qdp_fatlink_cpu[dir] == NULL
      || qdp_longlink_cpu[dir] == NULL) {
    errorQuda("ERROR: malloc failed for fatlink/longlink");
  }
}
// ...
for (int dir = 0; dir < 4; dir++) {
  // ...
}
// ...
ghost_fatlink_cpu = cpuFat->Ghost();
// ...
ghost_longlink_cpu = cpuLong->Ghost();
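// The fragment above keeps the links in "QDP ordering": one separately
// allocated buffer per spacetime direction, checked for allocation failure
// before use, with the CPU ghost zones then taken from the cpuGaugeField
// objects.  A minimal standalone sketch of that allocation, assuming the
// usual 3x3 complex link matrix (18 real numbers) per site and direction;
// alloc_qdp_links, volume and real_size are illustrative names only.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

void alloc_qdp_links(void *links[4], size_t volume, size_t real_size)
{
  const size_t site_size = 18; // 3x3 complex matrix = 18 reals per link
  for (int dir = 0; dir < 4; dir++) {
    links[dir] = malloc(volume * site_size * real_size);
    if (links[dir] == nullptr) {
      fprintf(stderr, "malloc failed for fatlink/longlink (dir %d)\n", dir);
      exit(EXIT_FAILURE);
    }
  }
}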
if (test_split_grid) {
  // ...
  for (int n = 0; n < num_src; n++) {
    // ...
  }
}
// ...
for (int dir = 0; dir < 4; dir++) {
  free(qdp_fatlink_gpu[dir]);
  qdp_fatlink_gpu[dir] = nullptr;
  free(qdp_longlink_gpu[dir]);
  qdp_longlink_gpu[dir] = nullptr;
}
// ...
free(milc_fatlink_cpu);
milc_fatlink_cpu = nullptr;
free(milc_longlink_cpu);
milc_longlink_cpu = nullptr;
// ...
for (int dir = 0; dir < 4; dir++) {
  if (qdp_fatlink_cpu[dir] != nullptr) {
    free(qdp_fatlink_cpu[dir]);
    qdp_fatlink_cpu[dir] = nullptr;
  }
  if (qdp_longlink_cpu[dir] != nullptr) {
    free(qdp_longlink_cpu[dir]);
    qdp_longlink_cpu[dir] = nullptr;
  }
}
// ...
if (dirac != nullptr) {
  // ...
}
// ...
if (tmp != nullptr) {
  // ...
}
// ...
if (tmpCpu != nullptr) {
  // ...
}
// ...
if (test_split_grid) {
  for (auto p : vp_spinor) { delete p; }
  for (auto p : vp_spinor_out) { delete p; }
  // ...
  vp_spinor_out.clear();
}
// ...
free(milc_fatlink_gpu);
milc_fatlink_gpu = nullptr;
free(milc_longlink_gpu);
milc_longlink_gpu = nullptr;
timeval tstart, tstop;
// ...
cudaEventCreate(&start);
cudaEventRecord(start, 0);
cudaEventSynchronize(start);
// ...
cudaEventRecord(start, 0);
// ...
if (test_split_grid) {
  // ...
  _hp_x[i] = vp_spinor_out[i]->V();
  _hp_b[i] = vp_spinor[i]->V();
  // ...
}
// ...
for (int i = 0; i < niter; i++) {
  // ...
  gettimeofday(&tstart, NULL);
  // ...
  default: errorQuda("Test type %d not defined on staggered dslash.\n", static_cast<int>(dtest_type));
  // ...
  gettimeofday(&tstop, NULL);
  long ds = tstop.tv_sec - tstart.tv_sec;
  long dus = tstop.tv_usec - tstart.tv_usec;
  double elapsed = ds + 0.000001 * dus;
  // ...
  if (i > 0 && i < niter) {
    // ...
  }
}
// ...
cudaEventCreate(&end);
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
// ...
cudaEventElapsedTime(&runTime, start, end);
cudaEventDestroy(start);
cudaEventDestroy(end);
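// The timing fragments above combine two clocks: one pair of CUDA events
// brackets the whole loop to give total GPU wall time, while gettimeofday
// brackets each iteration to give per-call CPU times, from which a running
// sum, minimum and maximum (cf. the DslashTime fields) are accumulated.
// Below is a minimal, self-contained sketch of that pattern; the helper name
// time_loop and the launch() callback are illustrative stand-ins for the
// actual dslash call, not part of the original file.
#include <cfloat>
#include <sys/time.h>
#include <cuda_runtime.h>

struct TimeStats { float event_time_ms; double cpu_time, cpu_min, cpu_max; };

TimeStats time_loop(int niter, void (*launch)())
{
  TimeStats t = {0.0f, 0.0, DBL_MAX, 0.0};
  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  cudaEventRecord(start, 0);

  for (int i = 0; i < niter; i++) {
    timeval tstart, tstop;
    gettimeofday(&tstart, NULL);
    launch();
    cudaDeviceSynchronize(); // make the host timer see the kernel finish
    gettimeofday(&tstop, NULL);
    double elapsed = (tstop.tv_sec - tstart.tv_sec) + 1e-6 * (tstop.tv_usec - tstart.tv_usec);
    if (i > 0) { // skip the warm-up iteration, mirroring the i > 0 guard above
      t.cpu_time += elapsed;
      if (elapsed < t.cpu_min) t.cpu_min = elapsed;
      if (elapsed > t.cpu_max) t.cpu_max = elapsed;
    }
  }

  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  cudaEventElapsedTime(&t.event_time_ms, start, end); // reported in milliseconds
  cudaEventDestroy(start);
  cudaEventDestroy(end);
  return t;
}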
560 "Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate "
561 "message size %lu bytes\n",
563 1.0e-9 * 2 * ghost_bytes *
niter / dslash_time.
cpu_time, 1.0e-9 * 2 * ghost_bytes / dslash_time.
cpu_max,
564 1.0e-9 * 2 * ghost_bytes / dslash_time.
cpu_min, 2 * ghost_bytes);
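// Everything in the bandwidth report is derived from the per-iteration halo
// traffic: each application moves ghost_bytes in and ghost_bytes out, so the
// bi-directional message size is 2 * ghost_bytes.  Aggregate traffic over all
// iterations divided by the summed CPU time gives the average rate, and the
// fastest / slowest single iteration bound the maximum / minimum rates.
// A small illustrative helper (report_halo_bandwidth is not a QUDA function):
#include <cstddef>
#include <cstdio>

void report_halo_bandwidth(size_t ghost_bytes, int niter,
                           double cpu_time, double cpu_min, double cpu_max)
{
  double avg_gbs = 1.0e-9 * 2 * ghost_bytes * niter / cpu_time; // average over all timed iterations
  double min_gbs = 1.0e-9 * 2 * ghost_bytes / cpu_max;          // slowest single iteration
  double max_gbs = 1.0e-9 * 2 * ghost_bytes / cpu_min;          // fastest single iteration
  printf("halo bandwidth (GB/s): avg = %f, min = %f, max = %f for message size %zu bytes\n",
         avg_gbs, min_gbs, max_gbs, (size_t)(2 * ghost_bytes));
}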
double deviation = 0.0;
// ...
if (test_split_grid) {
  for (int n = 0; n < num_src; n++) {
    // ...
    double spinor_out_norm2 = blas::norm2(*vp_spinor_out[n]);
    // ...
    if (std::isnan(spinor_ref_norm2)) { failed = true; }
    if (std::isnan(spinor_out_norm2)) { failed = true; }
    // ...
    printfQuda("Results: CPU=%f, CPU-CUDA=%f\n", spinor_ref_norm2, spinor_out_norm2);
    // ...
    if (failed) { deviation = 1.0; }
  }
}
// ...
if (std::isnan(spinor_ref_norm2)) { failed = true; }
if (std::isnan(spinor_out_norm2)) { failed = true; }
// ...
printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
// ...
if (failed) { deviation = 1.0; }
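// Verification above is norm-based: the reference (CPU) and device results are
// reduced to L2 norms with blas::norm2, a NaN in either norm marks the test as
// failed, and any failure forces the reported deviation to 1.0.  A standalone
// sketch of that check on plain host arrays; norm2_host and check_result are
// illustrative helpers, and QUDA's real comparison is more detailed.
#include <cmath>
#include <cstdio>
#include <vector>

double norm2_host(const std::vector<double> &v)
{
  double sum = 0.0;
  for (double x : v) sum += x * x;
  return sum;
}

double check_result(const std::vector<double> &ref, const std::vector<double> &out)
{
  bool failed = false;
  double ref_norm2 = norm2_host(ref);
  double out_norm2 = norm2_host(out);
  if (std::isnan(ref_norm2)) { failed = true; }
  if (std::isnan(out_norm2)) { failed = true; }
  printf("Results: CPU=%f, CUDA=%f\n", ref_norm2, out_norm2);
  double deviation = 0.0; // a full test would also compare the fields element by element
  if (failed) { deviation = 1.0; }
  return deviation;
}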