26 #include <gtest/gtest.h> 30 #define MAX(a,b) ((a)>(b)?(a):(b)) 32 #define staggeredSpinorSiteSize 6 37 extern void usage(
char** argv );
39 void *
qdp_inlink[4] = {
nullptr,
nullptr,
nullptr,
nullptr };
106 gaugeParam.
X[0] =
X[0] =
xdim;
107 gaugeParam.
X[1] =
X[1] =
ydim;
108 gaugeParam.
X[2] =
X[2] =
zdim;
109 gaugeParam.
X[3] =
X[3] =
tdim;
130 gaugeParam.
scale = -1.0 / 24.0;
135 gaugeParam.
scale = 1.0;
143 int tmpint =
MAX(
X[1] *
X[2] *
X[3], X[0] * X[2] * X[3]);
144 tmpint =
MAX(tmpint, X[0] * X[1] * X[3]);
145 tmpint =
MAX(tmpint, X[0] * X[1] * X[2]);
147 gaugeParam.
ga_pad = tmpint;
168 int tmpint =
MAX(
X[1]*
X[2]*
X[3], X[0]*X[2]*X[3]);
169 tmpint =
MAX(tmpint, X[0]*X[1]*X[3]);
170 tmpint =
MAX(tmpint, X[0]*X[1]*X[2]);
172 inv_param.
sp_pad = tmpint;
199 void* qdp_fatlink_gpu[4];
200 void* qdp_longlink_gpu[4];
202 for (
int dir = 0; dir < 4; dir++) {
209 if (qdp_fatlink_gpu[dir] == NULL || qdp_longlink_gpu[dir] == NULL ||
211 errorQuda(
"ERROR: malloc failed for fatlink/longlink");
216 for (
int dir = 0; dir < 4; dir++) {
239 for (
int dir = 0; dir < 4; dir++) {
252 for (
int dir = 0; dir < 4; dir++) {
284 int x_face_size =
X[1]*
X[2]*
X[3]/2;
285 int y_face_size =
X[0]*
X[2]*
X[3]/2;
286 int z_face_size =
X[0]*
X[1]*
X[3]/2;
287 int t_face_size =
X[0]*
X[1]*
X[2]/2;
288 int pad_size =
MAX(x_face_size, y_face_size);
289 pad_size =
MAX(pad_size, z_face_size);
290 pad_size =
MAX(pad_size, t_face_size);
291 gaugeParam.
ga_pad = pad_size;
313 gaugeParam.
ga_pad = 3*pad_size;
331 for (
int d = 0; d < 4; d++) { csParam.
x[d] = gaugeParam.
X[d]; }
365 cudaDeviceSynchronize();
375 for (
int dir = 0; dir < 4; dir++) {
376 free(qdp_fatlink_gpu[dir]); qdp_fatlink_gpu[dir] =
nullptr;
377 free(qdp_longlink_gpu[dir]); qdp_longlink_gpu[dir] =
nullptr;
379 free(milc_fatlink_gpu); milc_fatlink_gpu =
nullptr;
380 free(milc_longlink_gpu); milc_longlink_gpu =
nullptr;
381 free(milc_fatlink_cpu); milc_fatlink_cpu =
nullptr;
382 free(milc_longlink_cpu); milc_longlink_cpu =
nullptr;
391 for (
int dir = 0; dir < 4; dir++) {
396 if (dirac !=
nullptr) {
400 if (cudaSpinor !=
nullptr) {
402 cudaSpinor =
nullptr;
404 if (cudaSpinorOut !=
nullptr) {
406 cudaSpinorOut =
nullptr;
408 if (tmp !=
nullptr) {
413 if (spinor !=
nullptr) {
delete spinor; spinor =
nullptr; }
414 if (spinorOut !=
nullptr) {
delete spinorOut; spinorOut =
nullptr; }
415 if (spinorRef !=
nullptr) {
delete spinorRef; spinorRef =
nullptr; }
416 if (tmpCpu !=
nullptr) {
delete tmpCpu; tmpCpu =
nullptr; }
420 if (cpuFat) {
delete cpuFat; cpuFat =
nullptr; }
421 if (cpuLong) {
delete cpuLong; cpuLong =
nullptr; }
433 DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
439 timeval tstart, tstop;
441 cudaEvent_t start,
end;
442 cudaEventCreate(&start);
443 cudaEventRecord(start, 0);
444 cudaEventSynchronize(start);
447 cudaEventRecord(start, 0);
449 for (
int i = 0; i <
niter; i++) {
451 gettimeofday(&tstart, NULL);
454 case 0: dirac->
Dslash(*cudaSpinorOut, *cudaSpinor,
parity);
break;
455 case 1: dirac->
M(*cudaSpinorOut, *cudaSpinor);
break;
456 case 2: dirac->
M(*cudaSpinorOut, *cudaSpinor);
break;
459 gettimeofday(&tstop, NULL);
460 long ds = tstop.tv_sec - tstart.tv_sec;
461 long dus = tstop.tv_usec - tstart.tv_usec;
462 double elapsed = ds + 0.000001*dus;
466 if (i>0 && i<niter) {
472 cudaEventCreate(&end);
473 cudaEventRecord(end, 0);
474 cudaEventSynchronize(end);
476 cudaEventElapsedTime(&runTime, start, end);
477 cudaEventDestroy(start);
478 cudaEventDestroy(end);
483 cudaError_t stat = cudaGetLastError();
484 if (stat != cudaSuccess)
485 errorQuda(
"with ERROR: %s\n", cudaGetErrorString(stat));
527 ASSERT_LE(deviation, tol) <<
"CPU and CUDA implementations do not agree";
533 for (
int dir = 0; dir < 4; dir++) {
539 cudaSpinor =
nullptr;
540 cudaSpinorOut =
nullptr;
556 for (
int i=0; i<attempts; i++) {
578 if (std::isnan(spinor_ref_norm2)) { failed =
true; }
579 if (std::isnan(spinor_out_norm2)) { failed =
true; }
585 printfQuda(
"Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for " 586 "aggregate message size %lu bytes\n",
593 double cuda_spinor_out_norm2 =
blas::norm2(*cudaSpinorOut);
594 printfQuda(
"Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
597 test_rc = RUN_ALL_TESTS();
598 if (test_rc != 0 || failed)
warningQuda(
"Tests failed");
609 printfQuda(
"prec recon test_type dagger S_dim T_dimension\n");
622 printfQuda(
" --test <0/1/2> # Test method\n");
629 int main(
int argc,
char **argv)
636 ::testing::InitGoogleTest(&argc, argv);
637 for (
int i=1 ;i < argc; i++){
641 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
651 warningQuda(
"The dslash_type %d isn't staggered, asqtad, or laplace. Defaulting to asqtad.\n",
dslash_type);
658 errorQuda(
"Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".\n");
666 printfQuda(
"Note: epsilon-naik != 0, testing epsilon correction links.\n");
669 printfQuda(
"Not computing fat-long, ignoring epsilon correction.\n");
672 printfQuda(
"Note: epsilon-naik = 0, testing original HISQ links.\n");
686 errorQuda(
"Half precision unsupported in fat/long compute");
690 errorQuda(
"Half precision unsupported for Laplace operator.\n");
void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik)
int dimPartitioned(int dim)
QudaDiracFieldOrder dirac_order
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
QudaMassNormalization mass_normalization
QudaDslashType dslash_type
QudaReconstructType reconstruct_sloppy
cpuColorSpinorField * spinorOut
void setInvertParam(QudaInvertParam &inv_param)
QudaGhostExchange ghostExchange
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
enum QudaPrecision_s QudaPrecision
void dw_setDims(int *X, const int L5)
int main(int argc, char **argv)
DslashTime dslashCUDA(int niter)
void ** ghost_fatlink_cpu
cpuColorSpinorField * spinorRef
void ** ghost_longlink_cpu
QudaReconstructType link_recon
double norm2(const ColorSpinorField &a)
QudaDslashType dslash_type
QudaGaugeParam gaugeParam
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
int process_command_line_option(int argc, char **argv, int *idx)
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
QudaStaggeredPhase staggered_phase_type
void * qdp_fatlink_cpu[4]
cpuColorSpinorField * spinor
cudaColorSpinorField * cudaSpinor
QudaGaugeFieldOrder gauge_order
const char * get_prec_str(QudaPrecision prec)
unsigned long long Flops() const
QudaSiteSubset siteSubset
QudaFieldLocation input_location
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
#define staggeredSpinorSiteSize
cpuColorSpinorField * tmpCpu
void usage_extra(char **argv)
void initQuda(int device)
cudaColorSpinorField * tmp
QudaFieldLocation output_location
QudaFieldOrder fieldOrder
void * qdp_longlink_cpu[4]
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
void setSpinorSiteSize(int n)
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
QudaGammaBasis gammaBasis
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity)
QudaGammaBasis gamma_basis
void staggered_dslash(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
cudaColorSpinorField * cudaSpinorOut
QudaPrecision cuda_prec_sloppy
const void ** Ghost() const
enum QudaDagType_s QudaDagType
QudaReconstructType link_recon_sloppy
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
QudaPrecision prec_sloppy
void setGaugeParam(QudaGaugeParam &gaugeParam)
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
void * memset(void *s, int c, size_t n)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
double getTolerance(QudaPrecision prec)
enum QudaReconstructType_s QudaReconstructType
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
Main header file for the QUDA library.
QudaInvertParam inv_param
enum QudaDslashType_s QudaDslashType
void staggeredDslashRef()
enum QudaVerbosity_s QudaVerbosity
__device__ void axpy(real a, const real *x, Link &y)
static Dirac * create(const DiracParam ¶m)
void initComms(int argc, char **argv, int *const commDims)
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
size_t GhostBytes() const
int gridsize_from_cmdline[]
void setVerbosity(QudaVerbosity verbosity)
QudaGaugeParam newQudaGaugeParam(void)