23 #if defined(QMP_COMMS) 25 #elif defined(MPI_COMMS) 32 #include <gtest/gtest.h> 36 #define MAX(a,b) ((a)>(b)?(a):(b)) 37 #define staggeredSpinorSiteSize 6 41 extern void usage(
char** argv );
45 void *
qdp_inlink[4] = {
nullptr,
nullptr,
nullptr,
nullptr };
101 const char *
prec_str[] = {
"quarter",
"half",
"single",
"double"};
129 gauge_param.
X[0] =
X[0] =
xdim;
130 gauge_param.
X[1] =
X[1] =
ydim;
131 gauge_param.
X[2] =
X[2] =
zdim;
132 gauge_param.
X[3] =
X[3] =
tdim;
159 gauge_param.
scale = -1.0 / 24.0;
162 gauge_param.
scale = 1.0;
185 int tmpint =
MAX(
X[1] *
X[2] *
X[3], X[0] * X[2] * X[3]);
186 tmpint =
MAX(tmpint, X[0] * X[1] * X[3]);
187 tmpint =
MAX(tmpint, X[0] * X[1] * X[2]);
189 gauge_param.
ga_pad = tmpint;
190 inv_param.
sp_pad = tmpint;
201 void* qdp_fatlink_gpu[4];
202 void* qdp_longlink_gpu[4];
204 for (
int dir = 0; dir < 4; dir++) {
211 if (qdp_fatlink_gpu[dir] == NULL || qdp_longlink_gpu[dir] == NULL ||
213 errorQuda(
"ERROR: malloc failed for fatlink/longlink");
218 for (
int dir = 0; dir < 4; dir++) {
245 for (
int dir = 0; dir < 4; dir++) {
258 for (
int dir = 0; dir < 4; dir++) {
290 int x_face_size = X[1]*X[2]*X[3]/2;
291 int y_face_size = X[0]*X[2]*X[3]/2;
292 int z_face_size = X[0]*X[1]*X[3]/2;
293 int t_face_size = X[0]*X[1]*X[2]/2;
294 int pad_size =
MAX(x_face_size, y_face_size);
295 pad_size =
MAX(pad_size, z_face_size);
296 pad_size =
MAX(pad_size, t_face_size);
297 gauge_param.
ga_pad = pad_size;
314 gauge_param.
ga_pad = 3 * pad_size;
330 for (
int d = 0; d < 4; d++) { csParam.
x[d] = gauge_param.
X[d]; }
370 cudaDeviceSynchronize();
384 for (
int dir = 0; dir < 4; dir++) {
385 free(qdp_fatlink_gpu[dir]); qdp_fatlink_gpu[dir] =
nullptr;
386 free(qdp_longlink_gpu[dir]); qdp_longlink_gpu[dir] =
nullptr;
388 free(milc_fatlink_gpu); milc_fatlink_gpu =
nullptr;
389 free(milc_longlink_gpu); milc_longlink_gpu =
nullptr;
390 free(milc_fatlink_cpu); milc_fatlink_cpu =
nullptr;
391 free(milc_longlink_cpu); milc_longlink_cpu =
nullptr;
400 for (
int dir = 0; dir < 4; dir++) {
405 if (dirac !=
nullptr) {
409 if (cudaSpinor !=
nullptr) {
411 cudaSpinor =
nullptr;
413 if (cudaSpinorOut !=
nullptr) {
415 cudaSpinorOut =
nullptr;
417 if (tmp !=
nullptr) {
422 if (spinor !=
nullptr) {
delete spinor; spinor =
nullptr; }
423 if (spinorOut !=
nullptr) {
delete spinorOut; spinorOut =
nullptr; }
424 if (spinorRef !=
nullptr) {
delete spinorRef; spinorRef =
nullptr; }
425 if (tmpCpu !=
nullptr) {
delete tmpCpu; tmpCpu =
nullptr; }
429 if (cpuFat) {
delete cpuFat; cpuFat =
nullptr; }
430 if (cpuLong) {
delete cpuLong; cpuLong =
nullptr; }
440 DslashTime() : event_time(0.0), cpu_time(0.0), cpu_min(DBL_MAX), cpu_max(0.0) {}
446 timeval tstart, tstop;
448 cudaEvent_t start,
end;
449 cudaEventCreate(&start);
450 cudaEventRecord(start, 0);
451 cudaEventSynchronize(start);
454 cudaEventRecord(start, 0);
456 for (
int i = 0; i <
niter; i++) {
458 gettimeofday(&tstart, NULL);
461 case 0: dirac->
Dslash(*cudaSpinorOut, *cudaSpinor,
parity);
break;
462 case 1: dirac->
M(*cudaSpinorOut, *cudaSpinor);
break;
463 case 2: dirac->
M(*cudaSpinorOut, *cudaSpinor);
break;
466 gettimeofday(&tstop, NULL);
467 long ds = tstop.tv_sec - tstart.tv_sec;
468 long dus = tstop.tv_usec - tstart.tv_usec;
469 double elapsed = ds + 0.000001*dus;
473 if (i>0 && i<niter) {
479 cudaEventCreate(&end);
480 cudaEventRecord(end, 0);
481 cudaEventSynchronize(end);
483 cudaEventElapsedTime(&runTime, start, end);
484 cudaEventDestroy(start);
485 cudaEventDestroy(end);
490 cudaError_t stat = cudaGetLastError();
491 if (stat != cudaSuccess)
492 errorQuda(
"with ERROR: %s\n", cudaGetErrorString(stat));
537 printfQuda(
"prec recon test_type dagger S_dim T_dimension\n");
544 using ::testing::TestWithParam;
545 using ::testing::Bool;
546 using ::testing::Values;
547 using ::testing::Range;
548 using ::testing::Combine;
560 using ::testing::TestWithParam;
561 using ::testing::Bool;
562 using ::testing::Values;
563 using ::testing::Range;
564 using ::testing::Combine;
568 ::testing::tuple<int, int, int>
param;
574 if ((QUDA_PRECISION &
getPrecision(::testing::get<0>(GetParam()))) == 0
580 && (::testing::get<0>(GetParam()) == 0 || ::testing::get<0>(GetParam()) == 1)) {
581 warningQuda(
"Fixed precision unsupported in fat/long compute, skipping...");
586 warningQuda(
"Reconstruct 9 unsupported in fat/long compute, skipping...");
591 warningQuda(
"Fixed precision unsupported for Laplace operator, skipping...");
600 int prec = ::testing::get<0>(GetParam());
603 if (skip()) GTEST_SKIP();
605 int value = ::testing::get<2>(GetParam());
606 for(
int j=0; j < 4;j++){
607 if (value & (1 << j)){
614 for (
int dir = 0; dir < 4; dir++) {
620 cudaSpinor =
nullptr;
621 cudaSpinorOut =
nullptr;
629 init(prec, recon, value);
635 if (skip()) GTEST_SKIP();
648 double deviation = 1.0;
654 if (spinorRef !=
nullptr) {
675 if (std::isnan(spinor_ref_norm2)) { failed =
true; }
676 if (std::isnan(spinor_out_norm2)) { failed =
true; }
678 double cuda_spinor_out_norm2 =
blas::norm2(*cudaSpinorOut);
679 printfQuda(
"Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
681 if (failed) { deviation = 1.0; }
683 ASSERT_LE(deviation, tol) <<
"CPU and CUDA implementations do not agree";
703 double gflops = 1.0e-9 * flops / dslash_time.
event_time;
705 RecordProperty(
"Gflops", std::to_string(gflops));
708 RecordProperty(
"Halo_bidirectitonal_BW_CPU", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() *
niter / dslash_time.
cpu_time);
709 RecordProperty(
"Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() / dslash_time.
cpu_max);
710 RecordProperty(
"Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * cudaSpinor->
GhostBytes() / dslash_time.
cpu_min);
711 RecordProperty(
"Halo_message_size_bytes", 2 * cudaSpinor->
GhostBytes());
713 printfQuda(
"Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for aggregate " 714 "message size %lu bytes\n",
721 int main(
int argc,
char **argv)
732 for (
int p = 0; p < pmax; p++) {
733 for (
int d = 0; d < 4; d++) {
741 ::testing::InitGoogleTest(&argc, argv);
742 for (
int i = 1; i < argc; i++) {
746 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
756 warningQuda(
"The dslash_type %d isn't staggered, asqtad, or laplace. Defaulting to asqtad.\n",
dslash_type);
763 errorQuda(
"Cannot load a gauge field and test the ASQTAD/HISQ operator without setting \"--compute-fat-long true\".\n");
772 printfQuda(
"Note: epsilon-naik != 0, testing epsilon correction links.\n");
775 printfQuda(
"Not computing fat-long, ignoring epsilon correction.\n");
778 printfQuda(
"Note: epsilon-naik = 0, testing original HISQ links.\n");
787 int test_rc = RUN_ALL_TESTS();
790 for (
int dir = 0; dir < 4; dir++) {
795 for (
int p = 0; p < pmax; p++) {
796 for (
int d = 0; d < 4; d++) {
815 const int prec = ::testing::get<0>(
param.param);
816 const int recon = ::testing::get<1>(
param.param);
817 const int part = ::testing::get<2>(
param.param);
818 std::stringstream ss;
822 ss <<
"_partition" << part;
836 ::testing::Values(0)),
void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik)
void init(int precision, QudaReconstructType link_recon, int partition)
QudaDiracFieldOrder dirac_order
void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type)
QudaMassNormalization mass_normalization
QudaReconstructType reconstruct_sloppy
void usage_extra(char **argv)
cpuColorSpinorField * spinorOut
QudaGhostExchange ghostExchange
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
virtual ~StaggeredDslashTest()
void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param)
int getReconstructNibble(QudaReconstructType recon)
enum QudaPrecision_s QudaPrecision
void * qdp_inlink_backup[1][4]
void dw_setDims(int *X, const int L5)
INSTANTIATE_TEST_SUITE_P(QUDA, StaggeredDslashTest, Combine(Range(0, 4), ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8), ::testing::Values(0)), getstaggereddslashtestname)
cpuColorSpinorField * tmpCpu
QudaInvertParam inv_param
double norm2(const ColorSpinorField &a)
QudaDslashType dslash_type
QudaDslashType dslash_type
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
void commDimPartitionedSet(int dir)
static void SetUpTestCase()
int process_command_line_option(int argc, char **argv, int *idx)
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
QudaStaggeredPhase staggered_phase_type
std::string getstaggereddslashtestname(testing::TestParamInfo<::testing::tuple< int, int, int >> param)
double getTolerance(QudaPrecision prec)
QudaGaugeFieldOrder gauge_order
void * qdp_longlink_cpu_backup[1][4]
DslashTime dslashCUDA(int niter)
const char * get_prec_str(QudaPrecision prec)
unsigned long long Flops() const
cudaColorSpinorField * cudaSpinor
void * qdp_fatlink_cpu[4]
QudaSiteSubset siteSubset
QudaFieldLocation input_location
cpuColorSpinorField * spinor
void xpay(ColorSpinorField &x, double a, ColorSpinorField &y)
void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc)
QudaSolutionType solution_type
void initQuda(int device)
QudaPrecision getPrecision(int i)
QudaFieldLocation output_location
double benchmark(int kernel, const int niter)
QudaFieldOrder fieldOrder
QudaReconstructType link_recon
void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
void setSpinorSiteSize(int n)
QudaInvertParam newQudaInvertParam(void)
const char * get_recon_str(QudaReconstructType recon)
QudaGammaBasis gammaBasis
::testing::tuple< int, int, int > param
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity)
QudaGammaBasis gamma_basis
void staggered_dslash(cpuColorSpinorField *out, void **fatlink, void **longlink, void **ghost_fatlink, void **ghost_longlink, cpuColorSpinorField *in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
QudaPrecision cuda_prec_sloppy
const void ** Ghost() const
enum QudaDagType_s QudaDagType
enum QudaParity_s QudaParity
QudaReconstructType reconstruct
void staggeredDslashRef()
static int Compare(const cpuColorSpinorField &a, const cpuColorSpinorField &b, const int resolution=1)
Perform a component by component comparison of two color-spinor fields. In doing we normalize with re...
void * memset(void *s, int c, size_t n)
void construct_fat_long_gauge_field(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *param, QudaDslashType dslash_type)
virtual void M(ColorSpinorField &out, const ColorSpinorField &in) const =0
void display_test_info(int precision, QudaReconstructType link_recon)
enum QudaReconstructType_s QudaReconstructType
void commDimPartitionedReset()
Reset the comm dim partioned array to zero,.
Main header file for the QUDA library.
void ** ghost_fatlink_cpu
int main(int argc, char **argv)
void ** ghost_longlink_cpu
enum QudaDslashType_s QudaDslashType
TEST_P(StaggeredDslashTest, verify)
cudaColorSpinorField * cudaSpinorOut
void * qdp_longlink_cpu[4]
__device__ void axpy(real a, const real *x, Link &y)
static Dirac * create(const DiracParam ¶m)
cpuColorSpinorField * spinorRef
void initComms(int argc, char **argv, int *const commDims)
cudaColorSpinorField * tmp
void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[])
virtual void Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const =0
size_t GhostBytes() const
static void TearDownTestCase()
void setVerbosity(QudaVerbosity verbosity)
QudaGaugeParam gauge_param
void updateR()
update the radius for halos.
QudaGaugeParam newQudaGaugeParam(void)
int gridsize_from_cmdline[]
void * qdp_fatlink_cpu_backup[1][4]