// MAX(a,b): expands to the larger of its two arguments using '>' and the
// conditional operator. Both arguments are parenthesised in the expansion, so
// compound expressions can be passed safely with respect to precedence.
// NOTE: this is a textual macro, so the argument that ends up selected is
// evaluated twice (once in the comparison, once as the result). Do not pass
// expressions with side effects such as i++ or function calls that mutate state.
// In this file it is chained to reduce the x/y/z/t face sizes into pad_size.
22 #define MAX(a,b) ((a)>(b)?(a):(b))
66 void init(
int argc,
char **argv) {
79 errorQuda(
"Asqtad not supported. Please try staggered_dslash_test instead");
134 errorQuda(
"Gauge and spinor CPU precisions must match");
141 #ifndef MULTI_GPU // free parameter for single GPU
143 #else // must be this one c/b face for multi gpu
148 int pad_size =
MAX(x_face_size, y_face_size);
149 pad_size =
MAX(pad_size, z_face_size);
150 pad_size =
MAX(pad_size, t_face_size);
315 printfQuda(
"Source: CPU = %e, CUDA = %e\n", cpu_norm, cuda_norm);
359 cudaEvent_t start,
end;
360 cudaEventCreate(&start);
361 cudaEventCreate(&end);
362 cudaEventRecord(start, 0);
364 for (
int i = 0; i <
niter; i++) {
392 cudaEventRecord(end, 0);
393 cudaEventSynchronize(end);
395 cudaEventElapsedTime(&runTime, start, end);
396 cudaEventDestroy(start);
397 cudaEventDestroy(end);
399 double secs = runTime / 1000;
402 cudaError_t stat = cudaGetLastError();
403 if (stat != cudaSuccess)
404 printfQuda(
"with ERROR: %s\n", cudaGetErrorString(stat));
412 printfQuda(
"Calculating reference implementation...");
452 void *ref2 =
cpu_prec ==
sizeof(double) ? (
void*)((
double*)ref1 + tm_offset): (
void*)((
float*)ref1 + tm_offset);
455 void *flv2 =
cpu_prec ==
sizeof(double) ? (
void*)((
double*)flv1 + tm_offset): (
void*)((
float*)flv1 + tm_offset);
469 void *ref2 =
cpu_prec ==
sizeof(double) ? (
void*)((
double*)ref1 + tm_offset): (
void*)((
float*)ref1 + tm_offset);
472 void *flv2 =
cpu_prec ==
sizeof(double) ? (
void*)((
double*)flv1 + tm_offset): (
void*)((
float*)flv1 + tm_offset);
474 tm_ndeg_matpc(ref1, ref2,
hostGauge, flv1, flv2,
inv_param.
kappa,
inv_param.
mu,
inv_param.
epsilon,
inv_param.
matpc_type, dagger,
inv_param.
cpu_prec,
gauge_param);
485 void *oddOut =
cpu_prec ==
sizeof(double) ? (
void*)((
double*)evenOut + tm_offset): (
void*)((
float*)evenOut + tm_offset);
488 void *oddIn =
cpu_prec ==
sizeof(double) ? (
void*)((
double*)evenIn + tm_offset): (
void*)((
float*)evenIn + tm_offset);
490 tm_ndeg_mat(evenOut, oddOut,
hostGauge, evenIn, oddIn,
inv_param.
kappa,
inv_param.
mu,
inv_param.
epsilon, dagger,
inv_param.
cpu_prec,
gauge_param);
538 printf(
"Test type not supported for domain wall\n");
554 printfQuda(
"prec recon test_type dagger S_dim T_dimension Ls_dimension dslash_type niter\n");
555 printfQuda(
"%s %s %d %d %d/%d/%d %d %d %s %d\n",
// Forward declaration of usage(); no definition is visible in this excerpt.
// It takes a char** (presumably argv, and presumably it prints command-line
// help — TODO confirm against the definition).
570 extern void usage(
char**);
573 int main(
int argc,
char **argv)
576 for (
int i =1;i < argc; i++){
581 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
592 printfQuda(
"\nSpinor mem: %.3f GiB\n", spinorGiB);
609 #ifdef DSLASH_PROFILING
610 printDslashProfile();
618 unsigned long long flops = 0;
620 int spinor_floats =
test_type ? 2*(7*24+24)+24 : 7*24+24;
622 spinor_floats +=
test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2;
627 printfQuda(
"GFLOPS = %f\n", 1.0e-9*flops/secs);
635 printfQuda(
"Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
639 printfQuda(
"Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda);