30 extern void usage(
char** argv);
352 gauge_force_test(
void)
360 qudaGaugeParam.
X[0] =
xdim;
361 qudaGaugeParam.
X[1] =
ydim;
362 qudaGaugeParam.
X[2] =
zdim;
363 qudaGaugeParam.
X[3] =
tdim;
375 int gSize = qudaGaugeParam.
cpu_prec;
381 if (cudaMallocHost(&sitelink_1d, 4*
V*
gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
382 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_1d\n");
387 if(sitelink_1d == NULL){
388 printf(
"ERROR: malloc failed for sitelink_1d\n");
394 void* sitelink_2d[4];
395 for(
int i=0;i < 4;i++){
397 if(cudaMallocHost(&sitelink_2d[i],
V*
gaugeSiteSize*qudaGaugeParam.
cpu_prec) == cudaErrorMemoryAllocation) {
398 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_2d\n");
401 sitelink_2d[i] = malloc(
V*gaugeSiteSize*qudaGaugeParam.
cpu_prec);
411 for(
int i=0; i <
V; i++){
418 sitelink = sitelink_1d;
420 sitelink = (
void**)sitelink_2d;
425 void* sitelink_ex_2d[4];
426 void* sitelink_ex_1d;
428 if (cudaMallocHost((
void**)&sitelink_ex_1d, 4*
V_ex*
gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
429 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_ex_1d\n");
431 for(
int i=0;i < 4;i++){
432 if (cudaMallocHost((
void**)&sitelink_ex_2d[i],
V_ex*
gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
433 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_ex_2d\n");
435 if(sitelink_ex_2d[i] == NULL){
436 errorQuda(
"ERROR; allocate sitelink_ex[%d] failed\n", i);
445 for(
int i=0; i <
V_ex; i++){
459 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
462 if( x1< 2 || x1 >= X1 +2
463 || x2< 2 || x2 >= X2 +2
464 || x3< 2 || x3 >= X3 +2
465 || x4< 2 || x4 >= X4 +2){
469 x1 = (x1 - 2 +
X1) % X1;
470 x2 = (x2 - 2 +
X2) % X2;
471 x3 = (x3 - 2 +
X3) % X3;
472 x4 = (x4 - 2 +
X4) % X4;
474 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+
x1)>>1;
479 char* src = (
char*)sitelink_2d[
dir];
480 char* dst = (
char*)sitelink_ex_2d[
dir];
487 for(
int i=0; i <
V_ex; i++){
495 sitelink_ex = sitelink_ex_2d;
497 sitelink_ex = sitelink_ex_1d;
506 if(mom == NULL || refmom == NULL){
507 printf(
"ERROR: malloc failed for mom/refmom\n");
516 double loop_coeff_d[
sizeof(
loop_coeff_f)/
sizeof(
float)];
517 for(
unsigned int i=0;i <
sizeof(
loop_coeff_f)/
sizeof(
float); i++){
525 loop_coeff = loop_coeff_d;
530 int** input_path_buf[4];
532 input_path_buf[
dir] = (
int**)malloc(num_paths*
sizeof(
int*));
533 if (input_path_buf[
dir] == NULL){
534 printf(
"ERORR: malloc failed for input path\n");
538 for(
int i=0;i < num_paths;i++){
539 input_path_buf[
dir][i] = (
int*)malloc(
length[i]*
sizeof(
int));
540 if (input_path_buf[
dir][i] == NULL){
541 printf(
"ERROR: malloc failed for input_path_buf[dir][%d]\n", i);
556 struct timeval t0, t1;
560 gettimeofday(&t0, NULL);
563 loop_coeff, num_paths, max_length, eb3,
564 &qudaGaugeParam, timeinfo);
568 loop_coeff, num_paths, max_length, eb3,
569 &qudaGaugeParam, timeinfo);
571 gettimeofday(&t1, NULL);
574 double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
583 int R[4] = {2, 2, 2, 2};
587 input_path_buf,
length, loop_coeff, num_paths);
590 input_path_buf,
length, loop_coeff, num_paths);
601 printf(
"Test %s\n",(1 == res) ?
"PASSED" :
"FAILED");
603 double perf = 1.0* flops*V/(total_time*1e+9);
604 double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
605 printf(
"init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f total time =%.2f ms\n",
606 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
607 printf(
"kernel performance: %.2f GFLOPS, overall performance : %.2f GFOPS\n", kernel_perf, perf);
610 for(
int i=0;i < num_paths; i++){
611 free(input_path_buf[
dir][i]);
613 free(input_path_buf[
dir]);
617 cudaFreeHost(sitelink_1d);
623 cudaFreeHost(sitelink_2d[
dir]);
625 free(sitelink_2d[
dir]);
630 cudaFreeHost(sitelink_ex_1d);
632 cudaFreeHost(sitelink_ex_2d[
dir]);
643 printf(
"Warning: you test failed. \n");
644 printf(
" Did you use --verify?\n");
645 printf(
" Did you check the GPU health by running cuda memtest?\n");
648 return accuracy_level;
655 printf(
"running the following test:\n");
657 printf(
"link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order Attempts\n");
658 printf(
"%s %s %d/%d/%d %d %s %d\n",
671 printf(
"Extra options:\n");
672 printf(
" --gauge-order <qdp/milc> # Gauge storing order in CPU\n");
673 printf(
" --attempts <n> # Number of tests\n");
674 printf(
" --verify # Verify the GPU results using CPU results\n");
682 for (i =1;i < argc; i++){
688 if( strcmp(argv[i],
"--gauge-order") == 0){
693 if(strcmp(argv[i+1],
"milc") == 0){
695 }
else if(strcmp(argv[i+1],
"qdp") == 0){
698 fprintf(stderr,
"Error: unsupported gauge-field order\n");
704 if( strcmp(argv[i],
"--attempts") == 0){
709 attempts = atoi(argv[i+1]);
711 printf(
"ERROR: invalid number of attempts(%d)\n", attempts);
717 if( strcmp(argv[i],
"--verify") == 0){
722 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
733 int accuracy_level = gauge_force_test();
734 printfQuda(
"accuracy_level=%d\n", accuracy_level);
739 if(accuracy_level >=3 ){