18 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
23 extern void usage(
char** argv);
93 int linksize = prec*recon;
96 int matrix_mul_flops = 198;
97 int matrix_add_flops = 18;
99 int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
100 int middle_link_data_io[6][2] = {
108 int middle_link_data_flops[6][2] = {
118 int num_calls_side_link[2]= {192, 48};
119 int side_link_data_io[2][2] = {
123 int side_link_data_flops[2][2] = {
130 int num_calls_all_link[2] ={192, 192};
131 int all_link_data_io[2][2] = {
135 int all_link_data_flops[2][2] = {
142 for(
int i = 0;i < 6; i++){
143 total_io += num_calls_middle_link[i]
144 *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
147 for(
int i = 0;i < 2; i++){
148 total_io += num_calls_side_link[i]
149 *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
151 for(
int i = 0;i < 2; i++){
152 total_io += num_calls_all_link[i]
153 *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
158 double total_flops = 0;
159 for(
int i = 0;i < 6; i++){
160 total_flops += num_calls_middle_link[i]
161 *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
164 for(
int i = 0;i < 2; i++){
165 total_flops += num_calls_side_link[i]
166 *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
168 for(
int i = 0;i < 2; i++){
169 total_flops += num_calls_all_link[i]
170 *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
175 *flops = total_flops;
177 printfQuda(
"flop/byte =%.1f\n", total_flops/total_io);
188 qudaGaugeParam.
X[0] =
xdim;
189 qudaGaugeParam.
X[1] =
ydim;
190 qudaGaugeParam.
X[2] =
zdim;
191 qudaGaugeParam.
X[3] =
tdim;
205 memcpy(&qudaGaugeParam_ex, &qudaGaugeParam,
sizeof(
QudaGaugeParam));
206 qudaGaugeParam_ex.
X[0] = qudaGaugeParam.
X[0] + 4;
207 qudaGaugeParam_ex.
X[1] = qudaGaugeParam.
X[1] + 4;
208 qudaGaugeParam_ex.
X[2] = qudaGaugeParam.
X[2] + 4;
209 qudaGaugeParam_ex.
X[3] = qudaGaugeParam.
X[3] + 4;
225 int gSize = qudaGaugeParam.
cpu_prec;
227 for(
int i=0;i < 4;i++){
230 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_2d\n");
233 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_ex_2d\n");
240 errorQuda(
"malloc failed for siteLink_2d/siteLink_ex_2d\n");
256 for(
int i=0; i <
V_ex; i++){
270 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
274 if( x1< 2 || x1 >= X1 +2
275 || x2< 2 || x2 >= X2 +2
276 || x3< 2 || x3 >= X3 +2
277 || x4< 2 || x4 >= X4 +2){
283 x1 = (x1 - 2 +
X1) % X1;
284 x2 = (x2 - 2 +
X2) % X2;
285 x3 = (x3 - 2 +
X3) % X3;
286 x4 = (x4 - 2 +
X4) % X4;
288 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+
x1)>>1;
295 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
304 for(
int i = 0;i <
V; i++){
307 memcpy(dst + (4*i+
dir)*gaugeSiteSize*
link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize \
314 for(
int i = 0;i <
V; i++){
317 memcpy(dst + (4*i+
dir)*gaugeSiteSize*
link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec);
324 memcpy(dst, src, V*gaugeSiteSize*
link_prec);
330 errorQuda(
"multi_gpu milc is not supported\n");
335 memcpy(dst, src, V_ex*gaugeSiteSize*
link_prec);
343 gParam_ex.precision =
prec;
392 fprintf(stderr,
"ERROR: malloc failed for hw\n");
418 for(
int i=0; i <
V_ex; i++){
427 int x1h = sid - za*
E1h;
432 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
433 int x1 = 2*x1h +
x1odd;
436 if( x1< 2 || x1 >= X1 +2
437 || x2< 2 || x2 >= X2 +2
438 || x3< 2 || x3 >= X3 +2
439 || x4< 2 || x4 >= X4 +2){
445 x1 = (x1 - 2 +
X1) % X1;
446 x2 = (x2 - 2 +
X2) % X2;
447 x3 = (x3 - 2 +
X3) % X3;
448 x4 = (x4 - 2 +
X4) % X4;
450 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+
x1)>>1;
457 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
459 src = ((
char**)cpuLongLinkOprod->Gauge_p())[
dir];
461 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
483 for(
int i = 0;i < 4; i++){
530 hisq_force_test(
void)
542 float act_path_coeff[6];
544 act_path_coeff[0] = 0.625000;
545 act_path_coeff[1] = -0.058479;
546 act_path_coeff[2] = -0.087719;
547 act_path_coeff[3] = 0.030778;
548 act_path_coeff[4] = -0.007200;
549 act_path_coeff[5] = -0.123113;
553 double d_act_path_coeff[6];
554 for(
int i=0; i<6; ++i){
555 d_act_path_coeff[i] = act_path_coeff[i];
563 int R[4] = {2, 2, 2, 2};
588 struct timeval ht0, ht1;
589 gettimeofday(&ht0, NULL);
615 gettimeofday(&ht1, NULL);
617 struct timeval t0, t1, t2, t3;
619 gettimeofday(&t0, NULL);
623 cudaDeviceSynchronize();
624 gettimeofday(&t1, NULL);
630 cudaDeviceSynchronize();
632 gettimeofday(&t2, NULL);
636 cudaDeviceSynchronize();
637 gettimeofday(&t1, NULL);
643 cudaDeviceSynchronize();
644 gettimeofday(&t2, NULL);
665 cudaDeviceSynchronize();
667 gettimeofday(&t3, NULL);
677 printfQuda(
"Test %s\n",(1 == res) ?
"PASSED" :
"FAILED");
683 float perf_flops = total_flops / (
TDIFF(t0, t1)) *1e-9;
684 float perf = total_io / (
TDIFF(t0, t1)) *1e-9;
685 printfQuda(
"Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n",
TDIFF(t0,t1)*1000, perf_flops, perf);
686 printfQuda(
"Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n",
TDIFF(t0,t1)*1000,
TDIFF(t1,t2)*1000,
TDIFF(t2,t3)*1000);
687 printfQuda(
"Host time (half-wilson fermion force) : %g ms\n",
TDIFF(ht0, ht1)*1000);
691 return accuracy_level;
698 printfQuda(
"running the following fermion force computation test:\n");
700 printfQuda(
"link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
714 printfQuda(
" --no_verify # Do not verify the GPU results using CPU results\n");
721 for (i =1;i < argc; i++){
727 if( strcmp(argv[i],
"--gauge-order") == 0){
732 if(strcmp(argv[i+1],
"milc") == 0){
734 }
else if(strcmp(argv[i+1],
"qdp") == 0){
737 fprintf(stderr,
"Error: unsupported gauge-field order\n");
744 if( strcmp(argv[i],
"--no_verify") == 0){
748 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
754 errorQuda(
"Multi-gpu for milc order is not supported\n");
764 int accuracy_level = hisq_force_test();
768 if(accuracy_level >=3 ){