6 #include <cuda_runtime.h>
/* Elapsed time in seconds going from `a` to `b`, i.e. (b - a), where both
 * arguments are objects with `tv_sec` and `tv_usec` members (as filled in by
 * gettimeofday). The microsecond difference is scaled by 1e-6 and added to
 * the whole-second difference, so the result is a double.
 *
 * Both arguments are wrapped in parentheses so that non-trivial expressions
 * such as `*p` or `*(p + 1)` bind correctly before the member access.
 * Each argument is evaluated twice; do not pass expressions with side effects.
 */
#define TDIFF(a,b) (((b).tv_sec - (a).tv_sec) + 0.000001*((b).tv_usec - (a).tv_usec))
21 extern void usage(
char** argv);
42 void* ghost_sitelink[4]; // four untyped buffers, one per index i in 0..3; each entry is NULL-checked after allocation and later passed to free()
43 void* ghost_sitelink_diag[16]; // sixteen untyped buffers addressed as [nu*4+mu] with nu, mu in 0..3; entries are reset to NULL per (nu,mu) pair and later passed to free()
55 qudaGaugeParam.
X[0] =
xdim;
56 qudaGaugeParam.
X[1] =
ydim;
57 qudaGaugeParam.
X[2] =
zdim;
58 qudaGaugeParam.
X[3] =
tdim;
74 if (cudaMallocHost((
void**)&fatlink, 4*
V*
gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
75 errorQuda(
"ERROR: cudaMallocHost failed for fatlink\n");
79 for(
int i=0;i < 4;i++){
80 if (cudaMallocHost((
void**)&sitelink[i],
V*
gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
81 errorQuda(
"ERROR: cudaMallocHost failed for sitelink\n");
86 for(
int i=0;i < 4;i++){
87 if (cudaMallocHost((
void**)&sitelink_ex[i],
V_ex*
gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
88 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_ex\n");
95 if(milc_sitelink == NULL){
96 errorQuda(
"ERROR: allocating milc_sitelink failed\n");
99 void* milc_sitelink_ex;
101 if(milc_sitelink_ex == NULL){
102 errorQuda(
"Error: allocating milc_sitelink failed\n");
110 for(
int i=0; i<
V; ++i){
112 char* src = (
char*)sitelink[
dir];
123 for(
int i=0; i <
V_ex; i++){
137 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
141 if( x1< 2 || x1 >= X1 +2
142 || x2< 2 || x2 >= X2 +2
143 || x3< 2 || x3 >= X3 +2
144 || x4< 2 || x4 >= X4 +2){
152 x1 = (x1 - 2 +
X1) % X1;
153 x2 = (x2 - 2 +
X2) % X2;
154 x3 = (x3 - 2 +
X3) % X3;
155 x4 = (x4 - 2 +
X4) % X4;
157 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+
x1)>>1;
162 char* src = (
char*)sitelink[
dir];
163 char* dst = (
char*)sitelink_ex[
dir];
172 double act_path_coeff[6];
173 for(
int i=0;i < 6;i++){
174 act_path_coeff[i]= 0.1*i;
180 struct timeval t0, t1;
182 for(
int i=0;i < 2;i++){
183 gettimeofday(&t0, NULL);
197 computeFatLinkQuda(fatlink, (
void**)milc_sitelink_ex, act_path_coeff, &qudaGaugeParam,
201 gettimeofday(&t1, NULL);
204 double secs =
TDIFF(t0,t1);
207 for(
int i=0;i < 4;i++){
209 if(reflink[i] == NULL){
210 errorQuda(
"ERROR; allocate reflink[%d] failed\n", i);
221 for(
int i=0;i < 6;i++){
222 coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
234 for(
int i=0;i < 4; i++){
236 if (ghost_sitelink[i] == NULL){
237 printf(
"ERROR: malloc failed for ghost_sitelink[%d] \n",i);
248 for(
int nu=0;nu < 4;nu++){
249 for(
int mu=0;
mu < 4;
mu++){
251 ghost_sitelink_diag[nu*4+
mu] = NULL;
255 for(dir1= 0; dir1 < 4; dir1++){
256 if(dir1 !=nu && dir1 !=
mu){
260 for(dir2=0; dir2 < 4; dir2++){
261 if(dir2 != nu && dir2 !=
mu && dir2 != dir1){
266 if(ghost_sitelink_diag[nu*4+
mu] == NULL){
267 errorQuda(
"malloc failed for ghost_sitelink_diag\n");
286 for(
int i=0;i < 4;i++){
288 if(myfatlink[i] == NULL){
289 printf(
"Error: malloc failed for myfatlink[%d]\n", i);
295 for(
int i=0;i <
V; i++){
304 for(
int i=0;i < 4;i++){
310 reflink,
"CPU reference results:",
313 printfQuda(
"Test %s\n",(1 == res) ?
"PASSED" :
"FAILED");
314 int volume = qudaGaugeParam.
X[0]*qudaGaugeParam.
X[1]*qudaGaugeParam.
X[2]*qudaGaugeParam.
X[3];
316 double perf = 1.0* flops*volume/(secs*1024*1024*1024);
317 printfQuda(
"fatlink computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);
320 for(
int i=0;i < 4;i++){
328 printfQuda(
" Did you check the GPU health by running cuda memtest?\n");
335 free(ghost_sitelink[i]);
338 for(
int j=0;j <4; j++){
342 free(ghost_sitelink_diag[i*4+j]);
348 for(
int i=0;i < 4; i++){
349 cudaFreeHost(sitelink[i]);
350 cudaFreeHost(sitelink_ex[i]);
353 cudaFreeHost(fatlink);
354 if(milc_sitelink) free(milc_sitelink);
355 if(milc_sitelink_ex) free(milc_sitelink_ex);
361 return accuracy_level;
370 printfQuda(
"link_precision link_reconstruct space_dimension T_dimension Test Ordering\n");
397 printfQuda(
" --verify # Verify the GPU results using CPU results\n");
398 printfQuda(
" --gauge-order <qdp/milc> # ordering of the input gauge-field\n");
414 for (i =1;i < argc; i++){
420 if( strcmp(argv[i],
"--gauge-order") == 0){
425 if(strcmp(argv[i+1],
"milc") == 0){
427 }
else if(strcmp(argv[i+1],
"qdp") == 0){
430 fprintf(stderr,
"Error: unsupported gauge-field order\n");
438 if( strcmp(argv[i],
"--verify") == 0){
443 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
451 errorQuda(
"ERROR: milc format for multi-gpu with test0 is not supported yet!\n");
459 int accuracy_level = llfat_test(test);
461 printfQuda(
"accuracy_level=%d\n", accuracy_level);
466 if(accuracy_level >=3 ){