3 #define gaugeSiteSize 18
40 template<
int N,
typename FloatN,
typename Float2>
44 int Vh,
int pad,
int ghostV,
size_t threads)
46 int tid = blockIdx.x * blockDim.x + threadIdx.x;
47 int thread0_tid = blockIdx.x * blockDim.x;
53 for(dir = 0; dir < 4; dir++){
55 Float2* src_start = src + dir*9*(Vh+ghostV) + thread0_tid*9;
57 Float2* src_start = src + dir*9*(
Vh) + thread0_tid*9;
60 if(thread0_tid*9+j*blockDim.x+threadIdx.x >= 9*threads)
break;
62 ((Float2*)buf)[j*blockDim.x + threadIdx.x] = src_start[j*blockDim.x + threadIdx.x];
64 int idx = j*blockDim.x + threadIdx.x;
68 ((Float2*)buf)[divval*6+modval] = src_start[
idx];
78 dst_start[tid + j*
stride] = buf[N*threadIdx.x + j];
100 template<
int N,
typename FloatN,
typename Float2>
104 int Vh,
int pad,
int ghostV,
size_t threads)
108 int block_idx = blockIdx.x*blockDim.x/4;
109 int local_idx = 16*(threadIdx.x/64) + threadIdx.x%16;
110 int pos_idx = blockIdx.x * blockDim.x/4 + 16*(threadIdx.x/64) + threadIdx.x%16;
111 int mydir = (threadIdx.x >> 4)% 4;
115 for(j=0; j < 9; j++){
116 if(block_idx*9*4 + j*blockDim.x+threadIdx.x >= 9*threads)
break;
118 ((Float2*)buf)[j*blockDim.x + threadIdx.x] = src[block_idx*9*4 + j*blockDim.x + threadIdx.x];
120 int idx = j*blockDim.x + threadIdx.x;
121 int modval = idx % 9;
122 int divval = idx / 9;
124 ((Float2*)buf)[divval*6+modval] = src[block_idx*9*4 +
idx];
131 if(pos_idx >= threads/4)
return;
133 for(j=0; j < N; j++){
135 dst[pos_idx + mydir*N*stride + j*
stride] = buf[local_idx*4*9+mydir*9+j];
137 dst[pos_idx + mydir*N*stride + j*
stride] = buf[local_idx*4*N+mydir*N+j];
144 int reconstruct,
int Vh,
int pad,
160 switch( reconstruct){
162 do_link_format_cpu_to_gpu<9><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct,
Vh, pad, ghostV,
threads);
165 do_link_format_cpu_to_gpu<6><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct,
Vh, pad, ghostV,
threads);
168 errorQuda(
"reconstruct type not supported\n");
173 switch( reconstruct){
175 do_link_format_cpu_to_gpu<9><<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, reconstruct,
Vh, pad, ghostV,
threads);
178 do_link_format_cpu_to_gpu<3><<<gridDim, blockDim>>>((float4*)dst, (float2*)src, reconstruct,
Vh, pad, ghostV,
threads);
181 errorQuda(
"reconstruct type not supported\n");
186 errorQuda(
"ERROR: half precision not support in %s\n", __FUNCTION__);
198 switch( reconstruct){
200 do_link_format_cpu_to_gpu_milc<9><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct,
Vh, pad, ghostV,
threads);
203 do_link_format_cpu_to_gpu_milc<6><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct,
Vh, pad, ghostV,
threads);
206 errorQuda(
"reconstruct type not supported\n");
211 switch( reconstruct){
213 do_link_format_cpu_to_gpu_milc<9><<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, reconstruct,
Vh, pad, ghostV,
threads);
216 do_link_format_cpu_to_gpu_milc<3><<<gridDim, blockDim, 0, stream>>>((float4*)dst, (float2*)src, reconstruct,
Vh, pad, ghostV,
threads);
219 errorQuda(
"reconstruct type not supported\n");
224 errorQuda(
"ERROR: half precision not support in %s\n", __FUNCTION__);
228 errorQuda(
"ERROR: invalid cpu ordering (%d)\n", cpu_order);
318 template<
typename FloatN>
327 int block_idx = blockIdx.x*blockDim.x/4;
328 int local_idx = 16*(threadIdx.x/64) + threadIdx.x%16;
329 int pos_idx = blockIdx.x * blockDim.x/4 + 16*(threadIdx.x/64) + threadIdx.x%16;
330 int mydir = (threadIdx.x >> 4)% 4;
331 for(j=0; j < 9; j++){
332 buf[local_idx*4*9+mydir*9+j] = src[pos_idx + mydir*9*stride + j*
stride];
336 for(j=0; j < 9; j++){
337 dst[block_idx*9*4 + j*blockDim.x + threadIdx.x ] = buf[j*blockDim.x + threadIdx.x];
350 dim3 gridDim(4*Vh/blockDim.x);
352 if ((4*Vh) % blockDim.x != 0){
353 errorQuda(
"ERROR: 4*Vh(%d) is not multiple of blocksize(%d), exitting\n", Vh, blockDim.x);
356 do_link_format_gpu_to_cpu<<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src,
Vh,
stride);
358 do_link_format_gpu_to_cpu<<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src,
Vh,
stride);
360 printf(
"ERROR: half precision is not supported in %s\n",__FUNCTION__);
366 #define READ_ST_STAPLE(staple, idx, mystride) \
367 Float2 P0 = staple[idx + 0*mystride]; \
368 Float2 P1 = staple[idx + 1*mystride]; \
369 Float2 P2 = staple[idx + 2*mystride]; \
370 Float2 P3 = staple[idx + 3*mystride]; \
371 Float2 P4 = staple[idx + 4*mystride]; \
372 Float2 P5 = staple[idx + 5*mystride]; \
373 Float2 P6 = staple[idx + 6*mystride]; \
374 Float2 P7 = staple[idx + 7*mystride]; \
375 Float2 P8 = staple[idx + 8*mystride];
377 #define WRITE_ST_STAPLE(staple, idx, mystride) \
378 staple[idx + 0*mystride] = P0; \
379 staple[idx + 1*mystride] = P1; \
380 staple[idx + 2*mystride] = P2; \
381 staple[idx + 3*mystride] = P3; \
382 staple[idx + 4*mystride] = P4; \
383 staple[idx + 5*mystride] = P5; \
384 staple[idx + 6*mystride] = P6; \
385 staple[idx + 7*mystride] = P7; \
386 staple[idx + 8*mystride] = P8;
390 template<
int dir,
int whichway,
typename Float2>
393 Float2* nbr_staple_gpu)
396 int sid = blockIdx.x*blockDim.x + threadIdx.x;
403 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
412 ghost_face_idx = (x4*(X3*
X2)+x3*X2 +x2)>>1;
419 ghost_face_idx = (x4*(X3*
X2)+x3*X2 +x2)>>1;
426 ghost_face_idx = (x4*X3*
X1+x3*
X1+
x1)>>1;
433 ghost_face_idx = (x4*X3*
X1+x3*
X1+
x1)>>1;
440 ghost_face_idx = (x4*X2*
X1+x2*
X1+
x1)>>1;
447 ghost_face_idx = (x4*X2*
X1 + x2*
X1 +
x1)>>1;
454 ghost_face_idx = (x3*X2*
X1+x2*
X1+
x1)>>1;
461 ghost_face_idx = (x3*X2*
X1+x2*
X1+
x1)>>1;
473 void* ghost_staple_gpu,
474 int dir,
int whichway, cudaStream_t*
stream)
478 Vsh_x = X[1]*X[2]*X[3]/2;
479 Vsh_y = X[0]*X[2]*X[3]/2;
480 Vsh_z = X[0]*X[1]*X[3]/2;
481 Vsh_t = X[0]*X[1]*X[2]/2;
487 void* gpu_buf_even = ghost_staple_gpu;
488 void* gpu_buf_odd = ((
char*)ghost_staple_gpu) + Vsh[
dir]*
gaugeSiteSize*precision ;
490 gpu_buf_odd = ghost_staple_gpu;
491 gpu_buf_even = ((
char*)ghost_staple_gpu) + Vsh[
dir]*
gaugeSiteSize*precision ;
502 collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
503 collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
506 collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
507 collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
518 collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
519 collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
522 collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
523 collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
534 collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
535 collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
538 collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
539 collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
550 collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
551 collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
554 collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)even, even_parity, (double2*)gpu_buf_even);
555 collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)odd, odd_parity, (double2*)gpu_buf_odd);
568 collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
569 collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
572 collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
573 collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
584 collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
585 collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
588 collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
589 collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
600 collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
601 collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
604 collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
605 collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
616 collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
617 collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
620 collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)even, even_parity, (float2*)gpu_buf_even);
621 collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)odd, odd_parity, (float2*)gpu_buf_odd);
630 printf(
"ERROR: invalid precision for %s\n", __FUNCTION__);