#define gaugeSiteSize 18 // a gauge link is a 3x3 complex matrix: 18 real numbers

// Thread block size used by the format kernels below.  The index arithmetic
// in the MILC-order kernels (16*(threadIdx.x/64), threadIdx.x%16) hard-codes
// 64 threads per block, so this constant must stay in sync with it.
#define BLOCKSIZE 64
template<int N, typename FloatN, typename Float2>
__global__ void
do_link_format_cpu_to_gpu(FloatN* dst, Float2* src,
                          int reconstruct,
                          int Vh, int pad, int ghostV, size_t threads)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int thread0_tid = blockIdx.x * blockDim.x;  // site index of this block's first thread
  int stride = Vh + pad;                      // GPU-side stride between the N components of a link
  __shared__ FloatN buf[N*BLOCKSIZE];         // staging buffer: N elements per thread

  int dir, j;
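  // Staging strategy: each block first copies its tile of links from src into
  // shared memory with coalesced Float2 loads (consecutive threads touch
  // consecutive addresses), then each thread writes the N components of its
  // own link to dst, with the components separated by stride.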
  for(dir = 0; dir < 4; dir++){
#ifdef MULTI_GPU
    Float2* src_start = src + dir*9*(Vh+ghostV) + thread0_tid*9;
#else
    Float2* src_start = src + dir*9*(Vh) + thread0_tid*9;
#endif
    for(j = 0; j < 9; j++){
      if(thread0_tid*9 + j*blockDim.x + threadIdx.x >= 9*threads) break;
      if(N == 9){ // no reconstruction: copy all 9 complex entries
        ((Float2*)buf)[j*blockDim.x + threadIdx.x] = src_start[j*blockDim.x + threadIdx.x];
      }else{ // 12-reconstruct: keep the first two rows, drop the third
        int idx = j*blockDim.x + threadIdx.x;
        int modval = idx % 9;
        int divval = idx / 9;
        if(modval < 6){
          ((Float2*)buf)[divval*6 + modval] = src_start[idx];
        }
      }
    }
    __syncthreads();

    // strided store: thread tid writes the N components of its link
    if(tid < threads){
      FloatN* dst_start = (FloatN*)(dst + dir*N*stride);
      for(j = 0; j < N; j++){
        dst_start[tid + j*stride] = buf[N*threadIdx.x + j];
      }
    }
    __syncthreads();
  } // dir
}
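// Layout summary for do_link_format_cpu_to_gpu (QDP source ordering):
//   src: element e of link (dir, site) lives at  src[dir*9*V + site*9 + e]
//   dst: element j of link (dir, site) lives at  dst[dir*N*stride + j*stride + site]
// where V = Vh (plus ghostV in MULTI_GPU builds) and stride = Vh + pad.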
template<int N, typename FloatN, typename Float2>
__global__ void
do_link_format_cpu_to_gpu_milc(FloatN* dst, Float2* src,
                               int reconstruct,
                               int Vh, int pad, int ghostV, size_t threads)
{
  __shared__ FloatN buf[N*BLOCKSIZE];
  int stride = Vh + pad;
  int j;

  int block_idx = blockIdx.x*blockDim.x/4;              // first site handled by this block
  int local_idx = 16*(threadIdx.x/64) + threadIdx.x%16; // site slot within the block
  int pos_idx = blockIdx.x*blockDim.x/4 + 16*(threadIdx.x/64) + threadIdx.x%16; // global site index
  int mydir = (threadIdx.x >> 4) % 4;                   // direction handled by this thread
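  // Thread-to-work mapping implied by the arithmetic above: within each group
  // of 64 threads, lanes 0-15 handle direction 0 of 16 consecutive sites,
  // lanes 16-31 direction 1, and so on, so the four links of one site are
  // written by four threads that share the same local_idx.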
  // coalesced load of this block's chunk of the site-major MILC array
  for(j = 0; j < 9; j++){
    if(block_idx*9*4 + j*blockDim.x + threadIdx.x >= 9*threads) break;
    if(N == 9){
      ((Float2*)buf)[j*blockDim.x + threadIdx.x] = src[block_idx*9*4 + j*blockDim.x + threadIdx.x];
    }else{
      int idx = j*blockDim.x + threadIdx.x;
      int modval = idx % 9;
      int divval = idx / 9;
      if(modval < 6){
        ((Float2*)buf)[divval*6 + modval] = src[block_idx*9*4 + idx];
      }
    }
  }
  __syncthreads();

  if(pos_idx >= threads/4) return;

  // each thread writes the N components of one (site, direction) link
  for(j = 0; j < N; j++){
    if(N == 9){
      dst[pos_idx + mydir*N*stride + j*stride] = buf[local_idx*4*9 + mydir*9 + j];
    }else{ // reconstruct case: only 6 Float2 per link were staged
      dst[pos_idx + mydir*N*stride + j*stride] = buf[local_idx*4*N + mydir*N + j];
    }
  }
}
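// Layout summary for do_link_format_cpu_to_gpu_milc (MILC source ordering):
//   src: element e of link (site, dir) lives at  src[site*4*9 + dir*9 + e]
//   dst: same GPU ordering as above, dst[dir*N*stride + j*stride + site]
// i.e. MILC keeps a site's four directions adjacent, while QDP keeps each
// direction's links in a separate contiguous array.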
void
link_format_cpu_to_gpu(void* dst, void* src,
                       int reconstruct, int Vh, int pad, int ghostV,
                       QudaPrecision prec, QudaGaugeFieldOrder cpu_order,
                       cudaStream_t stream)
{
  dim3 blockDim(BLOCKSIZE);

  switch(cpu_order){
  case QUDA_QDP_GAUGE_ORDER:
    {
      // one thread per site (ghost sites included in multi-GPU builds)
#ifdef MULTI_GPU
      size_t threads = Vh + ghostV;
#else
      size_t threads = Vh;
#endif
      dim3 gridDim((threads + blockDim.x - 1)/blockDim.x);

      switch(prec){
      case QUDA_DOUBLE_PRECISION:
        switch(reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu<9><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu<6><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported");
        }
        break;
      case QUDA_SINGLE_PRECISION:
        switch(reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu<9><<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu<3><<<gridDim, blockDim, 0, stream>>>((float4*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported");
        }
        break;
      default:
        errorQuda("half precision not supported in %s", __FUNCTION__);
      }
    }
    break;

  case QUDA_MILC_GAUGE_ORDER:
    {
      // four threads per site, one per direction
#ifdef MULTI_GPU
      size_t threads = 4*(Vh + ghostV);
#else
      size_t threads = 4*Vh;
#endif
      dim3 gridDim((threads + blockDim.x - 1)/blockDim.x);

      switch(prec){
      case QUDA_DOUBLE_PRECISION:
        switch(reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu_milc<9><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu_milc<6><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported");
        }
        break;
      case QUDA_SINGLE_PRECISION:
        switch(reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu_milc<9><<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu_milc<3><<<gridDim, blockDim, 0, stream>>>((float4*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported");
        }
        break;
      default:
        errorQuda("half precision not supported in %s", __FUNCTION__);
      }
    }
    break;

  default:
    errorQuda("invalid cpu ordering (%d)", cpu_order);
  }
}
template<typename FloatN>
__global__ void
do_link_format_gpu_to_cpu(FloatN* dst, FloatN* src,
                          int Vh, int stride)
{
  __shared__ FloatN buf[gaugeSiteSize/2*BLOCKSIZE];
  int j;

  int block_idx = blockIdx.x*blockDim.x/4;
  int local_idx = 16*(threadIdx.x/64) + threadIdx.x%16;
  int pos_idx = blockIdx.x*blockDim.x/4 + 16*(threadIdx.x/64) + threadIdx.x%16;
  int mydir = (threadIdx.x >> 4) % 4;

  // strided read: each thread gathers the 9 components of one (site, direction) link
  for(j = 0; j < 9; j++){
    buf[local_idx*4*9 + mydir*9 + j] = src[pos_idx + mydir*9*stride + j*stride];
  }
  __syncthreads();

  // coalesced write of the site-major (MILC-order) chunk
  for(j = 0; j < 9; j++){
    dst[block_idx*9*4 + j*blockDim.x + threadIdx.x] = buf[j*blockDim.x + threadIdx.x];
  }
}
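// do_link_format_gpu_to_cpu is the inverse of the MILC path above for the
// no-reconstruct case: it gathers the 9 stride-separated Float2's of each
// link through shared memory and emits a contiguous site-major array.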
void
link_format_gpu_to_cpu(void* dst, void* src,
                       int Vh, int stride, QudaPrecision prec, cudaStream_t stream)
{
  dim3 blockDim(BLOCKSIZE);
  dim3 gridDim(4*Vh/blockDim.x); // every 4 threads process one site's x,y,z,t links

  // the kernel has no bounds check, so 4*Vh must be a multiple of the block size
  if((4*Vh) % blockDim.x != 0){
    errorQuda("4*Vh (%d) is not a multiple of the block size (%d), exiting", 4*Vh, blockDim.x);
  }

  if(prec == QUDA_DOUBLE_PRECISION){
    do_link_format_gpu_to_cpu<<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, Vh, stride);
  }else if(prec == QUDA_SINGLE_PRECISION){
    do_link_format_gpu_to_cpu<<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, Vh, stride);
  }else{
    errorQuda("half precision is not supported in %s", __FUNCTION__);
  }
}
#define READ_ST_STAPLE(staple, idx, mystride)  \
  Float2 P0 = staple[idx + 0*mystride];        \
  Float2 P1 = staple[idx + 1*mystride];        \
  Float2 P2 = staple[idx + 2*mystride];        \
  Float2 P3 = staple[idx + 3*mystride];        \
  Float2 P4 = staple[idx + 4*mystride];        \
  Float2 P5 = staple[idx + 5*mystride];        \
  Float2 P6 = staple[idx + 6*mystride];        \
  Float2 P7 = staple[idx + 7*mystride];        \
  Float2 P8 = staple[idx + 8*mystride];

#define WRITE_ST_STAPLE(staple, idx, mystride) \
  staple[idx + 0*mystride] = P0;               \
  staple[idx + 1*mystride] = P1;               \
  staple[idx + 2*mystride] = P2;               \
  staple[idx + 3*mystride] = P3;               \
  staple[idx + 4*mystride] = P4;               \
  staple[idx + 5*mystride] = P5;               \
  staple[idx + 6*mystride] = P6;               \
  staple[idx + 7*mystride] = P7;               \
  staple[idx + 8*mystride] = P8;
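// READ_ST_STAPLE declares registers P0..P8 holding the 9 complex entries of
// one staple (gaugeSiteSize = 18 reals); WRITE_ST_STAPLE stores the same
// registers back with a different index and stride.  The two macros must be
// used in the same scope so the P0..P8 names resolve.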
struct GhostStapleParam {
  int in_stride;
  int X[4];

  GhostStapleParam(const int in_stride, const int X[4])
    : in_stride(in_stride) {
    for (int i = 0; i < 4; i++) this->X[i] = X[i];
  }
};
template<int dir, int whichway, typename Float2>
__global__ void
collectGhostStapleKernel(Float2* out, Float2* in, int parity, GhostStapleParam param)
{
  int sid = blockIdx.x*blockDim.x + threadIdx.x;

  // decompose the checkerboard site index into lattice coordinates
  int z1 = sid / (param.X[0]>>1);
  int x1h = sid - z1*(param.X[0]>>1);
  int z2 = z1 / param.X[1];
  int x2 = z1 - z2*param.X[1];
  int x4 = z2 / param.X[2];
  int x3 = z2 - x4*param.X[2];
  int x1odd = (x2 + x3 + x4 + parity) & 1;
  int x1 = 2*x1h + x1odd;

  READ_ST_STAPLE(in, sid, param.in_stride);
  int ghost_face_idx;

  if(dir == 0 && whichway == QUDA_BACKWARDS){
    if(x1 < 1){
      ghost_face_idx = (x4*(param.X[2]*param.X[1]) + x3*param.X[1] + x2)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[1]/2);
    }
  }

  if(dir == 0 && whichway == QUDA_FORWARDS){
    if(x1 >= param.X[0] - 1){
      ghost_face_idx = (x4*(param.X[2]*param.X[1]) + x3*param.X[1] + x2)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[1]/2);
    }
  }

  if(dir == 1 && whichway == QUDA_BACKWARDS){
    if(x2 < 1){
      ghost_face_idx = (x4*param.X[2]*param.X[0] + x3*param.X[0] + x1)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[0]/2);
    }
  }

  if(dir == 1 && whichway == QUDA_FORWARDS){
    if(x2 >= param.X[1] - 1){
      ghost_face_idx = (x4*param.X[2]*param.X[0] + x3*param.X[0] + x1)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[0]/2);
    }
  }

  if(dir == 2 && whichway == QUDA_BACKWARDS){
    if(x3 < 1){
      ghost_face_idx = (x4*param.X[1]*param.X[0] + x2*param.X[0] + x1)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[1]*param.X[0]/2);
    }
  }

  if(dir == 2 && whichway == QUDA_FORWARDS){
    if(x3 >= param.X[2] - 1){
      ghost_face_idx = (x4*param.X[1]*param.X[0] + x2*param.X[0] + x1)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[1]*param.X[0]/2);
    }
  }

  if(dir == 3 && whichway == QUDA_BACKWARDS){
    if(x4 < 1){
      ghost_face_idx = (x3*param.X[1]*param.X[0] + x2*param.X[0] + x1)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[2]*param.X[1]*param.X[0]/2);
    }
  }

  if(dir == 3 && whichway == QUDA_FORWARDS){
    if(x4 >= param.X[3] - 1){
      ghost_face_idx = (x3*param.X[1]*param.X[0] + x2*param.X[0] + x1)>>1;
      WRITE_ST_STAPLE(out, ghost_face_idx, param.X[2]*param.X[1]*param.X[0]/2);
    }
  }
}
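// Worked example of the face indexing above, for a hypothetical local lattice
// X = {8, 8, 8, 8}: a thread with dir == 3, whichway == QUDA_FORWARDS and
// x4 == 7 writes its staple at ghost_face_idx = (x3*8*8 + x2*8 + x1) >> 1,
// one of X[0]*X[1]*X[2]/2 = 256 checkerboard face sites; the stride passed to
// WRITE_ST_STAPLE is that same 256.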
void
collectGhostStaple(int* X, void* even, void* odd, int volumeCB, int stride,
                   QudaPrecision precision, void* ghost_staple_gpu,
                   int dir, int whichway, cudaStream_t* stream)
{
  int Vsh_x = X[1]*X[2]*X[3]/2;
  int Vsh_y = X[0]*X[2]*X[3]/2;
  int Vsh_z = X[0]*X[1]*X[3]/2;
  int Vsh_t = X[0]*X[1]*X[2]/2;
  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};

  dim3 blockDim(BLOCKSIZE, 1, 1);
  dim3 gridDim(volumeCB/blockDim.x, 1, 1); // one thread per checkerboard site
  int even_parity = 0;
  int odd_parity = 1;
  GhostStapleParam param(stride, X);

  // the QudaPrecision enum value doubles as the size in bytes
  void* gpu_buf_even = ghost_staple_gpu;
  void* gpu_buf_odd = ((char*)ghost_staple_gpu) + Vsh[dir]*gaugeSiteSize*precision;
  if(X[dir] % 2 == 1){ // odd local extent: swap the even/odd halves
    gpu_buf_odd = ghost_staple_gpu;
    gpu_buf_even = ((char*)ghost_staple_gpu) + Vsh[dir]*gaugeSiteSize*precision;
  }
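  // The ghost buffer holds Vsh[dir] even sites followed by Vsh[dir] odd
  // sites.  When the local extent X[dir] is odd, a site's parity flips when
  // crossing the face, so the even/odd halves are swapped above to match
  // what the neighboring process expects.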
  if(precision == QUDA_DOUBLE_PRECISION){
    switch(dir){
    case 0:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      }
      break;
    case 1:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      }
      break;
    case 2:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      }
      break;
    case 3:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
        collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
        break;
      }
      break;
    }
  }else if(precision == QUDA_SINGLE_PRECISION){
    switch(dir){
    case 0:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      }
      break;
    case 1:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      }
      break;
    case 2:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      }
      break;
    case 3:
      switch(whichway){
      case QUDA_BACKWARDS:
        collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      case QUDA_FORWARDS:
        collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
        collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
        break;
      }
      break;
    }
  }else{
    errorQuda("invalid precision for %s", __FUNCTION__);
  }
}