#include <gauge_fix_ovr_extra.h>
#include <thrust_helper.cuh>

namespace quda {

#if defined(GPU_GAUGE_ALG) && defined(MULTI_GPU)
  struct BorderIdArg {
    int X[4];      // grid dimensions
    int border[4]; // border (halo) widths
    BorderIdArg(int X_[4], int border_[4]) {
      for ( int dir = 0; dir < 4; ++dir ) border[dir] = border_[dir];
      for ( int dir = 0; dir < 4; ++dir ) X[dir] = X_[dir];
    }
  };
  __global__ void ComputeBorderPointsActiveFaceIndex(BorderIdArg arg, int *faceindices, int facesize, int faceid, int parity)
  {
    int idd = blockDim.x * blockIdx.x + threadIdx.x;
    if ( idd < facesize ) {
      int borderid = 0;
      int idx = idd;
      if ( idx >= facesize / 2 ) { // second half of the threads covers the upper face
        borderid = arg.X[faceid] - 1;
        idx -= facesize / 2;
      }
      int X[4];
      for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
      int x[4], za, xodd;
      switch ( faceid ) {
      case 0: // X face
        za = idx / ( X[1] / 2 );
        x[3] = za / X[2];
        x[2] = za - x[3] * X[2];
        x[0] = borderid;
        xodd = (borderid + x[2] + x[3] + parity) & 1;
        x[1] = (2 * idx + xodd) - za * X[1];
        break;
      case 1: // Y face
        za = idx / ( X[0] / 2 );
        x[3] = za / X[2];
        x[2] = za - x[3] * X[2];
        x[1] = borderid;
        xodd = (borderid + x[2] + x[3] + parity) & 1;
        x[0] = (2 * idx + xodd) - za * X[0];
        break;
      case 2: // Z face
        za = idx / ( X[0] / 2 );
        x[3] = za / X[1];
        x[1] = za - x[3] * X[1];
        x[2] = borderid;
        xodd = (borderid + x[1] + x[3] + parity) & 1;
        x[0] = (2 * idx + xodd) - za * X[0];
        break;
      case 3: // T face
        za = idx / ( X[0] / 2 );
        x[2] = za / X[1];
        x[1] = za - x[2] * X[1];
        x[3] = borderid;
        xodd = (borderid + x[1] + x[2] + parity) & 1;
        x[0] = (2 * idx + xodd) - za * X[0];
        break;
      }
      // linear lattice index of this border site
      idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]);
      faceindices[idd] = idx;
    }
  }
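  /* Illustrative example of the thread-to-site mapping above (the numbers are assumptions,
     not taken from this file): with a local volume of X = {8, 8, 8, 8} and faceid = 0 (the
     X face), facesize = faceVolume[0] = X[1]*X[2]*X[3] = 512. Threads 0..255 then index the
     sites of the requested parity on the lower face x[0] = 0, and threads 256..511 those on
     the upper face x[0] = X[0]-1 = 7, with xodd picking the site of the requested parity out
     of each pair along the fastest-running direction. */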
  /**
   * @brief Pre-calculate the lattice border points used by the multi-GPU implementation of gauge fixing with
   * overrelaxation
   */
  void PreCalculateLatticeIndices(size_t faceVolume[4], size_t faceVolumeCB[4], int X[4], int border[4],
                                  int &threads, int *borderpoints[2])
  {
    BorderIdArg arg(X, border);
    // total number of face sites over all partitioned directions
    int nlinksfaces = 0;
    for ( int dir = 0; dir < 4; ++dir )
      if ( comm_dim_partitioned(dir) ) nlinksfaces += faceVolume[dir];
    thrust::device_ptr<int> array_faceT[2];
    thrust::device_ptr<int> array_interiorT[2];
    for ( int i = 0; i < 2; i++ ) { // even and odd ids
      borderpoints[i] = static_cast<int*>(pool_device_malloc(nlinksfaces * sizeof(int)));
      qudaMemset(borderpoints[i], 0, nlinksfaces * sizeof(int));
      array_faceT[i] = thrust::device_pointer_cast(borderpoints[i]);
    }
    TuneParam tp;
    tp.block = dim3(128, 1, 1);
    // launch one kernel per partitioned direction and parity, packing the face
    // indices contiguously into borderpoints[oddbit]
    int start = 0;
    for ( int dir = 0; dir < 4; ++dir ) {
      if ( comm_dim_partitioned(dir) ) {
        tp.grid = dim3((faceVolume[dir] + tp.block.x - 1) / tp.block.x, 1, 1);
        for ( int oddbit = 0; oddbit < 2; oddbit++ ) {
          auto faceindices = borderpoints[oddbit] + start;
          qudaLaunchKernel(ComputeBorderPointsActiveFaceIndex, tp, (qudaStream_t)0, arg, faceindices, faceVolume[dir],
                           dir, oddbit);
        }
        start += faceVolume[dir];
      }
    }
    int size[2];
    for ( int i = 0; i < 2; i++ ) {
      // sort and remove duplicated lattice indices (faces overlap at edges and corners)
      thrust_allocator alloc;
      thrust::sort(thrust::cuda::par(alloc), array_faceT[i], array_faceT[i] + nlinksfaces);
      thrust::device_ptr<int> new_end = thrust::unique(array_faceT[i], array_faceT[i] + nlinksfaces);
      size[i] = thrust::raw_pointer_cast(new_end) - thrust::raw_pointer_cast(array_faceT[i]);
    }
    if ( size[0] == size[1] ) threads = size[0];
    else errorQuda("BORDER: even and odd site counts do not match (%d:%d), which is not supported", size[0], size[1]);
  }
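  /* Usage sketch (illustrative only; the caller-side names below are assumptions, not part
     of this file):

       int *borderpoints[2];  // even/odd device arrays of unique border-site indices
       int threads = 0;       // number of unique border sites per parity
       PreCalculateLatticeIndices(faceVolume, faceVolumeCB, X, border, threads, borderpoints);
       // ... run the overrelaxation border-update kernels over `threads` sites per parity ...
       for ( int i = 0; i < 2; i++ ) pool_device_free(borderpoints[i]);
  */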
#endif // GPU_GAUGE_ALG && MULTI_GPU

} // namespace quda