QUDA  v1.1.0
A library for QCD on GPUs
gauge_fix_ovr_extra.cu
Go to the documentation of this file.
1 #include <comm_quda.h>
2 #include <gauge_fix_ovr_extra.h>
3 #include <thrust_helper.cuh>
4 #include <tune_quda.h>
5 
6 namespace quda {
7 
8 #if defined(GPU_GAUGE_ALG) && defined(MULTI_GPU)
9 
10  struct BorderIdArg {
11  int X[4]; // grid dimensions
12  int border[4];
13  BorderIdArg(int X_[4], int border_[4]) {
14  for ( int dir = 0; dir < 4; ++dir ) border[dir] = border_[dir];
15  for ( int dir = 0; dir < 4; ++dir ) X[dir] = X_[dir];
16  }
17  };
18 
19  __global__ void ComputeBorderPointsActiveFaceIndex(BorderIdArg arg, int *faceindices, int facesize, int faceid, int parity){
20  int idd = blockDim.x * blockIdx.x + threadIdx.x;
21  if ( idd < facesize ) {
22  int borderid = 0;
23  int idx = idd;
24  if ( idx >= facesize / 2 ) {
25  borderid = arg.X[faceid] - 1;
26  idx -= facesize / 2;
27  }
28  int X[4];
29  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
30  int x[4];
31  int za, xodd;
32  switch ( faceid ) {
33  case 0: //X FACE
34  za = idx / ( X[1] / 2);
35  x[3] = za / X[2];
36  x[2] = za - x[3] * X[2];
37  x[0] = borderid;
38  xodd = (borderid + x[2] + x[3] + parity) & 1;
39  x[1] = (2 * idx + xodd) - za * X[1];
40  break;
41  case 1: //Y FACE
42  za = idx / ( X[0] / 2);
43  x[3] = za / X[2];
44  x[2] = za - x[3] * X[2];
45  x[1] = borderid;
46  xodd = (borderid + x[2] + x[3] + parity) & 1;
47  x[0] = (2 * idx + xodd) - za * X[0];
48  break;
49  case 2: //Z FACE
50  za = idx / ( X[0] / 2);
51  x[3] = za / X[1];
52  x[1] = za - x[3] * X[1];
53  x[2] = borderid;
54  xodd = (borderid + x[1] + x[3] + parity) & 1;
55  x[0] = (2 * idx + xodd) - za * X[0];
56  break;
57  case 3: //T FACE
58  za = idx / ( X[0] / 2);
59  x[2] = za / X[1];
60  x[1] = za - x[2] * X[1];
61  x[3] = borderid;
62  xodd = (borderid + x[1] + x[2] + parity) & 1;
63  x[0] = (2 * idx + xodd) - za * X[0];
64  break;
65  }
66  idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]);;
67  faceindices[idd] = idx;
68  }
69  }
70 
71  /**
72  * @brief Pre-calculate lattice border points used by the gauge fixing with overrelaxation in multi-GPU implementation
73  */
74  void PreCalculateLatticeIndices(size_t faceVolume[4], size_t faceVolumeCB[4], int X[4], int border[4], \
75  int &threads, int *borderpoints[2]){
76  BorderIdArg arg(X, border);
77  int nlinksfaces = 0;
78  for ( int dir = 0; dir < 4; ++dir )
79  if ( comm_dim_partitioned(dir)) nlinksfaces += faceVolume[dir];
80  thrust::device_ptr<int> array_faceT[2];
81  thrust::device_ptr<int> array_interiorT[2];
82  for ( int i = 0; i < 2; i++ ) { //even and odd ids
83  borderpoints[i] = static_cast<int*>(pool_device_malloc(nlinksfaces * sizeof(int) ));
84  qudaMemset(borderpoints[i], 0, nlinksfaces * sizeof(int) );
85  array_faceT[i] = thrust::device_pointer_cast(borderpoints[i]);
86  }
87  TuneParam tp;
88  tp.block = dim3(128, 1, 1);
89  int start = 0;
90  for ( int dir = 0; dir < 4; ++dir ) {
91  if ( comm_dim_partitioned(dir)) {
92  tp.grid = dim3((faceVolume[dir] + tp.block.x - 1) / tp.block.x,1,1);
93  for ( int oddbit = 0; oddbit < 2; oddbit++) {
94  auto faceindices = borderpoints[oddbit] + start;
95  qudaLaunchKernel(ComputeBorderPointsActiveFaceIndex, tp, (qudaStream_t)0, arg, faceindices, faceVolume[dir], dir, oddbit);
96  }
97  start += faceVolume[dir];
98  }
99  }
100  int size[2];
101  for ( int i = 0; i < 2; i++ ) {
102  //sort and remove duplicated lattice indices
103  thrust_allocator alloc;
104  thrust::sort(thrust::cuda::par(alloc), array_faceT[i], array_faceT[i] + nlinksfaces);
105  thrust::device_ptr<int> new_end = thrust::unique(array_faceT[i], array_faceT[i] + nlinksfaces);
106  size[i] = thrust::raw_pointer_cast(new_end) - thrust::raw_pointer_cast(array_faceT[i]);
107  }
108  if ( size[0] == size[1] ) threads = size[0];
109  else errorQuda("BORDER: Even and Odd sizes does not match, not supported!!!!, %d:%d",size[0],size[1]);
110  }
111 
112 #endif // GPU_GAUGE_ALG && MULTI_GPU
113 
114 }