// gauge_force.cu (QUDA v1.1.0, a library for QCD on GPUs)
#include <gauge_field_order.h>
#include <quda_matrix.h>
#include <index_helper.cuh>
#include <tune_quda.h>
#include <instantiate.h>

namespace quda {

  struct paths {
    const int num_paths;
    const int max_length;
    int *input_path[4];
    const int *length;
    const double *path_coeff;
    int count; // total number of links summed over all paths (per direction)

    paths(void *buffer, size_t bytes, int ***input_path, int *length_h, double *path_coeff_h, int num_paths, int max_length) :
      num_paths(num_paths),
      max_length(max_length),
      count(0)
    {
      void *path_h = safe_malloc(bytes);
      memset(path_h, 0, bytes);

      int *input_path_h = (int*)path_h;
      for (int dir=0; dir<4; dir++) {
        // flatten the input_path array for copying to the device
        for (int i=0; i<num_paths; i++) {
          for (int j=0; j<length_h[i]; j++) {
            input_path_h[dir*num_paths*max_length + i*max_length + j] = input_path[dir][i][j];
            if (dir==0) count++;
          }
        }
      }

      // length array
      memcpy((char*)path_h + 4 * num_paths * max_length * sizeof(int), length_h, num_paths*sizeof(int));

      // path_coeff array
      memcpy((char*)path_h + 4 * num_paths * max_length * sizeof(int) + num_paths*sizeof(int), path_coeff_h, num_paths*sizeof(double));

      qudaMemcpy(buffer, path_h, bytes, cudaMemcpyHostToDevice);
      host_free(path_h);

      // finally set the pointers to the correct offsets in the buffer
      for (int d=0; d<4; d++) this->input_path[d] = (int*)((char*)buffer + d*num_paths*max_length*sizeof(int));
      length = (int*)((char*)buffer + 4*num_paths*max_length*sizeof(int));
      path_coeff = (double*)((char*)buffer + 4 * num_paths * max_length * sizeof(int) + num_paths*sizeof(int));
    }
  };
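
  // Layout of the single device allocation backing a paths instance, as
  // implied by the offsets computed above (illustrative sketch):
  //
  //   [ input_path[0] | input_path[1] | input_path[2] | input_path[3] | length | path_coeff ]
  //     num_paths*max_length ints per direction                         num_paths ints, then num_paths doubles
  //
  // e.g. for num_paths = 6 and max_length = 3, the length array begins at
  // byte offset 4 * 6 * 3 * sizeof(int) = 288.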

  template <typename Float_, int nColor_, QudaReconstructType recon_u, QudaReconstructType recon_m>
  struct GaugeForceArg {
    using Float = Float_;
    static constexpr int nColor = nColor_;
    static_assert(nColor == 3, "Only nColor=3 enabled at this time");
    typedef typename gauge_mapper<Float,recon_u>::type Gauge;
    typedef typename gauge_mapper<Float,recon_m>::type Mom;

    Mom mom;
    const Gauge u;

    int threads;
    int X[4]; // the regular volume parameters
    int E[4]; // the extended volume parameters
    int border[4]; // radius of border

    Float epsilon; // step size and any other overall scaling factor
    const paths p;

    GaugeForceArg(GaugeField &mom, const GaugeField &u, double epsilon, const paths &p) :
      mom(mom), u(u),
      threads(mom.VolumeCB()),
      epsilon(epsilon),
      p(p)
    {
      for (int i=0; i<4; i++) {
        X[i] = mom.X()[i];
        E[i] = u.X()[i];
        border[i] = (E[i] - X[i])/2;
      }
    }
  };

  constexpr int flipDir(int dir) { return (7-dir); }
  constexpr bool isForwards(int dir) { return (dir <= 3); }
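
  // Path steps are encoded as integers in [0,7]: values 0-3 mean a hop in the
  // positive direction mu = dir, while values 4-7 mean a hop in the negative
  // direction, with flipDir recovering mu (e.g. flipDir(7) == 0, so 7 encodes
  // "backwards along mu = 0"). A backwards step uses the conjugate of the
  // link stored on the neighboring site, as handled in the kernel below.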

  // load an array element via the read-only data cache (__ldg) on the device,
  // so that the frequently reused path arrays stay resident in cache
  template <typename T> constexpr T cache(const T *ptr, int idx) {
#ifdef __CUDA_ARCH__
    return __ldg(ptr+idx);
#else
    return ptr[idx];
#endif
  }

  template <typename Arg, int dir>
  __device__ __host__ inline void GaugeForceKernel(Arg &arg, int idx, int parity)
  {
    using real = typename Arg::Float;
    typedef Matrix<complex<real>,Arg::nColor> Link;

    int x[4] = {0, 0, 0, 0};
    getCoords(x, idx, arg.X, parity);
    for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates

    // linkA: the accumulated product along the current path
    // linkB: the link loaded in this round
    Link linkA, linkB, staple;

#ifdef __CUDA_ARCH__
    extern __shared__ int s[];
    int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    s[tid] = 0;
    signed char *dx = (signed char*)&s[tid];
#else
    int dx[4] = {0, 0, 0, 0};
#endif
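
    // On the device, the four components of the displacement dx are packed as
    // four signed chars into a single shared-memory int per thread (hence
    // sharedBytesPerThread() == 4 below); a dynamically indexed local array
    // would otherwise risk spilling to local memory.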

    for (int i=0; i<arg.p.num_paths; i++) {
      real coeff = cache(arg.p.path_coeff, i);
      if (coeff == 0) continue; // skip paths that do not contribute

      const int *path = arg.p.input_path[dir] + i*arg.p.max_length;

      // start from the end of the link in direction dir
      int nbr_oddbit = (parity^1);
      dx[dir]++;

      int path0 = cache(path,0);
      int lnkdir = isForwards(path0) ? path0 : flipDir(path0);

      if (isForwards(path0)) {
        linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
        linkA = linkB;
        dx[lnkdir]++; // now we have to update the location
        nbr_oddbit = nbr_oddbit^1;
      } else {
        dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
        nbr_oddbit = nbr_oddbit^1;
        linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
        linkA = conj(linkB);
      }

      for (int j=1; j<cache(arg.p.length,i); j++) {

        int pathj = cache(path,j);
        int lnkdir = isForwards(pathj) ? pathj : flipDir(pathj);

        if (isForwards(pathj)) {
          linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
          linkA = linkA * linkB;
          dx[lnkdir]++; // now we have to update to the new location
          nbr_oddbit = nbr_oddbit^1;
        } else {
          dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
          nbr_oddbit = nbr_oddbit^1;
          linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
          linkA = linkA * conj(linkB);
        }
      } // j
      staple = staple + coeff*linkA;
    } // i

    // multiply by U(x)
    linkA = arg.u(dir, linkIndex(x,arg.E), parity);
    linkA = linkA * staple;

    // update mom(x)
    Link mom = arg.mom(dir, idx, parity);
    mom = mom - arg.epsilon * linkA;
    makeAntiHerm(mom);
    arg.mom(dir, idx, parity) = mom;
  }
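
  // As an illustration (not part of the library), for dir = 0 the plaquette
  // staple in the 0-1 plane is encoded as the path {1, 7, 6} of length 3: the
  // initial dx[dir]++ places us at x+0, after which the kernel accumulates
  //   linkA = U_1(x+0) * U_0^dag(x+1) * U_1^dag(x)
  // which is then scaled by the path coefficient and added to the staple.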

  template <typename Arg>
  __global__ void GaugeForceKernel(Arg arg) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= arg.threads) return;
    int parity = blockIdx.y * blockDim.y + threadIdx.y;
    int dir = blockIdx.z * blockDim.z + threadIdx.z;
    if (dir >= 4) return;

    switch(dir) {
    case 0: GaugeForceKernel<Arg,0>(arg, idx, parity); break;
    case 1: GaugeForceKernel<Arg,1>(arg, idx, parity); break;
    case 2: GaugeForceKernel<Arg,2>(arg, idx, parity); break;
    case 3: GaugeForceKernel<Arg,3>(arg, idx, parity); break;
    }
  }
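
  // The launch geometry maps the y dimension to parity (extent 2) and the z
  // dimension to direction (extent 4), matching the TunableVectorYZ(2,4)
  // construction below; the switch promotes the runtime dir to a
  // compile-time template parameter.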

  template <typename Float, int nColor, QudaReconstructType recon_u> class GaugeForce : public TunableVectorYZ {

    GaugeForceArg<Float, nColor, recon_u, QUDA_RECONSTRUCT_10> arg;
    const GaugeField &meta;

    unsigned int sharedBytesPerThread() const { return 4; } // for the dynamically indexed dx array
    unsigned int minThreads() const { return arg.threads; }
    bool tuneGridDim() const { return false; } // don't tune the grid dimension

  public:
    GaugeForce(const GaugeField &u, GaugeField &mom, double epsilon, const paths &p) :
      TunableVectorYZ(2,4),
      arg(mom, u, epsilon, p),
      meta(u)
    {
      apply(0);
      qudaDeviceSynchronize();
    }

    void apply(const qudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      qudaLaunchKernel(GaugeForceKernel<decltype(arg)>, tp, stream, arg);
    }

    void preTune() { arg.mom.save(); }
    void postTune() { arg.mom.load(); }

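    // flops: each path of length L needs L-1 matrix multiplies, giving
    // count - num_paths multiplies over all paths, plus one final multiply by
    // U(x); a 3x3 complex matrix multiply costs 9 * (3 complex mul + 2 complex
    // add) = 9 * (3*6 + 2*2) = 198 flops, counted per checkerboard site over
    // both parities and all 4 directions.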
    long long flops() const { return (arg.p.count - arg.p.num_paths + 1) * 198ll * 2 * arg.mom.volumeCB * 4; }
    long long bytes() const { return ((arg.p.count + 1ll) * arg.u.Bytes() + 2ll*arg.mom.Bytes()) * 2 * arg.mom.volumeCB * 4; }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << meta.AuxString() << ",num_paths=" << arg.p.num_paths << comm_dim_partitioned_string();
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }
  };

  void gaugeForce(GaugeField& mom, const GaugeField& u, double epsilon, int ***input_path,
                  int *length_h, double *path_coeff_h, int num_paths, int path_max_length)
  {
    checkPrecision(mom, u);
    checkLocation(mom, u);
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct());

    // create the path struct in a single device allocation
    size_t bytes = 4 * num_paths * path_max_length * sizeof(int) + num_paths*sizeof(int) + num_paths*sizeof(double);
    void *buffer = pool_device_malloc(bytes);
    paths p(buffer, bytes, input_path, length_h, path_coeff_h, num_paths, path_max_length);

#ifdef GPU_GAUGE_FORCE
    // the gauge field must be passed as the first argument so we peel off its reconstruct type
    instantiate<GaugeForce,ReconstructNo12>(u, mom, epsilon, p);
#else
    errorQuda("Gauge force has not been built");
#endif // GPU_GAUGE_FORCE
    pool_device_free(buffer);
  }
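
  // Illustrative host-side usage (a sketch, not part of this file): for the
  // plaquette contribution to the force with step size eps, each direction
  // has 6 staples of length 3 (3 orthogonal planes, 2 orientations), so with
  // hypothetical arrays
  //
  //   int *length_h;        // 6 entries, each set to 3
  //   double *path_coeff_h; // 6 entries, the action's loop coefficients
  //   int ***input_path;    // input_path[dir][i] holds the encoded steps
  //
  // one would call
  //
  //   gaugeForce(mom, u, eps, input_path, length_h, path_coeff_h, 6, 3);
  //
  // where mom is a QUDA_RECONSTRUCT_10 momentum field matching u in precision
  // and location.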

} // namespace quda