QUDA  1.0.0
gauge_force.cu
Go to the documentation of this file.
1 #include <gauge_field_order.h>
2 #include <quda_matrix.h>
3 #include <index_helper.cuh>
4 #include <generics/ldg.h>
5 #include <tune_quda.h>
6 
7 namespace quda {
8 
9 #ifdef GPU_GAUGE_FORCE
10 
11  template <typename Mom, typename Gauge>
12  struct GaugeForceArg {
13  Mom mom;
14  const Gauge u;
15 
16  int threads;
17  int X[4]; // the regular volume parameters
18  int E[4]; // the extended volume parameters
19  int border[4]; // radius of border
20 
21  int num_paths;
22  int path_max_length;
23 
24  double coeff;
25 
26  const int *input_path_d[4];
27  const int *length_d;
28  const double *path_coeff_d;
29 
30  int count; // equal to sum of all path lengths. Used a convenience for computing perf
31 
32  GaugeForceArg(Mom &mom, const Gauge &u, int num_paths, int path_max_length, double coeff,
33  int **input_path_d, const int *length_d, const double* path_coeff_d, int count,
34  const GaugeField &meta_mom, const GaugeField &meta_u)
35  : mom(mom), u(u), threads(meta_mom.VolumeCB()), num_paths(num_paths),
36  path_max_length(path_max_length), coeff(coeff),
37  input_path_d{ input_path_d[0], input_path_d[1], input_path_d[2], input_path_d[3] },
38  length_d(length_d), path_coeff_d(path_coeff_d), count(count)
39  {
40  for(int i=0; i<4; i++) {
41  X[i] = meta_mom.X()[i];
42  E[i] = meta_u.X()[i];
43  border[i] = (E[i] - X[i])/2;
44  }
45  }
46 
47  virtual ~GaugeForceArg() { }
48  };
49 
50  __device__ __host__ inline static int flipDir(int dir) { return (7-dir); }
51  __device__ __host__ inline static bool isForwards(int dir) { return (dir <= 3); }
52 
53  // this ensures that array elements are held in cache
54  template <typename T>
55  __device__ __host__ inline static T cache(const T *ptr, int idx) {
56 #ifdef __CUDA_ARCH__
57  return __ldg(ptr+idx);
58 #else
59  return ptr[idx];
60 #endif
61  }
62 
63  template<typename Float, typename Arg, int dir>
64  __device__ __host__ inline void GaugeForceKernel(Arg &arg, int idx, int parity)
65  {
66  typedef Matrix<complex<Float>,3> Link;
67 
68  int x[4] = {0, 0, 0, 0};
69  getCoords(x, idx, arg.X, parity);
70  for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
71 
72  //linkA: current matrix
73  //linkB: the loaded matrix in this round
74  Link linkA, linkB, staple;
75 
76 #ifdef __CUDA_ARCH__
77  extern __shared__ int s[];
78  int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
79  s[tid] = 0;
80  signed char *dx = (signed char*)&s[tid];
81 #else
82  int dx[4] = {0, 0, 0, 0};
83 #endif
84 
85  for (int i=0; i<arg.num_paths; i++) {
86  Float coeff = cache(arg.path_coeff_d,i);
87  if (coeff == 0) continue;
88 
89  const int* path = arg.input_path_d[dir] + i*arg.path_max_length;
90 
91  // start from end of link in direction dir
92  int nbr_oddbit = (parity^1);
93  dx[dir]++;
94 
95  int path0 = cache(path,0);
96  int lnkdir = isForwards(path0) ? path0 : flipDir(path0);
97 
98  if (isForwards(path0)) {
99  linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
100  linkA = linkB;
101  dx[lnkdir]++; // now have to update location
102  nbr_oddbit = nbr_oddbit^1;
103  } else {
104  dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
105  nbr_oddbit = nbr_oddbit^1;
106  linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
107  linkA = conj(linkB);
108  }
109 
110  for (int j=1; j<cache(arg.length_d,i); j++) {
111 
112  int pathj = cache(path,j);
113  int lnkdir = isForwards(pathj) ? pathj : flipDir(pathj);
114 
115  if (isForwards(pathj)) {
116  linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
117  linkA = linkA * linkB;
118  dx[lnkdir]++; // now have to update to new location
119  nbr_oddbit = nbr_oddbit^1;
120  } else {
121  dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
122  nbr_oddbit = nbr_oddbit^1;
123  linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
124  linkA = linkA * conj(linkB);
125  }
126  } //j
127  staple = staple + coeff*linkA;
128  } //i
129 
130  // multiply by U(x)
131  linkA = arg.u(dir, linkIndex(x,arg.E), parity);
132  linkA = linkA * staple;
133 
134  // update mom(x)
135  Link mom = arg.mom(dir, idx, parity);
136  mom = mom - arg.coeff * linkA;
137  makeAntiHerm(mom);
138  arg.mom(dir, idx, parity) = mom;
139  return;
140  }
141 
142  template <typename Float, typename Arg>
143  void GaugeForceCPU(Arg &arg) {
144  for (int dir=0; dir<4; dir++) {
145  for (int parity=0; parity<2; parity++) {
146  for (int idx=0; idx<arg.threads; idx++) {
147  switch(dir) {
148  case 0:
149  GaugeForceKernel<Float,Arg,0>(arg, idx, parity);
150  break;
151  case 1:
152  GaugeForceKernel<Float,Arg,1>(arg, idx, parity);
153  break;
154  case 2:
155  GaugeForceKernel<Float,Arg,2>(arg, idx, parity);
156  break;
157  case 3:
158  GaugeForceKernel<Float,Arg,3>(arg, idx, parity);
159  break;
160  }
161  }
162  }
163  }
164  return;
165  }
166 
167  template <typename Float, typename Arg>
168  __global__ void GaugeForceGPU(Arg arg) {
169  int idx = blockIdx.x * blockDim.x + threadIdx.x;
170  if (idx >= arg.threads) return;
171  int parity = blockIdx.y * blockDim.y + threadIdx.y;
172  int dir = blockIdx.z * blockDim.z + threadIdx.z;
173  switch(dir) {
174  case 0:
175  GaugeForceKernel<Float,Arg,0>(arg, idx, parity);
176  break;
177  case 1:
178  GaugeForceKernel<Float,Arg,1>(arg, idx, parity);
179  break;
180  case 2:
181  GaugeForceKernel<Float,Arg,2>(arg, idx, parity);
182  break;
183  case 3:
184  GaugeForceKernel<Float,Arg,3>(arg, idx, parity);
185  break;
186  }
187  return;
188  }
189 
190  template <typename Float, typename Arg>
191  class GaugeForce : public TunableVectorY {
192 
193  private:
194  Arg &arg;
195  QudaFieldLocation location;
196  const char *vol_str;
197  unsigned int sharedBytesPerThread() const { return 4; } // for dynamic indexing array
198  unsigned int minThreads() const { return arg.threads; }
199  bool tuneGridDim() const { return false; } // don't tune the grid dimension
200 
201  public:
202  GaugeForce(Arg &arg, const GaugeField &meta_mom, const GaugeField &meta_u)
203  : TunableVectorY(2), arg(arg), location(meta_mom.Location()), vol_str(meta_mom.VolString()) { }
204  virtual ~GaugeForce() { }
205 
206  void apply(const cudaStream_t &stream) {
207  if (location == QUDA_CUDA_FIELD_LOCATION) {
208  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
209  GaugeForceGPU<Float,Arg><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
210  } else {
211  GaugeForceCPU<Float,Arg>(arg);
212  }
213  }
214 
215  void preTune() { arg.mom.save(); }
216  void postTune() { arg.mom.load(); }
217 
218  long long flops() const { return (arg.count - arg.num_paths + 1) * 198ll * 2 * arg.mom.volumeCB * 4; }
219  long long bytes() const { return ((arg.count + 1ll) * arg.u.Bytes() + 2ll*arg.mom.Bytes()) * 2 * arg.mom.volumeCB * 4; }
220 
221  TuneKey tuneKey() const {
222  std::stringstream aux;
223  char comm[5];
224  comm[0] = (commDimPartitioned(0) ? '1' : '0');
225  comm[1] = (commDimPartitioned(1) ? '1' : '0');
226  comm[2] = (commDimPartitioned(2) ? '1' : '0');
227  comm[3] = (commDimPartitioned(3) ? '1' : '0');
228  comm[4] = '\0';
229  aux << "comm=" << comm << ",threads=" << arg.threads << ",num_paths=" << arg.num_paths;
230  return TuneKey(vol_str, typeid(*this).name(), aux.str().c_str());
231  }
232 
233  bool advanceBlockDim(TuneParam &param) const {
234  dim3 block = param.block;
235  dim3 grid = param.grid;
236  bool rtn = TunableVectorY::advanceBlockDim(param);
237  param.block.z = block.z;
238  param.grid.z = grid.z;
239 
240  if (!rtn) {
241  if (param.block.z < 4) {
242  param.block.z *= 2;
243  param.grid.z = 4 / param.block.z;
244  rtn = true;
245  } else {
246  param.block.z = 1;
247  param.grid.z = 4;
248  rtn = false;
249  }
250  }
251  return rtn;
252  }
253 
254  void initTuneParam(TuneParam &param) const {
256  param.block.z = 1;
257  param.grid.z = 4;
258  }
259 
260  void defaultTuneParam(TuneParam &param) const {
262  param.block.z = 1;
263  param.grid.z = 4;
264  }
265  };
266 
267  template <typename Float, typename Mom, typename Gauge>
268  void gaugeForce(Mom mom, const Gauge &u, GaugeField& meta_mom, const GaugeField& meta_u, const double coeff,
269  int ***input_path, const int* length_h, const double* path_coeff_h, const int num_paths, const int path_max_length)
270  {
271  size_t bytes = num_paths*path_max_length*sizeof(int);
272  int *input_path_d[4];
273 
274  int count = 0;
275  for (int dir=0; dir<4; dir++) {
276  input_path_d[dir] = (int*)pool_device_malloc(bytes);
277  cudaMemset(input_path_d[dir], 0, bytes);
278 
279  int* input_path_h = (int*)safe_malloc(bytes);
280  memset(input_path_h, 0, bytes);
281 
282  // flatten the input_path array for copying to the device
283  for (int i=0; i < num_paths; i++) {
284  for (int j=0; j < length_h[i]; j++) {
285  input_path_h[i*path_max_length + j] = input_path[dir][i][j];
286  if (dir==0) count++;
287  }
288  }
289  qudaMemcpy(input_path_d[dir], input_path_h, bytes, cudaMemcpyHostToDevice);
290 
291  host_free(input_path_h);
292  }
293 
294  //length
295  int* length_d = (int*)pool_device_malloc(num_paths*sizeof(int));
296  qudaMemcpy(length_d, length_h, num_paths*sizeof(int), cudaMemcpyHostToDevice);
297 
298  //path_coeff
299  double* path_coeff_d = (double*)pool_device_malloc(num_paths*sizeof(double));
300  qudaMemcpy(path_coeff_d, path_coeff_h, num_paths*sizeof(double), cudaMemcpyHostToDevice);
301 
302  GaugeForceArg<Mom,Gauge> arg(mom, u, num_paths, path_max_length, coeff, input_path_d,
303  length_d, path_coeff_d, count, meta_mom, meta_u);
304  GaugeForce<Float,GaugeForceArg<Mom,Gauge> > gauge_force(arg, meta_mom, meta_u);
305  gauge_force.apply(0);
306  checkCudaError();
307 
308  pool_device_free(length_d);
309  pool_device_free(path_coeff_d);
310  for (int dir=0; dir<4; dir++) pool_device_free(input_path_d[dir]);
312  }
313 
314  template <typename Float>
315  void gaugeForce(GaugeField& mom, const GaugeField& u, const double coeff, int ***input_path,
316  const int* length, const double* path_coeff, const int num_paths, const int max_length)
317  {
318  if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
319  errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
320 
321  if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
322  typedef typename gauge::FloatNOrder<Float,18,2,11> M;
323  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
324  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
325  gaugeForce<Float,M,G>(M(mom), G(u), mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
326  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
327  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
328  gaugeForce<Float,M,G>(M(mom), G(u), mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
329  } else {
330  errorQuda("Reconstruction type %d not supported", u.Reconstruct());
331  }
332  } else {
333  errorQuda("Gauge Field order %d not supported", mom.Order());
334  }
335 
336  }
337 #endif // GPU_GAUGE_FORCE
338 
339 
340  void gaugeForce(GaugeField& mom, const GaugeField& u, double coeff, int ***input_path,
341  int *length, double *path_coeff, int num_paths, int max_length)
342  {
343 #ifdef GPU_GAUGE_FORCE
344  if (mom.Precision() != u.Precision()) errorQuda("Mixed precision not supported");
345  if (mom.Location() != u.Location()) errorQuda("Mixed field locations not supported");
346 
347  switch(mom.Precision()) {
349  gaugeForce<double>(mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
350  break;
352  gaugeForce<float>(mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
353  break;
354  default:
355  errorQuda("Unsupported precision %d", mom.Precision());
356  }
357 #else
358  errorQuda("Gauge force has not been built");
359 #endif // GPU_GAUGE_FORCE
360  }
361 
362 } // namespace quda
363 
364 
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:33
int commDimPartitioned(int dir)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
#define host_free(ptr)
Definition: malloc_quda.h:71
cudaStream_t * stream
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:466
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:440
int E[4]
Definition: test_util.cpp:35
int length[]
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
Definition: gauge_force.cu:340
QudaGaugeParam param
Definition: pack_test.cpp:17
#define qudaDeviceSynchronize()
#define pool_device_malloc(size)
Definition: malloc_quda.h:125
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:474
Main header file for host and device accessors to GaugeFields.
int X[4]
Definition: covdev_test.cpp:70
const char * vol_str
Definition: copy_quda.cu:27
#define safe_malloc(size)
Definition: malloc_quda.h:66
void * memset(void *s, int c, size_t n)
QudaFieldLocation Location() const
enum QudaFieldLocation_s QudaFieldLocation
__shared__ float s[]
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:746
#define pool_device_free(ptr)
Definition: malloc_quda.h:126
#define checkCudaError()
Definition: util_quda.h:161
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:90
QudaParity parity
Definition: covdev_test.cpp:54
unsigned long long bytes
Definition: blas_quda.cu:23
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.