QUDA  0.9.0
gauge_force.cu
Go to the documentation of this file.
1 #include <gauge_field_order.h>
2 #include <quda_matrix.h>
3 #include <index_helper.cuh>
4 #include <generics/ldg.h>
5 
6 namespace quda {
7 
8 #ifdef GPU_GAUGE_FORCE
9 
 /**
    Argument container passed (by value on the device) to the gauge-force
    kernels.  Bundles the momentum/gauge field accessors, the regular and
    extended lattice geometry, and the device-side path tables.
 */
 template <typename Mom, typename Gauge>
 struct GaugeForceArg {
 Mom mom;       // accessor for the momentum field being updated
 const Gauge u; // accessor for the (possibly extended) gauge field

 int threads;   // number of active threads = checkerboard volume of the momentum field
 int X[4]; // the regular volume parameters
 int E[4]; // the extended volume parameters
 int border[4]; // radius of border

 int num_paths;       // number of loop paths per direction
 int path_max_length; // padded (allocated) length of each path row

 double coeff; // overall coefficient multiplying the force contribution

 // per-direction path tables; per the _d suffix these are presumably
 // device pointers filled by the host wrapper — confirm against caller
 const int *input_path_d[4];
 const int *length_d;      // length of each of the num_paths paths
 const double *path_coeff_d; // per-path coefficient

 int count; // equal to sum of all path lengths. Used a convenience for computing perf

 GaugeForceArg(Mom &mom, const Gauge &u, int num_paths, int path_max_length, double coeff,
 int **input_path_d, const int *length_d, const double* path_coeff_d, int count,
 const GaugeField &meta_mom, const GaugeField &meta_u)
 : mom(mom), u(u), threads(meta_mom.VolumeCB()), num_paths(num_paths),
 path_max_length(path_max_length), coeff(coeff),
 input_path_d{ input_path_d[0], input_path_d[1], input_path_d[2], input_path_d[3] },
 length_d(length_d), path_coeff_d(path_coeff_d), count(count)
 {
 // regular geometry from the momentum field, extended geometry from the
 // gauge field; the border radius is half the difference per dimension
 for(int i=0; i<4; i++) {
 X[i] = meta_mom.X()[i];
 E[i] = meta_u.X()[i];
 border[i] = (E[i] - X[i])/2;
 }
 }

 virtual ~GaugeForceArg() { }
 };
48 
49  __device__ __host__ inline static int flipDir(int dir) { return (7-dir); }
50  __device__ __host__ inline static bool isForwards(int dir) { return (dir <= 3); }
51 
 // this ensures that array elements are held in cache
 /**
    Read-only indexed load.  On the device the access goes through __ldg so
    the value is fetched via the read-only data cache; on the host it is a
    plain array access.  The pointed-to data must not be written concurrently.
 */
 template <typename T>
 __device__ __host__ inline static T cache(const T *ptr, int idx) {
#ifdef __CUDA_ARCH__
 return __ldg(ptr+idx);
#else
 return ptr[idx];
#endif
 }
61 
 /**
    Core gauge-force computation for one site, one parity and one (compile
    time) direction.  Shared by the CPU and GPU drivers.

    For each path with non-zero coefficient, the product of links along the
    path (starting from the far end of the link U_dir(x)) is accumulated into
    a staple; the momentum is then updated as
      mom -= coeff * antihermitian( U_dir(x) * staple ).

    @param arg    kernel arguments (fields, geometry, path tables)
    @param idx    checkerboard site index within the regular volume
    @param parity parity (0/1) of the site
 */
 template<typename Float, typename Arg, int dir>
 __device__ __host__ inline void GaugeForceKernel(Arg &arg, int idx, int parity)
 {
 typedef Matrix<complex<Float>,3> Link;

 int x[4] = {0, 0, 0, 0};
 getCoords(x, idx, arg.X, parity);
 for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates

 //linkA: current matrix
 //linkB: the loaded matrix in this round
 // NOTE(review): staple is default-constructed and then read in
 // "staple = staple + coeff*linkA" — this relies on Matrix's default
 // constructor zero-initializing; confirm against quda_matrix.h
 Link linkA, linkB, staple;

#ifdef __CUDA_ARCH__
 // On the device the 4-entry coordinate-offset array dx is packed as four
 // signed chars inside one int of dynamic shared memory per thread (4 bytes
 // per thread), so that dynamic indexing dx[lnkdir] does not force a
 // local-memory spill of a register array.
 extern __shared__ int s[];
 int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
 s[tid] = 0;
 signed char *dx = (signed char*)&s[tid];
#else
 int dx[4] = {0, 0, 0, 0};
#endif

 for (int i=0; i<arg.num_paths; i++) {
 Float coeff = cache(arg.path_coeff_d,i);
 if (coeff == 0) continue; // skip paths that cannot contribute

 // i-th path for this direction; rows are padded to path_max_length
 const int* path = arg.input_path_d[dir] + i*arg.path_max_length;

 // start from end of link in direction dir
 int nbr_oddbit = (parity^1);
 dx[dir]++;

 // path entries 0..3 are forward hops, 4..7 backward hops (see isForwards/flipDir)
 int path0 = cache(path,0);
 int lnkdir = isForwards(path0) ? path0 : flipDir(path0);

 if (isForwards(path0)) {
 linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
 linkA = linkB;
 dx[lnkdir]++; // now have to update location
 nbr_oddbit = nbr_oddbit^1; // each hop flips the neighbour parity
 } else {
 dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
 nbr_oddbit = nbr_oddbit^1;
 linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
 linkA = conj(linkB); // backward hop uses the daggered link
 }

 // multiply in the remaining links of this path
 for (int j=1; j<cache(arg.length_d,i); j++) {

 int pathj = cache(path,j);
 int lnkdir = isForwards(pathj) ? pathj : flipDir(pathj);

 if (isForwards(pathj)) {
 linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
 linkA = linkA * linkB;
 dx[lnkdir]++; // now have to update to new location
 nbr_oddbit = nbr_oddbit^1;
 } else {
 dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
 nbr_oddbit = nbr_oddbit^1;
 linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
 linkA = linkA * conj(linkB);
 }
 } //j
 staple = staple + coeff*linkA;
 } //i

 // multiply by U(x)
 linkA = arg.u(dir, linkIndex(x,arg.E), parity);
 linkA = linkA * staple;

 // update mom(x)
 Link mom = arg.mom(dir, idx, parity);
 mom = mom - arg.coeff * linkA;
 makeAntiHerm(mom); // project onto the anti-hermitian (Lie algebra) part
 arg.mom(dir, idx, parity) = mom;
 return;
 }
140 
141  template <typename Float, typename Arg>
142  void GaugeForceCPU(Arg &arg) {
143  for (int dir=0; dir<4; dir++) {
144  for (int parity=0; parity<2; parity++) {
145  for (int idx=0; idx<arg.threads; idx++) {
146  switch(dir) {
147  case 0:
148  GaugeForceKernel<Float,Arg,0>(arg, idx, parity);
149  break;
150  case 1:
151  GaugeForceKernel<Float,Arg,1>(arg, idx, parity);
152  break;
153  case 2:
154  GaugeForceKernel<Float,Arg,2>(arg, idx, parity);
155  break;
156  case 3:
157  GaugeForceKernel<Float,Arg,3>(arg, idx, parity);
158  break;
159  }
160  }
161  }
162  }
163  return;
164  }
165 
 /**
    GPU entry point.  Thread geometry: x indexes the checkerboard site
    (guarded against the grid tail), y indexes parity and z indexes the
    direction, which is dispatched to a compile-time template parameter.

    NOTE(review): parity and dir are not bounds-checked here — this relies
    on the launch configuration guaranteeing y-extent <= 2 and z-extent <= 4
    (dir values outside 0..3 simply fall through the switch); confirm
    against the tuning class that launches this kernel.
 */
 template <typename Float, typename Arg>
 __global__ void GaugeForceGPU(Arg arg) {
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 if (idx >= arg.threads) return;
 int parity = blockIdx.y * blockDim.y + threadIdx.y;
 int dir = blockIdx.z * blockDim.z + threadIdx.z;
 switch(dir) {
 case 0:
 GaugeForceKernel<Float,Arg,0>(arg, idx, parity);
 break;
 case 1:
 GaugeForceKernel<Float,Arg,1>(arg, idx, parity);
 break;
 case 2:
 GaugeForceKernel<Float,Arg,2>(arg, idx, parity);
 break;
 case 3:
 GaugeForceKernel<Float,Arg,3>(arg, idx, parity);
 break;
 }
 return;
 }
188 
189  template <typename Float, typename Arg>
190  class GaugeForce : public TunableVectorY {
191 
192  private:
193  Arg &arg;
194  QudaFieldLocation location;
195  const char *vol_str;
196  unsigned int sharedBytesPerThread() const { return 4; } // for dynamic indexing array
197  unsigned int minThreads() const { return arg.threads; }
198  bool tuneGridDim() const { return false; } // don't tune the grid dimension
199 
200  public:
201  GaugeForce(Arg &arg, const GaugeField &meta_mom, const GaugeField &meta_u)
202  : TunableVectorY(2), arg(arg), location(meta_mom.Location()), vol_str(meta_mom.VolString()) { }
203  virtual ~GaugeForce() { }
204 
205  void apply(const cudaStream_t &stream) {
206  if (location == QUDA_CUDA_FIELD_LOCATION) {
207  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
208  GaugeForceGPU<Float,Arg><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
209  } else {
210  GaugeForceCPU<Float,Arg>(arg);
211  }
212  }
213 
214  void preTune() { arg.mom.save(); }
215  void postTune() { arg.mom.load(); }
216 
217  long long flops() const { return (arg.count - arg.num_paths + 1) * 198ll * 2 * arg.mom.volumeCB * 4; }
218  long long bytes() const { return ((arg.count + 1ll) * arg.u.Bytes() + 2ll*arg.mom.Bytes()) * 2 * arg.mom.volumeCB * 4; }
219 
220  TuneKey tuneKey() const {
221  std::stringstream aux;
222  char comm[5];
223  comm[0] = (commDimPartitioned(0) ? '1' : '0');
224  comm[1] = (commDimPartitioned(1) ? '1' : '0');
225  comm[2] = (commDimPartitioned(2) ? '1' : '0');
226  comm[3] = (commDimPartitioned(3) ? '1' : '0');
227  comm[4] = '\0';
228  aux << "comm=" << comm << ",threads=" << arg.threads << ",num_paths=" << arg.num_paths;
229  return TuneKey(vol_str, typeid(*this).name(), aux.str().c_str());
230  }
231 
232  bool advanceBlockDim(TuneParam &param) const {
233  dim3 block = param.block;
234  dim3 grid = param.grid;
236  param.block.z = block.z;
237  param.grid.z = grid.z;
238 
239  if (!rtn) {
240  if (param.block.z < 4) {
241  param.block.z *= 2;
242  param.grid.z = 4 / param.block.z;
243  rtn = true;
244  } else {
245  param.block.z = 1;
246  param.grid.z = 4;
247  rtn = false;
248  }
249  }
250  return rtn;
251  }
252 
253  void initTuneParam(TuneParam &param) const {
255  param.block.z = 1;
256  param.grid.z = 4;
257  }
258 
259  void defaultTuneParam(TuneParam &param) const {
261  param.block.z = 1;
262  param.grid.z = 4;
263  }
264  };
265 
266  template <typename Float, typename Mom, typename Gauge>
267  void gaugeForce(Mom mom, const Gauge &u, GaugeField& meta_mom, const GaugeField& meta_u, const double coeff,
268  int ***input_path, const int* length_h, const double* path_coeff_h, const int num_paths, const int path_max_length)
269  {
270  size_t bytes = num_paths*path_max_length*sizeof(int);
271  int *input_path_d[4];
272 
273  int count = 0;
274  for (int dir=0; dir<4; dir++) {
275  input_path_d[dir] = (int*)pool_device_malloc(bytes);
276  cudaMemset(input_path_d[dir], 0, bytes);
277 
278  int* input_path_h = (int*)safe_malloc(bytes);
279  memset(input_path_h, 0, bytes);
280 
281  // flatten the input_path array for copying to the device
282  for (int i=0; i < num_paths; i++) {
283  for (int j=0; j < length_h[i]; j++) {
284  input_path_h[i*path_max_length + j] = input_path[dir][i][j];
285  if (dir==0) count++;
286  }
287  }
288  qudaMemcpy(input_path_d[dir], input_path_h, bytes, cudaMemcpyHostToDevice);
289 
290  host_free(input_path_h);
291  }
292 
293  //length
294  int* length_d = (int*)pool_device_malloc(num_paths*sizeof(int));
295  qudaMemcpy(length_d, length_h, num_paths*sizeof(int), cudaMemcpyHostToDevice);
296 
297  //path_coeff
298  double* path_coeff_d = (double*)pool_device_malloc(num_paths*sizeof(double));
299  qudaMemcpy(path_coeff_d, path_coeff_h, num_paths*sizeof(double), cudaMemcpyHostToDevice);
300 
301  GaugeForceArg<Mom,Gauge> arg(mom, u, num_paths, path_max_length, coeff, input_path_d,
302  length_d, path_coeff_d, count, meta_mom, meta_u);
303  GaugeForce<Float,GaugeForceArg<Mom,Gauge> > gauge_force(arg, meta_mom, meta_u);
304  gauge_force.apply(0);
305  checkCudaError();
306 
307  pool_device_free(length_d);
308  pool_device_free(path_coeff_d);
309  for (int dir=0; dir<4; dir++) pool_device_free(input_path_d[dir]);
311  }
312 
313  template <typename Float>
314  void gaugeForce(GaugeField& mom, const GaugeField& u, const double coeff, int ***input_path,
315  const int* length, const double* path_coeff, const int num_paths, const int max_length)
316  {
317  if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
318  errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
319 
320  if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
321  typedef typename gauge::FloatNOrder<Float,18,2,11> M;
322  if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
323  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
324  gaugeForce<Float,M,G>(M(mom), G(u), mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
325  } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
326  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
327  gaugeForce<Float,M,G>(M(mom), G(u), mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
328  } else {
329  errorQuda("Reconstruction type %d not supported", u.Reconstruct());
330  }
331  } else {
332  errorQuda("Gauge Field order %d not supported", mom.Order());
333  }
334 
335  }
336 #endif // GPU_GAUGE_FORCE
337 
338 
339  void gaugeForce(GaugeField& mom, const GaugeField& u, double coeff, int ***input_path,
340  int *length, double *path_coeff, int num_paths, int max_length)
341  {
342 #ifdef GPU_GAUGE_FORCE
343  if (mom.Precision() != u.Precision()) errorQuda("Mixed precision not supported");
344  if (mom.Location() != u.Location()) errorQuda("Mixed field locations not supported");
345 
346  switch(mom.Precision()) {
348  gaugeForce<double>(mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
349  break;
351  gaugeForce<float>(mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
352  break;
353  default:
354  errorQuda("Unsupported precision %d", mom.Precision());
355  }
356 #else
357  errorQuda("Gauge force has not been built");
358 #endif // GPU_GAUGE_FORCE
359  }
360 
361 } // namespace quda
362 
363 
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
dim3 dim3 blockDim
int commDimPartitioned(int dir)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
#define host_free(ptr)
Definition: malloc_quda.h:59
cudaStream_t * stream
void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:382
bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:357
int E[4]
Definition: test_util.cpp:36
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
Definition: gauge_force.cu:339
QudaGaugeParam param
Definition: pack_test.cpp:17
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:390
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
const void * ptr
#define safe_malloc(size)
Definition: malloc_quda.h:54
const char * vol_str
Definition: blas_quda.cu:56
QudaFieldLocation Location() const
void * memset(void *__b, int __c, size_t __len)
enum QudaFieldLocation_s QudaFieldLocation
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:636
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)