QUDA v0.7.0
A library for QCD on GPUs
gauge_force_quda.cu
#include <read_gauge.h>
#include <gauge_field.h>
#include <clover_field.h>
#include <dslash_quda.h>
#include <force_common.h>

#ifdef MULTI_GPU
#include "face_quda.h"
#endif
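
// This file instantiates the GPU gauge-force kernels (one per precision and
// link-reconstruction combination) and provides the host-side wrappers that
// copy the path tables to the device and launch the kernels.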

namespace quda {

  namespace gaugeforce {
#include <dslash_constants.h>
#include <dslash_textures.h>
  } // namespace gaugeforce

  using namespace gaugeforce;


#define GF_SITE_MATRIX_LOAD_TEX 1

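  // Kernel instantiation: for each (precision, reconstruction) combination the
  // load/reconstruct macros and the kernel name are defined and
  // gauge_force_core.h is included to generate the corresponding
  // parity_compute_gauge_force_kernel_* variant; the macros are then undefined
  // so the next variant can redefine them.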
  //single precision, 12-reconstruct
#if (GF_SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink0TexSingle_recon, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink1TexSingle_recon, dir, idx, var, gf.site_ga_stride)
#else
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkEven, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkOdd, dir, idx, var, gf.site_ga_stride)
#endif
#define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
#define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
#define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4
#define N_IN_FLOATN 4
#define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp12
#include "gauge_force_core.h"
#undef LOAD_EVEN_MATRIX
#undef LOAD_ODD_MATRIX
#undef LOAD_ANTI_HERMITIAN
#undef RECONSTRUCT_MATRIX
#undef DECLARE_LINK_VARS
#undef N_IN_FLOATN
#undef GAUGE_FORCE_KERN_NAME

  //double precision, 12-reconstruct
#if (GF_SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
#else
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkEven, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkOdd, dir, idx, var, gf.site_ga_stride)
#endif
#define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
#define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
#define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
#define N_IN_FLOATN 2
#define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp12
#include "gauge_force_core.h"
#undef LOAD_EVEN_MATRIX
#undef LOAD_ODD_MATRIX
#undef LOAD_ANTI_HERMITIAN
#undef RECONSTRUCT_MATRIX
#undef DECLARE_LINK_VARS
#undef N_IN_FLOATN
#undef GAUGE_FORCE_KERN_NAME

  //single precision, 18-reconstruct
#if (GF_SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink0TexSingle, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink1TexSingle, dir, idx, var, gf.site_ga_stride)
#else
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
#endif
#define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
#define RECONSTRUCT_MATRIX(sign, var)
#define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
#define N_IN_FLOATN 2
#define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp18
#include "gauge_force_core.h"
#undef LOAD_EVEN_MATRIX
#undef LOAD_ODD_MATRIX
#undef LOAD_ANTI_HERMITIAN
#undef RECONSTRUCT_MATRIX
#undef DECLARE_LINK_VARS
#undef N_IN_FLOATN
#undef GAUGE_FORCE_KERN_NAME

  //double precision, 18-reconstruct
#if (GF_SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
#else
#define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
#define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
#endif
#define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
#define RECONSTRUCT_MATRIX(sign, var)
#define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
#define N_IN_FLOATN 2
#define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp18
#include "gauge_force_core.h"
#undef LOAD_EVEN_MATRIX
#undef LOAD_ODD_MATRIX
#undef LOAD_ANTI_HERMITIAN
#undef RECONSTRUCT_MATRIX
#undef DECLARE_LINK_VARS
#undef N_IN_FLOATN
#undef GAUGE_FORCE_KERN_NAME

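  // One-time initialization: copy the maximum path length and the site-link
  // and momentum strides into the device constant 'gf' read by the kernels.
  // A static flag ensures the copy happens only once.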
  void
  gauge_force_init_cuda(QudaGaugeParam* param, int path_max_length)
  {

    static int gauge_force_init_cuda_flag = 0;
    if (gauge_force_init_cuda_flag){
      return;
    }
    gauge_force_init_cuda_flag=1;

    int* X = param->X;

    int Vh = X[0]*X[1]*X[2]*X[3]/2;
    fat_force_const_t gf_h;
    gf_h.path_max_length = path_max_length;
#ifdef MULTI_GPU
    int Vh_ex = (X[0]+4)*(X[1]+4)*(X[2]+4)*(X[3]+4)/2; // FIXME - this should not be hardcoded
    gf_h.site_ga_stride = param->site_ga_pad + Vh_ex;
#else
    gf_h.site_ga_stride = param->site_ga_pad + Vh;
#endif

    gf_h.mom_ga_stride = param->mom_ga_pad + Vh;
    cudaMemcpyToSymbol(gf, &gf_h, sizeof(fat_force_const_t));
  }


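  // Tunable wrapper around the gauge-force kernels. The constructor binds the
  // site-link field to the texture matching its precision and reconstruction;
  // preTune()/postTune() back up and restore the momentum field so it is
  // preserved across autotuning sweeps.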
  class GaugeForceCuda : public Tunable {

  private:
    cudaGaugeField &mom;
    const int dir;
    const double eb3;
    const cudaGaugeField &link;
    const int *input_path;
    const int *length;
    const double *path_coeff;
    const int num_paths;
    const kernel_param_t &kparam;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    // don't tune the grid dimension
    bool tuneGridDim() const { return false; }
    unsigned int minThreads() const { return kparam.threads; }

  public:
    GaugeForceCuda(cudaGaugeField &mom, const int dir, const double &eb3, const cudaGaugeField &link,
                   const int *input_path, const int *length, const double *path_coeff,
                   const int num_paths, const kernel_param_t &kparam) :
      mom(mom), dir(dir), eb3(eb3), link(link), input_path(input_path), length(length),
      path_coeff(path_coeff), num_paths(num_paths), kparam(kparam) {

      if(link.Precision() == QUDA_DOUBLE_PRECISION){
        cudaBindTexture(0, siteLink0TexDouble, link.Even_p(), link.Bytes()/2);
        cudaBindTexture(0, siteLink1TexDouble, link.Odd_p(), link.Bytes()/2);
      }else{ //QUDA_SINGLE_PRECISION
        if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
          cudaBindTexture(0, siteLink0TexSingle, link.Even_p(), link.Bytes()/2);
          cudaBindTexture(0, siteLink1TexSingle, link.Odd_p(), link.Bytes()/2);
        }else{//QUDA_RECONSTRUCT_12
          cudaBindTexture(0, siteLink0TexSingle_recon, link.Even_p(), link.Bytes()/2);
          cudaBindTexture(0, siteLink1TexSingle_recon, link.Odd_p(), link.Bytes()/2);
        }
      }
    }

    virtual ~GaugeForceCuda() {
      if(link.Precision() == QUDA_DOUBLE_PRECISION){
        cudaUnbindTexture(siteLink0TexDouble);
        cudaUnbindTexture(siteLink1TexDouble);
      }else{ //QUDA_SINGLE_PRECISION
        if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
          cudaUnbindTexture(siteLink0TexSingle);
          cudaUnbindTexture(siteLink1TexSingle);
        }else{//QUDA_RECONSTRUCT_12
          cudaUnbindTexture(siteLink0TexSingle_recon);
          cudaUnbindTexture(siteLink1TexSingle_recon);
        }
      }
    }

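    // Select the kernel variant matching the link precision and reconstruction,
    // and launch it twice, once per parity (the first template argument is the
    // odd bit).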
    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      if(link.Precision() == QUDA_DOUBLE_PRECISION){
        if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
          parity_compute_gauge_force_kernel_dp18<0,double><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
                                                                                  dir, eb3,
                                                                                  (double2*)link.Even_p(), (double2*)link.Odd_p(),
                                                                                  input_path, length, path_coeff,
                                                                                  num_paths, kparam);
          parity_compute_gauge_force_kernel_dp18<1,double><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
                                                                                  dir, eb3,
                                                                                  (double2*)link.Even_p(), (double2*)link.Odd_p(),
                                                                                  input_path, length, path_coeff,
                                                                                  num_paths, kparam);

        }else{ //QUDA_RECONSTRUCT_12
          parity_compute_gauge_force_kernel_dp12<0,double><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
                                                                                  dir, eb3,
                                                                                  (double2*)link.Even_p(), (double2*)link.Odd_p(),
                                                                                  input_path, length, path_coeff,
                                                                                  num_paths, kparam);
          parity_compute_gauge_force_kernel_dp12<1,double><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
                                                                                  dir, eb3,
                                                                                  (double2*)link.Even_p(), (double2*)link.Odd_p(),
                                                                                  input_path, length, path_coeff,
                                                                                  num_paths, kparam);
        }
      }else{ //QUDA_SINGLE_PRECISION
        if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){

          parity_compute_gauge_force_kernel_sp18<0,float><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
                                                                                 dir, eb3,
                                                                                 (float2*)link.Even_p(), (float2*)link.Odd_p(),
                                                                                 input_path, length, path_coeff,
                                                                                 num_paths, kparam);
          parity_compute_gauge_force_kernel_sp18<1,float><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
                                                                                 dir, eb3,
                                                                                 (float2*)link.Even_p(), (float2*)link.Odd_p(),
                                                                                 input_path, length, path_coeff,
                                                                                 num_paths, kparam);

        }else{ //QUDA_RECONSTRUCT_12
          parity_compute_gauge_force_kernel_sp12<0,float><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
                                                                                 dir, eb3,
                                                                                 (float4*)link.Even_p(), (float4*)link.Odd_p(),
                                                                                 input_path, length, path_coeff,
                                                                                 num_paths, kparam);
          //odd
          /* The reason we do not switch the even/odd function input parameters and the
           * texture binding is that we use the odd bit to decide where to load in the
           * kernel function.
           */
          parity_compute_gauge_force_kernel_sp12<1,float><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
                                                                                 dir, eb3,
                                                                                 (float4*)link.Even_p(), (float4*)link.Odd_p(),
                                                                                 input_path, length, path_coeff,
                                                                                 num_paths, kparam);
        }
      }
    }

    void preTune() { mom.backup(); }
    void postTune() { mom.restore(); }

    long long flops() const { return 0; } // FIXME: add flops counter

    TuneKey tuneKey() const {
      std::stringstream vol, aux;
      vol << link.X()[0] << "x";
      vol << link.X()[1] << "x";
      vol << link.X()[2] << "x";
      vol << link.X()[3];
      aux << "threads=" << link.Volume() << ",prec=" << link.Precision();
      aux << ",stride=" << link.Stride() << ",recon=" << link.Reconstruct();
      aux << ",dir=" << dir << ",num_paths=" << num_paths;
      return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
    }

  };

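  // Compute the gauge-force contribution for a single direction: flatten the
  // host path table into a zero-padded num_paths x max_length array, copy the
  // paths, lengths and coefficients to the device, and run the tuned kernel.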
  void
  gauge_force_cuda_dir(cudaGaugeField& cudaMom, const int dir, const double eb3, const cudaGaugeField& cudaSiteLink,
                       const QudaGaugeParam* param, int** input_path, const int* length, const double* path_coeff,
                       const int num_paths, const int max_length)
  {
    //input_path
    size_t bytes = num_paths*max_length*sizeof(int);

    int *input_path_d = (int *) device_malloc(bytes);
    cudaMemset(input_path_d, 0, bytes);
    checkCudaError();

    int* input_path_h = (int *) safe_malloc(bytes);
    memset(input_path_h, 0, bytes);

    for(int i=0; i < num_paths; i++) {
      for(int j=0; j < length[i]; j++) {
        input_path_h[i*max_length + j] = input_path[i][j];
      }
    }

    cudaMemcpy(input_path_d, input_path_h, bytes, cudaMemcpyHostToDevice);

    //length
    int* length_d = (int *) device_malloc(num_paths*sizeof(int));
    cudaMemcpy(length_d, length, num_paths*sizeof(int), cudaMemcpyHostToDevice);

    //path_coeff
    void* path_coeff_d = device_malloc(num_paths*sizeof(double));
    cudaMemcpy(path_coeff_d, path_coeff, num_paths*sizeof(double), cudaMemcpyHostToDevice);

    //compute the gauge forces
    int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];

    kernel_param_t kparam;
#ifdef MULTI_GPU
    for(int i=0; i<4; i++) {
      kparam.ghostDim[i] = commDimPartitioned(i);
    }
#endif
    kparam.threads = volume/2;

    GaugeForceCuda gaugeForce(cudaMom, dir, eb3, cudaSiteLink, input_path_d,
                              length_d, reinterpret_cast<double*>(path_coeff_d), num_paths, kparam);
    gaugeForce.apply(0);
    checkCudaError();

    host_free(input_path_h);
    device_free(input_path_d);
    device_free(length_d);
    device_free(path_coeff_d);
  }


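  // Top-level entry point: compute the gauge-force contribution to the
  // momentum field cudaMom by calling gauge_force_cuda_dir for each of the
  // four directions.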
  void
  gauge_force_cuda(cudaGaugeField& cudaMom, double eb3, cudaGaugeField& cudaSiteLink,
                   QudaGaugeParam* param, int*** input_path,
                   int* length, double* path_coeff, int num_paths, int max_length)
  {
    for(int dir=0; dir < 4; dir++){
      gauge_force_cuda_dir(cudaMom, dir, eb3, cudaSiteLink, param, input_path[dir],
                           length, path_coeff, num_paths, max_length);
    }
  }

} // namespace quda