QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
gauge_force_quda.cu
Go to the documentation of this file.
1 #include <read_gauge.h>
2 #include <gauge_field.h>
3 
5 #ifdef MULTI_GPU
6 #include "face_quda.h"
7 #endif
8 
9 namespace quda {
10 
11 #define GF_SITE_MATRIX_LOAD_TEX 1
12 
13  //single precsison, 12-reconstruct
14 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
15 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink0TexSingle_recon, dir, idx, var, gf.site_ga_stride)
16 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink1TexSingle_recon, dir, idx, var, gf.site_ga_stride)
17 #else
18 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkEven, dir, idx, var, gf.site_ga_stride)
19 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkOdd, dir, idx, var, gf.site_ga_stride)
20 #endif
21 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
22 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
23 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4
24 #define N_IN_FLOATN 4
25 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp12
26 #include "gauge_force_core.h"
27 #undef LOAD_EVEN_MATRIX
28 #undef LOAD_ODD_MATRIX
29 #undef LOAD_ANTI_HERMITIAN
30 #undef RECONSTRUCT_MATRIX
31 #undef DECLARE_LINK_VARS
32 #undef N_IN_FLOATN
33 #undef GAUGE_FORCE_KERN_NAME
34 
35  //double precsison, 12-reconstruct
36 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
37 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
38 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
39 #else
40 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkEven, dir, idx, var, gf.site_ga_stride)
41 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkOdd, dir, idx, var, gf.site_ga_stride)
42 #endif
43 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
44 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
45 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
46 #define N_IN_FLOATN 2
47 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp12
48 #include "gauge_force_core.h"
49 #undef LOAD_EVEN_MATRIX
50 #undef LOAD_ODD_MATRIX
51 #undef LOAD_ANTI_HERMITIAN
52 #undef RECONSTRUCT_MATRIX
53 #undef DECLARE_LINK_VARS
54 #undef N_IN_FLOATN
55 #undef GAUGE_FORCE_KERN_NAME
56 
57  //single precision, 18-reconstruct
58 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
59 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink0TexSingle, dir, idx, var, gf.site_ga_stride)
60 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink1TexSingle, dir, idx, var, gf.site_ga_stride)
61 #else
62 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
63 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
64 #endif
65 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var,gf.mom_ga_stride)
66 #define RECONSTRUCT_MATRIX(sign, var)
67 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
68 #define N_IN_FLOATN 2
69 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp18
71 #undef LOAD_EVEN_MATRIX
72 #undef LOAD_ODD_MATRIX
73 #undef LOAD_ANTI_HERMITIAN
74 #undef RECONSTRUCT_MATRIX
75 #undef DECLARE_LINK_VARS
76 #undef N_IN_FLOATN
77 #undef GAUGE_FORCE_KERN_NAME
78 
79  //double precision, 18-reconstruct
80 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
81 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
82 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
83 #else
84 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
85 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
86 #endif
87 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
88 #define RECONSTRUCT_MATRIX(sign, var)
89 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
90 #define N_IN_FLOATN 2
91 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp18
92 #include "gauge_force_core.h"
93 #undef LOAD_EVEN_MATRIX
94 #undef LOAD_ODD_MATRIX
95 #undef LOAD_ANTI_HERMITIAN
96 #undef RECONSTRUCT_MATRIX
97 #undef DECLARE_LINK_VARS
98 #undef N_IN_FLOATN
99 #undef GAUGE_FORCE_KERN_NAME
100 
101  void
103  {
104 
105  static int gauge_force_init_cuda_flag = 0;
106  if (gauge_force_init_cuda_flag){
107  return;
108  }
109  gauge_force_init_cuda_flag=1;
110 
111  int* X = param->X;
112 
113  int Vh = X[0]*X[1]*X[2]*X[3]/2;
114  fat_force_const_t gf_h;
115  gf_h.path_max_length = path_max_length;
116 #ifdef MULTI_GPU
117  int Vh_ex = (X[0]+4)*(X[1]+4)*(X[2]+4)*(X[3]+4)/2;
118  gf_h.site_ga_stride = param->site_ga_pad + Vh_ex;
119 #else
120  gf_h.site_ga_stride = param->site_ga_pad + Vh;
121 #endif
122 
123  gf_h.mom_ga_stride = param->mom_ga_pad + Vh;
124  cudaMemcpyToSymbol(gf, &gf_h, sizeof(fat_force_const_t));
125  }
126 
127 
128  class GaugeForceCuda : public Tunable {
129 
130  private:
131  cudaGaugeField &mom;
132  const int dir;
133  const double &eb3;
134  const cudaGaugeField &link;
135  const int *input_path;
136  const int *length;
137  const void *path_coeff;
138  const int num_paths;
139  const kernel_param_t &kparam;
140 
141  int sharedBytesPerThread() const { return 0; }
142  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
143 
144  // don't tune the grid dimension
145  bool advanceGridDim(TuneParam &param) const { return false; }
146  bool advanceBlockDim(TuneParam &param) const {
147  bool rtn = Tunable::advanceBlockDim(param);
148  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
149  return rtn;
150  }
151 
152  public:
153  GaugeForceCuda(cudaGaugeField &mom, const int dir, const double &eb3, const cudaGaugeField &link,
154  const int *input_path, const int *length, const void *path_coeff,
155  const int num_paths, const kernel_param_t &kparam) :
156  mom(mom), dir(dir), eb3(eb3), link(link), input_path(input_path), length(length),
157  path_coeff(path_coeff), num_paths(num_paths), kparam(kparam) {
158 
159  if(link.Precision() == QUDA_DOUBLE_PRECISION){
160  cudaBindTexture(0, siteLink0TexDouble, link.Even_p(), link.Bytes()/2);
161  cudaBindTexture(0, siteLink1TexDouble, link.Odd_p(), link.Bytes()/2);
162  }else{ //QUDA_SINGLE_PRECISION
163  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
164  cudaBindTexture(0, siteLink0TexSingle, link.Even_p(), link.Bytes()/2);
165  cudaBindTexture(0, siteLink1TexSingle, link.Odd_p(), link.Bytes()/2);
166  }else{//QUDA_RECONSTRUCT_12
167  cudaBindTexture(0, siteLink0TexSingle_recon, link.Even_p(), link.Bytes()/2);
168  cudaBindTexture(0, siteLink1TexSingle_recon, link.Odd_p(), link.Bytes()/2);
169  }
170  }
171  }
172 
173  virtual ~GaugeForceCuda() {
174  if(link.Precision() == QUDA_DOUBLE_PRECISION){
175  cudaBindTexture(0, siteLink0TexDouble, link.Even_p(), link.Bytes()/2);
176  cudaBindTexture(0, siteLink1TexDouble, link.Odd_p(), link.Bytes()/2);
177  }else{ //QUDA_SINGLE_PRECISION
178  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
179  cudaBindTexture(0, siteLink0TexSingle, link.Even_p(), link.Bytes()/2);
180  cudaBindTexture(0, siteLink1TexSingle, link.Odd_p(), link.Bytes()/2);
181  }else{//QUDA_RECONSTRUCT_12
182  cudaBindTexture(0, siteLink0TexSingle_recon, link.Even_p(), link.Bytes()/2);
183  cudaBindTexture(0, siteLink1TexSingle_recon, link.Odd_p(), link.Bytes()/2);
184  }
185  }
186  }
187 
188  void apply(const cudaStream_t &stream) {
189  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
190  if(link.Precision() == QUDA_DOUBLE_PRECISION){
191  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
192  parity_compute_gauge_force_kernel_dp18<0><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
193  dir, eb3,
194  (double2*)link.Even_p(), (double2*)link.Odd_p(),
195  input_path, length, (double*)path_coeff,
196  num_paths, kparam);
197  parity_compute_gauge_force_kernel_dp18<1><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
198  dir, eb3,
199  (double2*)link.Even_p(), (double2*)link.Odd_p(),
200  input_path, length, (double*)path_coeff,
201  num_paths, kparam);
202 
203  }else{ //QUDA_RECONSTRUCT_12
204  parity_compute_gauge_force_kernel_dp12<0><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
205  dir, eb3,
206  (double2*)link.Even_p(), (double2*)link.Odd_p(),
207  input_path, length, (double*)path_coeff,
208  num_paths, kparam);
209  parity_compute_gauge_force_kernel_dp12<1><<<tp.grid, tp.block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
210  dir, eb3,
211  (double2*)link.Even_p(), (double2*)link.Odd_p(),
212  input_path, length, (double*)path_coeff,
213  num_paths, kparam);
214  }
215  }else{ //QUDA_SINGLE_PRECISION
216  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
217 
218  parity_compute_gauge_force_kernel_sp18<0><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
219  dir, eb3,
220  (float2*)link.Even_p(), (float2*)link.Odd_p(),
221  input_path, length, (float*)path_coeff,
222  num_paths, kparam);
223  parity_compute_gauge_force_kernel_sp18<1><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
224  dir, eb3,
225  (float2*)link.Even_p(), (float2*)link.Odd_p(),
226  input_path, length, (float*)path_coeff,
227  num_paths, kparam);
228 
229  }else{ //QUDA_RECONSTRUCT_12
230  parity_compute_gauge_force_kernel_sp12<0><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
231  dir, eb3,
232  (float4*)link.Even_p(), (float4*)link.Odd_p(),
233  input_path, length, (float*)path_coeff,
234  num_paths, kparam);
235  //odd
236  /* The reason we do not switch the even/odd function input paramemters and the texture binding
237  * is that we use the oddbit to decided where to load, in the kernel function
238  */
239  parity_compute_gauge_force_kernel_sp12<1><<<tp.grid, tp.block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
240  dir, eb3,
241  (float4*)link.Even_p(), (float4*)link.Odd_p(),
242  input_path, length, (float*)path_coeff,
243  num_paths, kparam);
244  }
245  }
246  }
247 
248  void preTune() { mom.backup(); }
249  void postTune() { mom.restore(); }
250 
251  void initTuneParam(TuneParam &param) const {
252  Tunable::initTuneParam(param);
253  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
254  }
255 
257  void defaultTuneParam(TuneParam &param) const {
259  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
260  }
261 
262  long long flops() const { return 0; } // FIXME: add flops counter
263 
264  TuneKey tuneKey() const {
265  std::stringstream vol, aux;
266  vol << link.X()[0] << "x";
267  vol << link.X()[1] << "x";
268  vol << link.X()[2] << "x";
269  vol << link.X()[3] << "x";
270  aux << "threads=" << link.Volume() << ",prec=" << link.Precision();
271  aux << "stride=" << link.Stride() << ",recon=" << link.Reconstruct();
272  aux << "dir=" << dir << "num_paths=" << num_paths;
273  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
274  }
275 
276  };
277 
278  void
279  gauge_force_cuda_dir(cudaGaugeField& cudaMom, const int dir, const double eb3, const cudaGaugeField& cudaSiteLink,
280  const QudaGaugeParam* param, int** input_path, const int* length, const void* path_coeff,
281  const int num_paths, const int max_length)
282  {
283  //input_path
284  size_t bytes = num_paths*max_length*sizeof(int);
285 
286  int *input_path_d = (int *) device_malloc(bytes);
287  cudaMemset(input_path_d, 0, bytes);
288  checkCudaError();
289 
290  int* input_path_h = (int *) safe_malloc(bytes);
291  memset(input_path_h, 0, bytes);
292 
293  for(int i=0; i < num_paths; i++) {
294  for(int j=0; j < length[i]; j++) {
295  input_path_h[i*max_length + j] = input_path[i][j];
296  }
297  }
298 
299  cudaMemcpy(input_path_d, input_path_h, bytes, cudaMemcpyHostToDevice);
300 
301  //length
302  int* length_d = (int *) device_malloc(num_paths*sizeof(int));
303  cudaMemcpy(length_d, length, num_paths*sizeof(int), cudaMemcpyHostToDevice);
304 
305  //path_coeff
306  int gsize = param->cuda_prec;
307  void* path_coeff_d = device_malloc(num_paths*gsize);
308  cudaMemcpy(path_coeff_d, path_coeff, num_paths*gsize, cudaMemcpyHostToDevice);
309 
310  //compute the gauge forces
311  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
312 
314 #ifdef MULTI_GPU
315  for(int i=0; i<4; i++) {
316  kparam.ghostDim[i] = commDimPartitioned(i);
317  }
318 #endif
319  kparam.threads = volume/2;
320 
321  GaugeForceCuda gaugeForce(cudaMom, dir, eb3, cudaSiteLink, input_path_d,
322  length_d, path_coeff_d, num_paths, kparam);
323  gaugeForce.apply(0);
324  checkCudaError();
325 
326  host_free(input_path_h);
327  device_free(input_path_d);
328  device_free(length_d);
329  device_free(path_coeff_d);
330  }
331 
332 
333  void
335  QudaGaugeParam* param, int*** input_path,
336  int* length, void* path_coeff, int num_paths, int max_length)
337  {
338  for(int dir=0; dir < 4; dir++){
339  gauge_force_cuda_dir(cudaMom, dir, eb3, cudaSiteLink, param, input_path[dir],
340  length, path_coeff, num_paths, max_length);
341  }
342  }
343 
344 } // namespace quda