QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
llfat_quda_itf.cpp
Go to the documentation of this file.
1 
2 #include <stdio.h>
3 #include <cuda_runtime.h>
4 #include <cuda.h>
5 
6 #include <quda_internal.h>
7 #include <read_gauge.h>
8 #include "gauge_field.h"
9 #include <force_common.h>
10 #include "llfat_quda.h"
11 #include <face_quda.h>
12 
13 #define BLOCK_DIM 64
14 
15 extern void exchange_gpu_staple_start(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream);
16 extern void exchange_gpu_staple_comms(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream);
17 extern void exchange_gpu_staple_wait(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream);
18 
19 namespace quda {
20 
21  void
23  cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
24  QudaGaugeParam* param, double* act_path_coeff)
25  {
26  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
27  int Vh = volume/2;
28  dim3 gridDim(volume/BLOCK_DIM,1,1);
29  dim3 halfGridDim(Vh/BLOCK_DIM,1,1);
30  dim3 blockDim(BLOCK_DIM , 1, 1);
31 
32  QudaPrecision prec = cudaSiteLink.Precision();
33  QudaReconstructType recon = cudaSiteLink.Reconstruct();
34 
35  if( ((param->X[0] % 2 != 0)
36  ||(param->X[1] % 2 != 0)
37  ||(param->X[2] % 2 != 0)
38  ||(param->X[3] % 2 != 0))
39  && (recon == QUDA_RECONSTRUCT_12)){
40  errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
41 
42  }
43 
44  int nStream=9;
45  cudaStream_t stream[nStream];
46  for(int i = 0;i < nStream; i++){
47  cudaStreamCreate(&stream[i]);
48  }
49 
50 
51  llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
52  param, act_path_coeff);
53 
54 
56  for(int i=0;i < 4;i++){
57  kparam.ghostDim[i] = commDimPartitioned(i);
58  }
59 #ifdef MULTI_GPU
60  int ktype[8] = {
69  };
70 #endif
71 
72  for(int dir = 0;dir < 4; dir++){
73  for(int nu = 0; nu < 4; nu++){
74  if (nu != dir){
75 
76 #ifdef MULTI_GPU
77  //start of one call
78  for(int k=3; k >= 0 ;k--){
79  if(!commDimPartitioned(k)) continue;
80 
81  kparam.kernel_type = ktype[2*k];
82  siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
83  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
84  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
85  dir, nu,
86  act_path_coeff[2],
87  recon, prec, halfGridDim,
88  kparam, &stream[2*k]);
89 
90  exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);
91 
92  kparam.kernel_type = ktype[2*k+1];
93  siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
94  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
95  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
96  dir, nu,
97  act_path_coeff[2],
98  recon, prec, halfGridDim,
99  kparam, &stream[2*k+1]);
100  exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
101  }
102 #endif
104  siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
105  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
106  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
107  dir, nu,
108  act_path_coeff[2],
109  recon, prec, halfGridDim,
110  kparam, &stream[nStream-1]);
111 
112 #ifdef MULTI_GPU
113  for(int k=3; k >= 0 ;k--){
114  if(!commDimPartitioned(k)) continue;
115  exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);
116  exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
117  }
118  for(int k=3; k >= 0 ;k--){
119  if(!commDimPartitioned(k)) continue;
120  exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);
121  exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
122  }
123  for(int k=3; k >= 0 ;k--){
124  if(!commDimPartitioned(k)) continue;
125  cudaStreamSynchronize(stream[2*k]);
126  cudaStreamSynchronize(stream[2*k+1]);
127  }
128  cudaStreamSynchronize(stream[nStream-1]);
129 #endif
130  //end
131 
132  //start of one call
133  kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
134  if(act_path_coeff[5] != 0.0){
135  computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
136  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
137  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
138  (const void*)cudaStaple.Even_p(), (const void*)cudaStaple.Odd_p(),
139  dir, nu, 0,
140  act_path_coeff[5],
141  recon, prec, halfGridDim, kparam, &stream[nStream-1]);
142  }
143 
144 #ifdef MULTI_GPU
145  cudaStreamSynchronize(stream[nStream-1]);
146 #endif
147  //end
148 
149  for(int rho = 0; rho < 4; rho++){
150  if (rho != dir && rho != nu){
151 
152  //start of one call
153 #ifdef MULTI_GPU
154  for(int k=3; k >= 0 ;k--){
155  if(!commDimPartitioned(k)) continue;
156  kparam.kernel_type = ktype[2*k];
157  computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
158  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
159  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
160  (const void*)cudaStaple.Even_p(), (const void*)cudaStaple.Odd_p(),
161  dir, rho, 1,
162  act_path_coeff[3],
163  recon, prec, halfGridDim, kparam, &stream[2*k]);
164  exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]);
165  kparam.kernel_type = ktype[2*k+1];
166  computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
167  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
168  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
169  (const void*)cudaStaple.Even_p(), (const void*)cudaStaple.Odd_p(),
170  dir, rho, 1,
171  act_path_coeff[3],
172  recon, prec, halfGridDim, kparam, &stream[2*k+1]);
173  exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
174  }
175 #endif
176 
177  kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
178  computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
179  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
180  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
181  (const void*)cudaStaple.Even_p(), (const void*)cudaStaple.Odd_p(),
182  dir, rho, 1,
183  act_path_coeff[3],
184  recon, prec, halfGridDim, kparam, &stream[nStream-1]);
185 
186 #ifdef MULTI_GPU
187  for(int k=3; k >= 0 ;k--){
188  if(!commDimPartitioned(k)) continue;
189  exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]);
190  exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
191  }
192  for(int k=3; k >= 0 ;k--){
193  if(!commDimPartitioned(k)) continue;
194  exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]);
195  exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]);
196  }
197  for(int k=3; k >= 0 ;k--){
198  if(!commDimPartitioned(k)) continue;
199  cudaStreamSynchronize(stream[2*k]);
200  cudaStreamSynchronize(stream[2*k+1]);
201  }
202  cudaStreamSynchronize(stream[nStream-1]);
203 #endif
204  //end
205 
206 
207  for(int sig = 0; sig < 4; sig++){
208  if (sig != dir && sig != nu && sig != rho){
209 
210  //start of one call
211  kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
212  computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
213  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
214  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
215  (const void*)cudaStaple1.Even_p(), (const void*)cudaStaple1.Odd_p(),
216  dir, sig, 0,
217  act_path_coeff[4],
218  recon, prec, halfGridDim, kparam, &stream[nStream-1]);
219 
220  //end
221 
222  }
223  }//sig
224 
225 #ifdef MULTI_GPU
226  cudaStreamSynchronize(stream[nStream-1]);
227 #endif
228 
229  }
230  }//rho
231  }
232  }//nu
233  }//dir
234 
235 
236  cudaDeviceSynchronize();
237  checkCudaError();
238 
239  for(int i=0;i < nStream; i++){
240  cudaStreamDestroy(stream[i]);
241  }
242 
243  return;
244  }
245 
246 
247 
248  void
250  cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
251  QudaGaugeParam* param, double* act_path_coeff)
252  {
253 
254  dim3 blockDim(BLOCK_DIM, 1,1);
255 
256  int volume = (param->X[0])*(param->X[1])*(param->X[2])*(param->X[3]);
257  int Vh = volume/2;
258  dim3 halfGridDim(Vh/blockDim.x,1,1);
259  if(Vh % blockDim.x != 0){
260  halfGridDim.x +=1;
261  }
262 
263 
264  int volume_1g = (param->X[0]+2)*(param->X[1]+2)*(param->X[2]+2)*(param->X[3]+2);
265  int Vh_1g = volume_1g/2;
266  dim3 halfGridDim_1g(Vh_1g/blockDim.x,1,1);
267  if(Vh_1g % blockDim.x != 0){
268  halfGridDim_1g.x +=1;
269  }
270 
271  int volume_2g = (param->X[0]+4)*(param->X[1]+4)*(param->X[2]+4)*(param->X[3]+4);
272  int Vh_2g = volume_2g/2;
273  dim3 halfGridDim_2g(Vh_2g/blockDim.x,1,1);
274  if(Vh_2g % blockDim.x != 0){
275  halfGridDim_2g.x +=1;
276  }
277 
278  QudaPrecision prec = cudaSiteLink.Precision();
279  QudaReconstructType recon = cudaSiteLink.Reconstruct();
280 
281  if( ((param->X[0] % 2 != 0)
282  ||(param->X[1] % 2 != 0)
283  ||(param->X[2] % 2 != 0)
284  ||(param->X[3] % 2 != 0))
285  && (recon == QUDA_RECONSTRUCT_12)){
286  errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
287 
288  }
289 
290 
292  llfat_kernel_param_t kparam_1g;
293  llfat_kernel_param_t kparam_2g;
294 
295  kparam.threads= Vh;
296  kparam.halfGridDim = halfGridDim;
297  kparam.D1 = param->X[0];
298  kparam.D2 = param->X[1];
299  kparam.D3 = param->X[2];
300  kparam.D4 = param->X[3];
301  kparam.D1h = param->X[0]/2;
302  kparam.base_idx = 2;
303 
304  kparam_1g.threads= Vh_1g;
305  kparam_1g.halfGridDim = halfGridDim_1g;
306  kparam_1g.D1 = param->X[0] + 2;
307  kparam_1g.D2 = param->X[1] + 2;
308  kparam_1g.D3 = param->X[2] + 2;
309  kparam_1g.D4 = param->X[3] + 2;
310  kparam_1g.D1h = (param->X[0] + 2)/2;
311  kparam_1g.base_idx = 1;
312 
313  kparam_2g.threads= Vh_2g;
314  kparam_2g.halfGridDim = halfGridDim_2g;
315  kparam_2g.D1 = param->X[0] + 4;
316  kparam_2g.D2 = param->X[1] + 4;
317  kparam_2g.D3 = param->X[2] + 4;
318  kparam_2g.D4 = param->X[3] + 4;
319  kparam_2g.D1h = (param->X[0] + 4)/2;
320  kparam_2g.base_idx = 0;
321 
322  kparam_1g.blockDim = kparam_2g.blockDim = kparam.blockDim = blockDim;
323 
324  /*
325  {
326  static dim3 blocks[3]={{64, 1, 1}, {64,1,1}, {64,1,1}};
327  QudaVerbosity verbose = QUDA_DEBUG_VERBOSE;
328  TuneLinkFattening fatTune(cudaFatLink, cudaSiteLink, cudaStaple, cudaStaple1,
329  kparam, kparam_1g, verbose);
330  fatTune.BenchmarkMulti(blocks, 3);
331 
332  }
333  */
334 
335  llfatOneLinkKernel_ex(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
336  param, act_path_coeff, kparam);
337 
338  for(int dir = 0;dir < 4; dir++){
339  for(int nu = 0; nu < 4; nu++){
340  if (nu != dir){
341 
342 
343  siteComputeGenStapleParityKernel_ex((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
344  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
345  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
346  dir, nu,
347  act_path_coeff[2],
348  recon, prec, kparam_1g);
349 
350  if(act_path_coeff[5] != 0.0){
351  computeGenStapleFieldParityKernel_ex((void*)NULL, (void*)NULL,
352  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
353  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
354  (const void*)cudaStaple.Even_p(), (const void*)cudaStaple.Odd_p(),
355  dir, nu, 0,
356  act_path_coeff[5],
357  recon, prec, kparam);
358  }
359 
360  for(int rho = 0; rho < 4; rho++){
361  if (rho != dir && rho != nu){
362 
363  computeGenStapleFieldParityKernel_ex((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
364  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
365  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
366  (const void*)cudaStaple.Even_p(), (const void*)cudaStaple.Odd_p(),
367  dir, rho, 1,
368  act_path_coeff[3],
369  recon, prec, kparam_1g);
370 
371  for(int sig = 0; sig < 4; sig++){
372  if (sig != dir && sig != nu && sig != rho){
373 
374  computeGenStapleFieldParityKernel_ex((void*)NULL, (void*)NULL,
375  (const void*)cudaSiteLink.Even_p(), (const void*)cudaSiteLink.Odd_p(),
376  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
377  (const void*)cudaStaple1.Even_p(), (const void*)cudaStaple1.Odd_p(),
378  dir, sig, 0,
379  act_path_coeff[4],
380  recon, prec, kparam);
381 
382  }
383  }//sig
384  }
385  }//rho
386  }
387  }//nu
388  }//dir
389 
390 
391  cudaDeviceSynchronize();
392  checkCudaError();
393 
394  return;
395  }
396 
397 } // namespace quda
398 
399 #undef BLOCK_DIM