3 #include <cuda_runtime.h>
14 #define MIN_COEFF 1e-7
29 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
38 if( ((param->
X[0] % 2 != 0)
39 ||(param->
X[1] % 2 != 0)
40 ||(param->
X[2] % 2 != 0)
41 ||(param->
X[3] % 2 != 0))
43 errorQuda(
"12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
48 cudaStream_t
stream[nStream];
49 for(
int i = 0;i < nStream; i++){
50 cudaStreamCreate(&stream[i]);
57 kparam.
D1 = param->
X[0];
58 kparam.
D2 = param->
X[1];
59 kparam.
D3 = param->
X[2];
60 kparam.
D4 = param->
X[3];
61 kparam.
D1h = param->
X[0]/2;
63 for(
int i=0;i < 4;i++){
68 param, act_path_coeff);
71 errorQuda(
"Multi-GPU long-link calculation requires extended gauge field\n");
75 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
76 act_path_coeff[1], recon,
prec, halfGridDim,
kparam);
82 fabs(act_path_coeff[5]) <
MIN_COEFF)
return;
99 for(
int dir = 0;dir < 4; dir++){
100 for(
int nu = 0; nu < 4; nu++){
105 for(
int k=3; k >= 0 ;k--){
110 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
111 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
114 recon,
prec, halfGridDim,
119 kparam.kernel_type = ktype[2*k+1];
121 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
122 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
125 recon,
prec, halfGridDim,
132 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
133 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
136 recon,
prec, halfGridDim,
137 kparam, &stream[nStream-1]);
140 for(
int k=3; k >= 0 ;k--){
145 for(
int k=3; k >= 0 ;k--){
150 for(
int k=3; k >= 0 ;k--){
152 cudaStreamSynchronize(stream[2*k]);
153 cudaStreamSynchronize(stream[2*k+1]);
155 cudaStreamSynchronize(stream[nStream-1]);
161 if(act_path_coeff[5] != 0.0){
163 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
164 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
165 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
168 recon,
prec, halfGridDim,
kparam, &stream[nStream-1]);
172 cudaStreamSynchronize(stream[nStream-1]);
176 for(
int rho = 0; rho < 4; rho++){
177 if (rho != dir && rho != nu){
181 for(
int k=3; k >= 0 ;k--){
183 kparam.kernel_type = ktype[2*k];
185 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
186 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
187 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
190 recon,
prec, halfGridDim,
kparam, &stream[2*k]);
192 kparam.kernel_type = ktype[2*k+1];
194 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
195 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
196 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
199 recon,
prec, halfGridDim,
kparam, &stream[2*k+1]);
206 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
207 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
208 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
211 recon,
prec, halfGridDim,
kparam, &stream[nStream-1]);
214 for(
int k=3; k >= 0 ;k--){
219 for(
int k=3; k >= 0 ;k--){
224 for(
int k=3; k >= 0 ;k--){
226 cudaStreamSynchronize(stream[2*k]);
227 cudaStreamSynchronize(stream[2*k+1]);
229 cudaStreamSynchronize(stream[nStream-1]);
235 if (
sig != dir &&
sig != nu &&
sig != rho){
240 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
241 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
242 (
const void*)cudaStaple1.
Even_p(), (
const void*)cudaStaple1.
Odd_p(),
245 recon,
prec, halfGridDim,
kparam, &stream[nStream-1]);
253 cudaStreamSynchronize(stream[nStream-1]);
263 cudaDeviceSynchronize();
266 for(
int i=0;i < nStream; i++){
267 cudaStreamDestroy(stream[i]);
284 int volume = (param->
X[0])*(param->
X[1])*(param->
X[2])*(param->
X[3]);
286 dim3 halfGridDim((Vh+blockDim.x-1)/blockDim.x,1,1);
288 int volume_1g = (param->
X[0]+2)*(param->
X[1]+2)*(param->
X[2]+2)*(param->
X[3]+2);
289 int Vh_1g = volume_1g/2;
290 dim3 halfGridDim_1g((Vh_1g+blockDim.x-1)/blockDim.x,1,1);
292 int volume_2g = (param->
X[0]+4)*(param->
X[1]+4)*(param->
X[2]+4)*(param->
X[3]+4);
293 int Vh_2g = volume_2g/2;
294 dim3 halfGridDim_2g((Vh_2g+blockDim.x-1)/blockDim.x,1,1);
299 if( ((param->
X[0] % 2 != 0)
300 ||(param->
X[1] % 2 != 0)
301 ||(param->
X[2] % 2 != 0)
302 ||(param->
X[3] % 2 != 0))
304 errorQuda(
"12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
309 dim3 halfGridDim_ll((Vh+blockDim.x-1)/blockDim_ll.x,1,1);
318 kparam.
D1 = param->
X[0];
319 kparam.
D2 = param->
X[1];
320 kparam.
D3 = param->
X[2];
321 kparam.
D4 = param->
X[3];
322 kparam.
D1h = param->
X[0]/2;
327 kparam_ll.
D1 = param->
X[0];
328 kparam_ll.
D2 = param->
X[1];
329 kparam_ll.
D3 = param->
X[2];
330 kparam_ll.
D4 = param->
X[3];
331 kparam_ll.
D1h = param->
X[0]/2;
337 kparam_1g.
D1 = param->
X[0] + 2;
338 kparam_1g.
D2 = param->
X[1] + 2;
339 kparam_1g.
D3 = param->
X[2] + 2;
340 kparam_1g.
D4 = param->
X[3] + 2;
341 kparam_1g.
D1h = (param->
X[0] + 2)/2;
346 kparam_2g.
D1 = param->
X[0] + 4;
347 kparam_2g.
D2 = param->
X[1] + 4;
348 kparam_2g.
D3 = param->
X[2] + 4;
349 kparam_2g.
D4 = param->
X[3] + 4;
350 kparam_2g.
D1h = (param->
X[0] + 4)/2;
356 param, act_path_coeff, kparam);
360 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
365 if(fabs(act_path_coeff[2]) <
MIN_COEFF &&
368 fabs(act_path_coeff[5]) <
MIN_COEFF)
return;
373 for(
int dir = 0;dir < 4; dir++){
374 for(
int nu = 0; nu < 4; nu++){
379 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
380 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
383 recon,
prec, kparam_1g);
385 if(act_path_coeff[5] != 0.0){
387 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
388 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
389 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
395 for(
int rho = 0; rho < 4; rho++){
396 if (rho != dir && rho != nu){
399 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
400 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
401 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
404 recon,
prec, kparam_1g);
409 if (
sig != dir &&
sig != nu &&
sig != rho){
412 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
413 (
void*)cudaFatLink->
Even_p(), (
void*)cudaFatLink->
Odd_p(),
414 (
const void*)cudaStaple1.
Even_p(), (
const void*)cudaStaple1.
Odd_p(),
429 cudaDeviceSynchronize();
#define LLFAT_EXTERIOR_KERNEL_BACK_X
void llfat_cuda(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
enum QudaPrecision_s QudaPrecision
void exchange_gpu_staple_start(int *X, void *_cudaStaple, int dir, int whichway, cudaStream_t *stream)
int commDimPartitioned(int dir)
#define LLFAT_EXTERIOR_KERNEL_FWD_X
void exchange_gpu_staple_wait(int *X, void *_cudaStaple, int dir, int whichway, cudaStream_t *stream)
void computeLongLinkCuda(void *outEven, void *outOdd, const void *const inEven, const void *const inOdd, double coeff, QudaReconstructType recon, QudaPrecision prec, dim3 halfGridDim, llfat_kernel_param_t kparam)
#define LLFAT_EXTERIOR_KERNEL_FWD_Z
#define LLFAT_EXTERIOR_KERNEL_BACK_T
#define LLFAT_EXTERIOR_KERNEL_FWD_Y
QudaPrecision Precision() const
#define LLFAT_INTERIOR_KERNEL
void siteComputeGenStapleParityKernel_ex(void *staple_even, void *staple_odd, const void *sitelink_even, const void *sitelink_odd, void *fatlink_even, void *fatlink_odd, int mu, int nu, double mycoeff, QudaReconstructType recon, QudaPrecision prec, llfat_kernel_param_t kparam)
void llfatOneLinkKernel(cudaGaugeField &cudaFatLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
void computeGenStapleFieldParityKernel_ex(void *staple_even, void *staple_odd, const void *sitelink_even, const void *sitelink_odd, void *fatlink_even, void *fatlink_odd, const void *mulink_even, const void *mulink_odd, int mu, int nu, int save_staple, double mycoeff, QudaReconstructType recon, QudaPrecision prec, llfat_kernel_param_t kparam)
void llfat_cuda_ex(cudaGaugeField *cudaFatLink, cudaGaugeField *cudaLongLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff)
#define LLFAT_EXTERIOR_KERNEL_BACK_Y
cudaGaugeField * cudaFatLink
void exchange_gpu_staple_comms(int *X, void *_cudaStaple, int dir, int whichway, cudaStream_t *stream)
QudaReconstructType Reconstruct() const
void computeGenStapleFieldParityKernel(void *staple_even, void *staple_odd, const void *sitelink_even, const void *sitelink_odd, void *fatlink_even, void *fatlink_odd, const void *mulink_even, const void *mulink_odd, int mu, int nu, int save_staple, double mycoeff, QudaReconstructType recon, QudaPrecision prec, dim3 halfGridDim, llfat_kernel_param_t kparam, cudaStream_t *stream)
void siteComputeGenStapleParityKernel(void *staple_even, void *staple_odd, const void *sitelink_even, const void *sitelink_odd, void *fatlink_even, void *fatlink_odd, int mu, int nu, double mycoeff, QudaReconstructType recon, QudaPrecision prec, dim3 halfGridDim, llfat_kernel_param_t kparam, cudaStream_t *stream)
#define LLFAT_EXTERIOR_KERNEL_BACK_Z
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
#define LLFAT_EXTERIOR_KERNEL_FWD_T
enum QudaReconstructType_s QudaReconstructType
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
void llfatOneLinkKernel_ex(cudaGaugeField &cudaFatLink, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaStaple, cudaGaugeField &cudaStaple1, QudaGaugeParam *param, double *act_path_coeff, llfat_kernel_param_t kparam)