3 #include <cuda_runtime.h>
26 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
35 if( ((param->
X[0] % 2 != 0)
36 ||(param->
X[1] % 2 != 0)
37 ||(param->
X[2] % 2 != 0)
38 ||(param->
X[3] % 2 != 0))
40 errorQuda(
"12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
45 cudaStream_t
stream[nStream];
46 for(
int i = 0;i < nStream; i++){
47 cudaStreamCreate(&stream[i]);
52 param, act_path_coeff);
56 for(
int i=0;i < 4;i++){
73 for(
int nu = 0; nu < 4; nu++){
78 for(
int k=3; k >= 0 ;k--){
83 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
84 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
87 recon,
prec, halfGridDim,
92 kparam.kernel_type = ktype[2*k+1];
94 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
95 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
98 recon,
prec, halfGridDim,
105 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
106 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
109 recon,
prec, halfGridDim,
110 kparam, &stream[nStream-1]);
113 for(
int k=3; k >= 0 ;k--){
118 for(
int k=3; k >= 0 ;k--){
123 for(
int k=3; k >= 0 ;k--){
125 cudaStreamSynchronize(stream[2*k]);
126 cudaStreamSynchronize(stream[2*k+1]);
128 cudaStreamSynchronize(stream[nStream-1]);
134 if(act_path_coeff[5] != 0.0){
136 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
137 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
138 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
141 recon,
prec, halfGridDim,
kparam, &stream[nStream-1]);
145 cudaStreamSynchronize(stream[nStream-1]);
149 for(
int rho = 0; rho < 4; rho++){
150 if (rho !=
dir && rho != nu){
154 for(
int k=3; k >= 0 ;k--){
156 kparam.kernel_type = ktype[2*k];
158 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
159 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
160 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
163 recon,
prec, halfGridDim,
kparam, &stream[2*k]);
165 kparam.kernel_type = ktype[2*k+1];
167 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
168 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
169 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
172 recon,
prec, halfGridDim,
kparam, &stream[2*k+1]);
179 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
180 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
181 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
184 recon,
prec, halfGridDim,
kparam, &stream[nStream-1]);
187 for(
int k=3; k >= 0 ;k--){
192 for(
int k=3; k >= 0 ;k--){
197 for(
int k=3; k >= 0 ;k--){
199 cudaStreamSynchronize(stream[2*k]);
200 cudaStreamSynchronize(stream[2*k+1]);
202 cudaStreamSynchronize(stream[nStream-1]);
213 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
214 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
215 (
const void*)cudaStaple1.
Even_p(), (
const void*)cudaStaple1.
Odd_p(),
218 recon,
prec, halfGridDim,
kparam, &stream[nStream-1]);
226 cudaStreamSynchronize(stream[nStream-1]);
236 cudaDeviceSynchronize();
239 for(
int i=0;i < nStream; i++){
240 cudaStreamDestroy(stream[i]);
256 int volume = (param->
X[0])*(param->
X[1])*(param->
X[2])*(param->
X[3]);
258 dim3 halfGridDim(Vh/blockDim.x,1,1);
259 if(Vh % blockDim.x != 0){
264 int volume_1g = (param->
X[0]+2)*(param->
X[1]+2)*(param->
X[2]+2)*(param->
X[3]+2);
265 int Vh_1g = volume_1g/2;
266 dim3 halfGridDim_1g(Vh_1g/blockDim.x,1,1);
267 if(Vh_1g % blockDim.x != 0){
268 halfGridDim_1g.x +=1;
271 int volume_2g = (param->
X[0]+4)*(param->
X[1]+4)*(param->
X[2]+4)*(param->
X[3]+4);
272 int Vh_2g = volume_2g/2;
273 dim3 halfGridDim_2g(Vh_2g/blockDim.x,1,1);
274 if(Vh_2g % blockDim.x != 0){
275 halfGridDim_2g.x +=1;
281 if( ((param->
X[0] % 2 != 0)
282 ||(param->
X[1] % 2 != 0)
283 ||(param->
X[2] % 2 != 0)
284 ||(param->
X[3] % 2 != 0))
286 errorQuda(
"12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
297 kparam.
D1 = param->
X[0];
298 kparam.
D2 = param->
X[1];
299 kparam.
D3 = param->
X[2];
300 kparam.
D4 = param->
X[3];
301 kparam.
D1h = param->
X[0]/2;
306 kparam_1g.
D1 = param->
X[0] + 2;
307 kparam_1g.
D2 = param->
X[1] + 2;
308 kparam_1g.
D3 = param->
X[2] + 2;
309 kparam_1g.
D4 = param->
X[3] + 2;
310 kparam_1g.
D1h = (param->
X[0] + 2)/2;
315 kparam_2g.
D1 = param->
X[0] + 4;
316 kparam_2g.
D2 = param->
X[1] + 4;
317 kparam_2g.
D3 = param->
X[2] + 4;
318 kparam_2g.
D4 = param->
X[3] + 4;
319 kparam_2g.
D1h = (param->
X[0] + 4)/2;
336 param, act_path_coeff, kparam);
339 for(
int nu = 0; nu < 4; nu++){
344 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
345 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
348 recon,
prec, kparam_1g);
350 if(act_path_coeff[5] != 0.0){
352 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
353 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
354 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
360 for(
int rho = 0; rho < 4; rho++){
361 if (rho !=
dir && rho != nu){
364 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
365 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
366 (
const void*)cudaStaple.
Even_p(), (
const void*)cudaStaple.
Odd_p(),
369 recon,
prec, kparam_1g);
375 (
const void*)cudaSiteLink.
Even_p(), (
const void*)cudaSiteLink.
Odd_p(),
376 (
void*)cudaFatLink.
Even_p(), (
void*)cudaFatLink.
Odd_p(),
377 (
const void*)cudaStaple1.
Even_p(), (
const void*)cudaStaple1.
Odd_p(),
391 cudaDeviceSynchronize();