11 #define GF_SITE_MATRIX_LOAD_TEX 1
14 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
15 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink0TexSingle_recon, dir, idx, var, gf.site_ga_stride)
16 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink1TexSingle_recon, dir, idx, var, gf.site_ga_stride)
18 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkEven, dir, idx, var, gf.site_ga_stride)
19 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkOdd, dir, idx, var, gf.site_ga_stride)
21 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
22 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
23 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4
25 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp12
27 #undef LOAD_EVEN_MATRIX
28 #undef LOAD_ODD_MATRIX
29 #undef LOAD_ANTI_HERMITIAN
30 #undef RECONSTRUCT_MATRIX
31 #undef DECLARE_LINK_VARS
33 #undef GAUGE_FORCE_KERN_NAME
36 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
37 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
38 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
40 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkEven, dir, idx, var, gf.site_ga_stride)
41 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkOdd, dir, idx, var, gf.site_ga_stride)
43 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
44 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
45 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
47 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp12
49 #undef LOAD_EVEN_MATRIX
50 #undef LOAD_ODD_MATRIX
51 #undef LOAD_ANTI_HERMITIAN
52 #undef RECONSTRUCT_MATRIX
53 #undef DECLARE_LINK_VARS
55 #undef GAUGE_FORCE_KERN_NAME
58 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
59 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink0TexSingle, dir, idx, var, gf.site_ga_stride)
60 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink1TexSingle, dir, idx, var, gf.site_ga_stride)
62 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
63 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
65 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var,gf.mom_ga_stride)
66 #define RECONSTRUCT_MATRIX(sign, var)
67 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
69 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp18
71 #undef LOAD_EVEN_MATRIX
72 #undef LOAD_ODD_MATRIX
73 #undef LOAD_ANTI_HERMITIAN
74 #undef RECONSTRUCT_MATRIX
75 #undef DECLARE_LINK_VARS
77 #undef GAUGE_FORCE_KERN_NAME
80 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
81 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
82 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
84 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
85 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
87 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
88 #define RECONSTRUCT_MATRIX(sign, var)
89 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
91 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp18
93 #undef LOAD_EVEN_MATRIX
94 #undef LOAD_ODD_MATRIX
95 #undef LOAD_ANTI_HERMITIAN
96 #undef RECONSTRUCT_MATRIX
97 #undef DECLARE_LINK_VARS
99 #undef GAUGE_FORCE_KERN_NAME
105 static int gauge_force_init_cuda_flag = 0;
106 if (gauge_force_init_cuda_flag){
109 gauge_force_init_cuda_flag=1;
113 int Vh = X[0]*X[1]*X[2]*X[3]/2;
117 int Vh_ex = (X[0]+4)*(X[1]+4)*(X[2]+4)*(X[3]+4)/2;
135 const int *input_path;
137 const void *path_coeff;
141 int sharedBytesPerThread()
const {
return 0; }
142 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
145 bool advanceGridDim(TuneParam &
param)
const {
return false; }
146 bool advanceBlockDim(TuneParam &
param)
const {
148 param.grid = dim3((kparam.
threads+param.block.x-1)/param.block.x, 1, 1);
154 const int *input_path,
const int *length,
const void *path_coeff,
156 mom(mom), dir(dir), eb3(eb3), link(link), input_path(input_path), length(length),
157 path_coeff(path_coeff), num_paths(num_paths), kparam(kparam) {
192 parity_compute_gauge_force_kernel_dp18<0><<<tp.
grid, tp.
block>>>((double2*)mom.
Even_p(), (double2*)mom.
Odd_p(),
195 input_path, length, (
double*)path_coeff,
197 parity_compute_gauge_force_kernel_dp18<1><<<tp.
grid, tp.
block>>>((double2*)mom.
Even_p(), (double2*)mom.
Odd_p(),
200 input_path, length, (
double*)path_coeff,
204 parity_compute_gauge_force_kernel_dp12<0><<<tp.
grid, tp.
block>>>((double2*)mom.
Even_p(), (double2*)mom.
Odd_p(),
207 input_path, length, (
double*)path_coeff,
209 parity_compute_gauge_force_kernel_dp12<1><<<tp.
grid, tp.
block>>>((double2*)mom.
Even_p(), (double2*)mom.
Odd_p(),
212 input_path, length, (
double*)path_coeff,
218 parity_compute_gauge_force_kernel_sp18<0><<<tp.
grid, tp.
block>>>((float2*)mom.
Even_p(), (float2*)mom.
Odd_p(),
221 input_path, length, (
float*)path_coeff,
223 parity_compute_gauge_force_kernel_sp18<1><<<tp.
grid, tp.
block>>>((float2*)mom.
Even_p(), (float2*)mom.
Odd_p(),
226 input_path, length, (
float*)path_coeff,
230 parity_compute_gauge_force_kernel_sp12<0><<<tp.
grid, tp.
block>>>((float2*)mom.
Even_p(), (float2*)mom.
Odd_p(),
233 input_path, length, (
float*)path_coeff,
239 parity_compute_gauge_force_kernel_sp12<1><<<tp.
grid, tp.
block>>>((float2*)mom.
Even_p(), (float2*)mom.
Odd_p(),
242 input_path, length, (
float*)path_coeff,
262 long long flops()
const {
return 0; }
265 std::stringstream vol, aux;
266 vol << link.
X()[0] <<
"x";
267 vol << link.
X()[1] <<
"x";
268 vol << link.
X()[2] <<
"x";
269 vol << link.
X()[3] <<
"x";
272 aux <<
"dir=" << dir <<
"num_paths=" << num_paths;
273 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
281 const int num_paths,
const int max_length)
284 size_t bytes = num_paths*max_length*
sizeof(int);
287 cudaMemset(input_path_d, 0, bytes);
291 memset(input_path_h, 0, bytes);
293 for(
int i=0; i < num_paths; i++) {
294 for(
int j=0; j < length[i]; j++) {
295 input_path_h[i*max_length + j] = input_path[i][j];
299 cudaMemcpy(input_path_d, input_path_h, bytes, cudaMemcpyHostToDevice);
302 int* length_d = (
int *)
device_malloc(num_paths*
sizeof(
int));
303 cudaMemcpy(length_d, length, num_paths*
sizeof(
int), cudaMemcpyHostToDevice);
308 cudaMemcpy(path_coeff_d, path_coeff, num_paths*gsize, cudaMemcpyHostToDevice);
311 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
315 for(
int i=0; i<4; i++) {
321 GaugeForceCuda gaugeForce(cudaMom, dir, eb3, cudaSiteLink, input_path_d,
322 length_d, path_coeff_d, num_paths, kparam);
336 int*
length,
void* path_coeff,
int num_paths,
int max_length)
340 length, path_coeff, num_paths, max_length);