13 namespace gaugeforce {
18 using namespace gaugeforce;
21 #define GF_SITE_MATRIX_LOAD_TEX 1
24 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
25 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink0TexSingle_recon, dir, idx, var, gf.site_ga_stride)
26 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink1TexSingle_recon, dir, idx, var, gf.site_ga_stride)
28 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkEven, dir, idx, var, gf.site_ga_stride)
29 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkOdd, dir, idx, var, gf.site_ga_stride)
31 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
32 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
33 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4
35 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp12
37 #undef LOAD_EVEN_MATRIX
38 #undef LOAD_ODD_MATRIX
39 #undef LOAD_ANTI_HERMITIAN
40 #undef RECONSTRUCT_MATRIX
41 #undef DECLARE_LINK_VARS
43 #undef GAUGE_FORCE_KERN_NAME
46 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
47 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
48 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
50 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkEven, dir, idx, var, gf.site_ga_stride)
51 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkOdd, dir, idx, var, gf.site_ga_stride)
53 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
54 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var)
55 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
57 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp12
59 #undef LOAD_EVEN_MATRIX
60 #undef LOAD_ODD_MATRIX
61 #undef LOAD_ANTI_HERMITIAN
62 #undef RECONSTRUCT_MATRIX
63 #undef DECLARE_LINK_VARS
65 #undef GAUGE_FORCE_KERN_NAME
68 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
69 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink0TexSingle, dir, idx, var, gf.site_ga_stride)
70 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink1TexSingle, dir, idx, var, gf.site_ga_stride)
72 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
73 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
75 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var,gf.mom_ga_stride)
76 #define RECONSTRUCT_MATRIX(sign, var)
77 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
79 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp18
81 #undef LOAD_EVEN_MATRIX
82 #undef LOAD_ODD_MATRIX
83 #undef LOAD_ANTI_HERMITIAN
84 #undef RECONSTRUCT_MATRIX
85 #undef DECLARE_LINK_VARS
87 #undef GAUGE_FORCE_KERN_NAME
90 #if (GF_SITE_MATRIX_LOAD_TEX == 1)
91 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, gf.site_ga_stride)
92 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, gf.site_ga_stride)
94 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, gf.site_ga_stride)
95 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, gf.site_ga_stride)
97 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, gf.mom_ga_stride)
98 #define RECONSTRUCT_MATRIX(sign, var)
99 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8
100 #define N_IN_FLOATN 2
101 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp18
103 #undef LOAD_EVEN_MATRIX
104 #undef LOAD_ODD_MATRIX
105 #undef LOAD_ANTI_HERMITIAN
106 #undef RECONSTRUCT_MATRIX
107 #undef DECLARE_LINK_VARS
109 #undef GAUGE_FORCE_KERN_NAME
115 static int gauge_force_init_cuda_flag = 0;
116 if (gauge_force_init_cuda_flag){
119 gauge_force_init_cuda_flag=1;
123 int Vh = X[0]*X[1]*X[2]*X[3]/2;
127 int Vh_ex = (X[0]+4)*(X[1]+4)*(X[2]+4)*(X[3]+4)/2;
145 const int *input_path;
147 const double *path_coeff;
151 unsigned int sharedBytesPerThread()
const {
return 0; }
152 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
155 bool tuneGridDim()
const {
return false; }
156 unsigned int minThreads()
const {
return kparam.threads; }
160 const int *input_path,
const int *
length,
const double *path_coeff,
162 mom(mom), dir(dir), eb3(eb3), link(link), input_path(input_path), length(length),
163 path_coeff(path_coeff), num_paths(num_paths), kparam(kparam) {
198 parity_compute_gauge_force_kernel_dp18<0,double><<<tp.
grid, tp.
block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
200 (double2*)link.Even_p(), (double2*)link.Odd_p(),
201 input_path,
length, path_coeff,
203 parity_compute_gauge_force_kernel_dp18<1,double><<<tp.
grid, tp.
block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
205 (double2*)link.Even_p(), (double2*)link.Odd_p(),
206 input_path,
length, path_coeff,
210 parity_compute_gauge_force_kernel_dp12<0,double><<<tp.
grid, tp.
block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
212 (double2*)link.Even_p(), (double2*)link.Odd_p(),
213 input_path,
length, path_coeff,
215 parity_compute_gauge_force_kernel_dp12<1,double><<<tp.
grid, tp.
block>>>((double2*)mom.Even_p(), (double2*)mom.Odd_p(),
217 (double2*)link.Even_p(), (double2*)link.Odd_p(),
218 input_path,
length, path_coeff,
224 parity_compute_gauge_force_kernel_sp18<0,float><<<tp.
grid, tp.
block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
226 (float2*)link.Even_p(), (float2*)link.Odd_p(),
227 input_path,
length, path_coeff,
229 parity_compute_gauge_force_kernel_sp18<1,float><<<tp.
grid, tp.
block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
231 (float2*)link.Even_p(), (float2*)link.Odd_p(),
232 input_path,
length, path_coeff,
236 parity_compute_gauge_force_kernel_sp12<0,float><<<tp.
grid, tp.
block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
238 (float4*)link.Even_p(), (float4*)link.Odd_p(),
239 input_path,
length, path_coeff,
245 parity_compute_gauge_force_kernel_sp12<1,float><<<tp.
grid, tp.
block>>>((float2*)mom.Even_p(), (float2*)mom.Odd_p(),
247 (float4*)link.Even_p(), (float4*)link.Odd_p(),
248 input_path,
length, path_coeff,
257 long long flops()
const {
return 0; }
260 std::stringstream vol, aux;
261 vol << link.X()[0] <<
"x";
262 vol << link.X()[1] <<
"x";
263 vol << link.X()[2] <<
"x";
265 aux <<
"threads=" << link.Volume() <<
",prec=" << link.Precision();
266 aux <<
"stride=" << link.Stride() <<
",recon=" << link.Reconstruct();
267 aux <<
"dir=" << dir <<
"num_paths=" << num_paths;
268 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
276 const int num_paths,
const int max_length)
279 size_t bytes = num_paths*max_length*
sizeof(int);
282 cudaMemset(input_path_d, 0, bytes);
286 memset(input_path_h, 0, bytes);
288 for(
int i=0; i < num_paths; i++) {
289 for(
int j=0; j < length[i]; j++) {
290 input_path_h[i*max_length + j] = input_path[i][j];
294 cudaMemcpy(input_path_d, input_path_h, bytes, cudaMemcpyHostToDevice);
297 int* length_d = (
int *)
device_malloc(num_paths*
sizeof(
int));
298 cudaMemcpy(length_d, length, num_paths*
sizeof(
int), cudaMemcpyHostToDevice);
301 void* path_coeff_d =
device_malloc(num_paths*
sizeof(
double));
302 cudaMemcpy(path_coeff_d, path_coeff, num_paths*
sizeof(
double), cudaMemcpyHostToDevice);
305 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
309 for(
int i=0; i<4; i++) {
313 kparam.threads = volume/2;
315 GaugeForceCuda gaugeForce(cudaMom, dir, eb3, cudaSiteLink, input_path_d,
316 length_d, reinterpret_cast<double*>(path_coeff_d), num_paths, kparam);
330 int*
length,
double* path_coeff,
int num_paths,
int max_length)
332 for(
int dir=0; dir < 4; dir++){
334 length, path_coeff, num_paths, max_length);
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
__constant__ fat_force_const_t gf
texture< float4, 1, cudaReadModeElementType > siteLink1TexSingle_recon
texture< int4, 1 > siteLink1TexDouble
QudaPrecision Precision() const
texture< float2, 1, cudaReadModeElementType > siteLink0TexSingle
struct quda::kernel_param_s kernel_param_t
texture< float2, 1, cudaReadModeElementType > siteLink1TexSingle
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
void gauge_force_cuda(cudaGaugeField &cudaMom, double eb3, cudaGaugeField &cudaSiteLink, QudaGaugeParam *param, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
texture< float4, 1, cudaReadModeElementType > siteLink0TexSingle_recon
#define safe_malloc(size)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
void * memset(void *s, int c, size_t n)
void apply(const cudaStream_t &stream)
#define device_malloc(size)
GaugeForceCuda(cudaGaugeField &mom, const int dir, const double &eb3, const cudaGaugeField &link, const int *input_path, const int *length, const double *path_coeff, const int num_paths, const kernel_param_t &kparam)
texture< int4, 1 > siteLink0TexDouble
void gauge_force_init_cuda(QudaGaugeParam *param, int max_length)
virtual ~GaugeForceCuda()
void gauge_force_cuda_dir(cudaGaugeField &cudaMom, const int dir, const double eb3, const cudaGaugeField &cudaSiteLink, const QudaGaugeParam *param, int **input_path, const int *length, const double *path_coeff, const int num_paths, const int max_length)