QUDA v0.4.0
A library for QCD on GPUs
|
00001 #include <read_gauge.h> 00002 #include <gauge_field.h> 00003 00004 #include "gauge_force_quda.h" 00005 #ifdef MULTI_GPU 00006 #include "face_quda.h" 00007 #endif 00008 00009 00010 __constant__ int path_max_length; 00011 00012 #define GF_SITE_MATRIX_LOAD_TEX 1 00013 00014 //single precsison, 12-reconstruct 00015 #if (GF_SITE_MATRIX_LOAD_TEX == 1) 00016 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink0TexSingle_recon, dir, idx, var, site_ga_stride) 00017 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(siteLink1TexSingle_recon, dir, idx, var, site_ga_stride) 00018 #else 00019 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkEven, dir, idx, var, site_ga_stride) 00020 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE(linkOdd, dir, idx, var, site_ga_stride) 00021 #endif 00022 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, mom_ga_stride) 00023 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var) 00024 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4 00025 #define N_IN_FLOATN 4 00026 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp12 00027 #include "gauge_force_core.h" 00028 #undef LOAD_EVEN_MATRIX 00029 #undef LOAD_ODD_MATRIX 00030 #undef LOAD_ANTI_HERMITIAN 00031 #undef RECONSTRUCT_MATRIX 00032 #undef DECLARE_LINK_VARS 00033 #undef N_IN_FLOATN 00034 #undef GAUGE_FORCE_KERN_NAME 00035 00036 //double precsison, 12-reconstruct 00037 #if (GF_SITE_MATRIX_LOAD_TEX == 1) 00038 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, site_ga_stride) 00039 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, site_ga_stride) 00040 #else 00041 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkEven, dir, idx, var, site_ga_stride) 00042 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE(linkOdd, dir, idx, var, site_ga_stride) 00043 #endif 00044 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, mom_ga_stride) 00045 #define RECONSTRUCT_MATRIX(sign, var) RECONSTRUCT_LINK_12(sign,var) 00046 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8 00047 #define N_IN_FLOATN 2 00048 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp12 00049 #include "gauge_force_core.h" 00050 #undef LOAD_EVEN_MATRIX 00051 #undef LOAD_ODD_MATRIX 00052 #undef LOAD_ANTI_HERMITIAN 00053 #undef RECONSTRUCT_MATRIX 00054 #undef DECLARE_LINK_VARS 00055 #undef N_IN_FLOATN 00056 #undef GAUGE_FORCE_KERN_NAME 00057 00058 //single precision, 18-reconstruct 00059 #if (GF_SITE_MATRIX_LOAD_TEX == 1) 00060 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink0TexSingle, dir, idx, var, site_ga_stride) 00061 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX(siteLink1TexSingle, dir, idx, var, site_ga_stride) 00062 #else 00063 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, site_ga_stride) 00064 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, site_ga_stride) 00065 #endif 00066 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var,mom_ga_stride) 00067 #define RECONSTRUCT_MATRIX(sign, var) 00068 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8 00069 #define N_IN_FLOATN 2 00070 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_sp18 00071 #include "gauge_force_core.h" 00072 #undef LOAD_EVEN_MATRIX 00073 #undef LOAD_ODD_MATRIX 00074 #undef LOAD_ANTI_HERMITIAN 00075 #undef RECONSTRUCT_MATRIX 00076 #undef DECLARE_LINK_VARS 00077 #undef N_IN_FLOATN 00078 #undef GAUGE_FORCE_KERN_NAME 00079 00080 //double precision, 18-reconstruct 00081 #if (GF_SITE_MATRIX_LOAD_TEX == 1) 00082 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink0TexDouble, linkEven, dir, idx, var, site_ga_stride) 00083 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX(siteLink1TexDouble, linkOdd, dir, idx, var, site_ga_stride) 00084 #else 00085 #define LOAD_EVEN_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkEven, dir, idx, var, site_ga_stride) 00086 #define LOAD_ODD_MATRIX(dir, idx, var) LOAD_MATRIX_18(linkOdd, dir, idx, var, site_ga_stride) 00087 #endif 00088 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, mom_ga_stride) 00089 #define RECONSTRUCT_MATRIX(sign, var) 00090 #define DECLARE_LINK_VARS(var) FloatN var##0, var##1, var##2, var##3, var##4, var##5, var##6, var##7, var##8 00091 #define N_IN_FLOATN 2 00092 #define GAUGE_FORCE_KERN_NAME parity_compute_gauge_force_kernel_dp18 00093 #include "gauge_force_core.h" 00094 #undef LOAD_EVEN_MATRIX 00095 #undef LOAD_ODD_MATRIX 00096 #undef LOAD_ANTI_HERMITIAN 00097 #undef RECONSTRUCT_MATRIX 00098 #undef DECLARE_LINK_VARS 00099 #undef N_IN_FLOATN 00100 #undef GAUGE_FORCE_KERN_NAME 00101 00102 void 00103 gauge_force_init_cuda(QudaGaugeParam* param, int path_max_length) 00104 { 00105 00106 static int gauge_force_init_cuda_flag = 0; 00107 if (gauge_force_init_cuda_flag){ 00108 return; 00109 } 00110 gauge_force_init_cuda_flag=1; 00111 00112 00113 #ifdef MULTI_GPU 00114 int E1 = param->X[0] + 4; 00115 int E1h = E1/2; 00116 int E2 = param->X[1] + 4; 00117 int E3 = param->X[2] + 4; 00118 int E4 = param->X[3] + 4; 00119 int E2E1 =E2*E1; 00120 int E3E2E1=E3*E2*E1; 00121 int Vh_ex = E1*E2*E3*E4/2; 00122 00123 cudaMemcpyToSymbol("E1", &E1, sizeof(int)); 00124 cudaMemcpyToSymbol("E1h", &E1h, sizeof(int)); 00125 cudaMemcpyToSymbol("E2", &E2, sizeof(int)); 00126 cudaMemcpyToSymbol("E3", &E3, sizeof(int)); 00127 cudaMemcpyToSymbol("E4", &E4, sizeof(int)); 00128 cudaMemcpyToSymbol("E2E1", &E2E1, sizeof(int)); 00129 cudaMemcpyToSymbol("E3E2E1", &E3E2E1, sizeof(int)); 00130 00131 cudaMemcpyToSymbol("Vh_ex", &Vh_ex, sizeof(int)); 00132 #endif 00133 00134 int* X = param->X; 00135 int Vh = X[0]*X[1]*X[2]*X[3]/2; 00136 cudaMemcpyToSymbol("path_max_length", &path_max_length, sizeof(int)); 00137 00138 #ifdef MULTI_GPU 00139 int site_ga_stride = param->site_ga_pad + Vh_ex; 00140 #else 00141 int site_ga_stride = param->site_ga_pad + Vh; 00142 #endif 00143 00144 cudaMemcpyToSymbol("site_ga_stride", &site_ga_stride, sizeof(int)); 00145 int mom_ga_stride = param->mom_ga_pad + Vh; 00146 cudaMemcpyToSymbol("mom_ga_stride", &mom_ga_stride, sizeof(int)); 00147 00148 } 00149 00150 00151 void 00152 gauge_force_cuda_dir(cudaGaugeField& cudaMom, int dir, double eb3, cudaGaugeField& cudaSiteLink, 00153 QudaGaugeParam* param, int** input_path, 00154 int* length, void* path_coeff, int num_paths, int max_length) 00155 { 00156 int i, j; 00157 //input_path 00158 int bytes = num_paths*max_length* sizeof(int); 00159 int* input_path_d; 00160 00161 if(cudaMalloc((void**)&input_path_d, bytes) != cudaSuccess){ 00162 errorQuda("cudaMalloc failed for input_path_d\n"); 00163 } 00164 00165 cudaMemset(input_path_d, 0, bytes);checkCudaError(); 00166 00167 int* input_path_h = (int*)malloc(bytes); 00168 if (input_path_h == NULL){ 00169 printf("ERROR: malloc failed for input_path_h in function %s\n", __FUNCTION__); 00170 exit(1); 00171 } 00172 00173 memset(input_path_h, 0, bytes); 00174 for(i=0;i < num_paths;i++){ 00175 for(j=0; j < length[i]; j++){ 00176 input_path_h[i*max_length + j] =input_path[i][j]; 00177 } 00178 } 00179 00180 cudaMemcpy(input_path_d, input_path_h, bytes, cudaMemcpyHostToDevice); 00181 00182 //length 00183 int* length_d; 00184 if(cudaMalloc((void**)&length_d, num_paths*sizeof(int)) != cudaSuccess){ 00185 errorQuda("cudaMalloc failed for length_d\n"); 00186 } 00187 cudaMemcpy(length_d, length, num_paths*sizeof(int), cudaMemcpyHostToDevice); 00188 00189 //path_coeff 00190 int gsize = param->cuda_prec; 00191 void* path_coeff_d; 00192 if(cudaMalloc((void**)&path_coeff_d, num_paths*gsize) != cudaSuccess){ 00193 errorQuda("cudaMalloc failed for path_coeff_d\n"); 00194 } 00195 cudaMemcpy(path_coeff_d, path_coeff, num_paths*gsize, cudaMemcpyHostToDevice); 00196 00197 //compute the gauge forces 00198 int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3]; 00199 dim3 blockDim(BLOCK_DIM, 1,1); 00200 dim3 gridDim(volume/blockDim.x, 1, 1); 00201 dim3 halfGridDim(volume/(2*blockDim.x), 1, 1); 00202 00203 void* momEven = (void*)cudaMom.Even_p(); 00204 void* momOdd = (void*)cudaMom.Odd_p(); 00205 00206 void* linkEven = (void*)cudaSiteLink.Even_p(); 00207 void* linkOdd = (void*)cudaSiteLink.Odd_p(); 00208 00209 kernel_param_t kparam; 00210 #ifdef MULTI_GPU 00211 for(int i =0;i < 4;i++){ 00212 kparam.ghostDim[i] = commDimPartitioned(i); 00213 } 00214 #endif 00215 00216 kparam.threads = volume/2; 00217 00218 if(param->cuda_prec == QUDA_DOUBLE_PRECISION){ 00219 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()/2); 00220 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()/2); 00221 }else{ //QUDA_SINGLE_PRECISION 00222 if(param->reconstruct == QUDA_RECONSTRUCT_NO){ 00223 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()/2); 00224 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()/2); 00225 }else{//QUDA_RECONSTRUCT_12 00226 cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()/2); 00227 cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()/2); 00228 } 00229 } 00230 00231 if(param->cuda_prec == QUDA_DOUBLE_PRECISION){ 00232 if(param->reconstruct == QUDA_RECONSTRUCT_NO){ 00233 parity_compute_gauge_force_kernel_dp18<0><<<halfGridDim, blockDim>>>((double2*)momEven, (double2*)momOdd, 00234 dir, eb3, 00235 (double2*)linkEven, (double2*)linkOdd, 00236 input_path_d, length_d, (double*)path_coeff_d, 00237 num_paths, kparam); 00238 parity_compute_gauge_force_kernel_dp18<1><<<halfGridDim, blockDim>>>((double2*)momEven, (double2*)momOdd, 00239 dir, eb3, 00240 (double2*)linkEven, (double2*)linkOdd, 00241 input_path_d, length_d, (double*)path_coeff_d, 00242 num_paths, kparam); 00243 00244 }else{ //QUDA_RECONSTRUCT_12 00245 parity_compute_gauge_force_kernel_dp12<0><<<halfGridDim, blockDim>>>((double2*)momEven, (double2*)momOdd, 00246 dir, eb3, 00247 (double2*)linkEven, (double2*)linkOdd, 00248 input_path_d, length_d, (double*)path_coeff_d, 00249 num_paths, kparam); 00250 parity_compute_gauge_force_kernel_dp12<1><<<halfGridDim, blockDim>>>((double2*)momEven, (double2*)momOdd, 00251 dir, eb3, 00252 (double2*)linkEven, (double2*)linkOdd, 00253 input_path_d, length_d, (double*)path_coeff_d, 00254 num_paths, kparam); 00255 } 00256 }else{ //QUDA_SINGLE_PRECISION 00257 if(param->reconstruct == QUDA_RECONSTRUCT_NO){ 00258 00259 parity_compute_gauge_force_kernel_sp18<0><<<halfGridDim, blockDim>>>((float2*)momEven, (float2*)momOdd, 00260 dir, eb3, 00261 (float2*)linkEven, (float2*)linkOdd, 00262 input_path_d, length_d, (float*)path_coeff_d, 00263 num_paths, kparam); 00264 parity_compute_gauge_force_kernel_sp18<1><<<halfGridDim, blockDim>>>((float2*)momEven, (float2*)momOdd, 00265 dir, eb3, 00266 (float2*)linkEven, (float2*)linkOdd, 00267 input_path_d, length_d, (float*)path_coeff_d, 00268 num_paths, kparam); 00269 00270 }else{ //QUDA_RECONSTRUCT_12 00271 parity_compute_gauge_force_kernel_sp12<0><<<halfGridDim, blockDim>>>((float2*)momEven, (float2*)momOdd, 00272 dir, eb3, 00273 (float4*)linkEven, (float4*)linkOdd, 00274 input_path_d, length_d, (float*)path_coeff_d, 00275 num_paths, kparam); 00276 //odd 00277 /* The reason we do not switch the even/odd function input paramemters and the texture binding 00278 * is that we use the oddbit to decided where to load, in the kernel function 00279 */ 00280 parity_compute_gauge_force_kernel_sp12<1><<<halfGridDim, blockDim>>>((float2*)momEven, (float2*)momOdd, 00281 dir, eb3, 00282 (float4*)linkEven, (float4*)linkOdd, 00283 input_path_d, length_d, (float*)path_coeff_d, 00284 num_paths, kparam); 00285 } 00286 00287 } 00288 00289 00290 if(param->cuda_prec == QUDA_DOUBLE_PRECISION){ 00291 cudaUnbindTexture(siteLink0TexDouble); 00292 cudaUnbindTexture(siteLink1TexDouble); 00293 }else{ //QUDA_SINGLE_PRECISION 00294 if(param->reconstruct == QUDA_RECONSTRUCT_NO){ 00295 cudaUnbindTexture(siteLink0TexSingle); 00296 cudaUnbindTexture(siteLink1TexSingle); 00297 }else{//QUDA_RECONSTRUCT_12 00298 cudaUnbindTexture(siteLink0TexSingle_recon); 00299 cudaUnbindTexture(siteLink1TexSingle_recon); 00300 } 00301 } 00302 00303 00304 checkCudaError(); 00305 00306 cudaFree(input_path_d); checkCudaError(); 00307 free(input_path_h); 00308 cudaFree(length_d); 00309 cudaFree(path_coeff_d); 00310 00311 00312 00313 } 00314 00315 00316 void 00317 gauge_force_cuda(cudaGaugeField& cudaMom, double eb3, cudaGaugeField& cudaSiteLink, 00318 QudaGaugeParam* param, int*** input_path, 00319 int* length, void* path_coeff, int num_paths, int max_length) 00320 { 00321 00322 for(int dir=0; dir < 4; dir++){ 00323 gauge_force_cuda_dir(cudaMom, dir, eb3, cudaSiteLink, param, input_path[dir], 00324 length, path_coeff, num_paths, max_length); 00325 } 00326 00327 }