|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #ifndef _TWIST_QUDA_CUH 00002 #define _TWIST_QUDA_CUH 00003 00004 //action of the operator b*(1 + i*a*gamma5) 00005 //used also macros from io_spinor.h 00006 00007 __device__ float4 operator*(const float &x, const float4 &y) 00008 { 00009 float4 res; 00010 00011 res.x = x * y.x; 00012 res.y = x * y.y; 00013 res.z = x * y.z; 00014 res.w = x * y.w; 00015 00016 return res; 00017 } 00018 00019 00020 #define tmp0_re tmp0.x 00021 #define tmp0_im tmp0.y 00022 #define tmp1_re tmp1.x 00023 #define tmp1_im tmp1.y 00024 #define tmp2_re tmp2.x 00025 #define tmp2_im tmp2.y 00026 #define tmp3_re tmp3.x 00027 #define tmp3_im tmp3.y 00028 00029 #if (__CUDA_ARCH__ >= 130) 00030 __global__ void twistGamma5Kernel(double2 *spinor, float *null, double a, double b) 00031 { 00032 int sp_idx = blockIdx.x * blockDim.x + threadIdx.x; 00033 00034 double2 I0 = fetch_double2(spinorTexDouble, sp_idx + 0 * sp_stride); 00035 double2 I1 = fetch_double2(spinorTexDouble, sp_idx + 1 * sp_stride); 00036 double2 I2 = fetch_double2(spinorTexDouble, sp_idx + 2 * sp_stride); 00037 double2 I3 = fetch_double2(spinorTexDouble, sp_idx + 3 * sp_stride); 00038 double2 I4 = fetch_double2(spinorTexDouble, sp_idx + 4 * sp_stride); 00039 double2 I5 = fetch_double2(spinorTexDouble, sp_idx + 5 * sp_stride); 00040 double2 I6 = fetch_double2(spinorTexDouble, sp_idx + 6 * sp_stride); 00041 double2 I7 = fetch_double2(spinorTexDouble, sp_idx + 7 * sp_stride); 00042 double2 I8 = fetch_double2(spinorTexDouble, sp_idx + 8 * sp_stride); 00043 double2 I9 = fetch_double2(spinorTexDouble, sp_idx + 9 * sp_stride); 00044 double2 I10 = fetch_double2(spinorTexDouble, sp_idx + 10 * sp_stride); 00045 double2 I11 = fetch_double2(spinorTexDouble, sp_idx + 11 * sp_stride); 00046 00047 volatile double2 tmp0, tmp1, tmp2, tmp3; 00048 00049 //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor) 00050 00051 //get the 1st color component: 00052 00053 tmp0_re = I0.x - a * I6.y; 00054 tmp0_im = I0.y + a * I6.x; 00055 00056 tmp2_re = I6.x - a * I0.y; 00057 tmp2_im = I6.y + a * I0.x; 00058 00059 tmp1_re = I3.x - a * I9.y; 00060 tmp1_im = I3.y + a * I9.x; 00061 00062 tmp3_re = I9.x - a * I3.y; 00063 tmp3_im = I9.y + a * I3.x; 00064 00065 I0.x = b * tmp0_re; 00066 I0.y = b * tmp0_im; 00067 I3.x = b * tmp1_re; 00068 I3.y = b * tmp1_im; 00069 I6.x = b * tmp2_re; 00070 I6.y = b * tmp2_im; 00071 I9.x = b * tmp3_re; 00072 I9.y = b * tmp3_im; 00073 00074 //get the 2nd color component: 00075 00076 tmp0_re = I1.x - a * I7.y; 00077 tmp0_im = I1.y + a * I7.x; 00078 00079 tmp2_re = I7.x - a * I1.y; 00080 tmp2_im = I7.y + a * I1.x; 00081 00082 tmp1_re = I4.x - a * I10.y; 00083 tmp1_im = I4.y + a * I10.x; 00084 00085 tmp3_re = I10.x - a * I4.y; 00086 tmp3_im = I10.y + a * I4.x; 00087 00088 I1.x = b * tmp0_re; 00089 I1.y = b * tmp0_im; 00090 I4.x = b * tmp1_re; 00091 I4.y = b * tmp1_im; 00092 I7.x = b * tmp2_re; 00093 I7.y = b * tmp2_im; 00094 I10.x = b * tmp3_re; 00095 I10.y = b * tmp3_im; 00096 00097 //get the 3d color component: 00098 00099 tmp0_re = I2.x - a* I8.y; 00100 tmp0_im = I2.y + a* I8.x; 00101 00102 tmp2_re = I8.x - a* I2.y; 00103 tmp2_im = I8.y + a* I2.x; 00104 00105 tmp1_re = I5.x - a* I11.y; 00106 tmp1_im = I5.y + a* I11.x; 00107 00108 tmp3_re = I11.x - a* I5.y; 00109 tmp3_im = I11.y + a* I5.x; 00110 00111 I2.x = b * tmp0_re; 00112 I2.y = b * tmp0_im; 00113 I5.x = b * tmp1_re; 00114 I5.y = b * tmp1_im; 00115 I8.x = b * tmp2_re; 00116 I8.y = b * tmp2_im; 00117 I11.x = b * tmp3_re; 00118 I11.y = b * tmp3_im; 00119 00120 spinor[sp_idx + 0 * sp_stride] = I0; 00121 spinor[sp_idx + 1 * sp_stride] = I1; 00122 spinor[sp_idx + 2 * sp_stride] = I2; 00123 spinor[sp_idx + 3 * sp_stride] = I3; 00124 spinor[sp_idx + 4 * sp_stride] = I4; 00125 spinor[sp_idx + 5 * sp_stride] = I5; 00126 spinor[sp_idx + 6 * sp_stride] = I6; 00127 spinor[sp_idx + 7 * sp_stride] = I7; 00128 spinor[sp_idx + 8 * sp_stride] = I8; 00129 spinor[sp_idx + 9 * sp_stride] = I9; 00130 spinor[sp_idx + 10 * sp_stride] = I10; 00131 spinor[sp_idx + 11 * sp_stride] = I11; 00132 00133 return; 00134 } 00135 #endif // (__CUDA_ARCH__ >= 130) 00136 00137 #undef tmp0_re 00138 #undef tmp0_im 00139 #undef tmp1_re 00140 #undef tmp1_im 00141 #undef tmp2_re 00142 #undef tmp2_im 00143 #undef tmp3_re 00144 #undef tmp3_im 00145 00146 #define tmp0_re tmp0.x 00147 #define tmp0_im tmp0.y 00148 #define tmp1_re tmp0.z 00149 #define tmp1_im tmp0.w 00150 #define tmp2_re tmp1.x 00151 #define tmp2_im tmp1.y 00152 #define tmp3_re tmp1.z 00153 #define tmp3_im tmp1.w 00154 00155 __global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b) 00156 { 00157 int sp_idx = blockIdx.x * blockDim.x + threadIdx.x; 00158 00159 float4 I0 = tex1Dfetch(spinorTexSingle, sp_idx + 0 * sp_stride); 00160 float4 I1 = tex1Dfetch(spinorTexSingle, sp_idx + 1 * sp_stride); 00161 float4 I2 = tex1Dfetch(spinorTexSingle, sp_idx + 2 * sp_stride); 00162 float4 I3 = tex1Dfetch(spinorTexSingle, sp_idx + 3 * sp_stride); 00163 float4 I4 = tex1Dfetch(spinorTexSingle, sp_idx + 4 * sp_stride); 00164 float4 I5 = tex1Dfetch(spinorTexSingle, sp_idx + 5 * sp_stride); 00165 00166 volatile float4 tmp0, tmp1; 00167 00168 //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor) 00169 00170 //get the 1st color component:(I0.xy, I1.zw, I3.xy, I4.zw) 00171 00172 tmp0_re = I0.x - a * I3.y; 00173 tmp0_im = I0.y + a * I3.x; 00174 00175 tmp1_re = I1.z - a * I4.w; 00176 tmp1_im = I1.w + a * I4.z; 00177 00178 tmp2_re = I3.x - a * I0.y; 00179 tmp2_im = I3.y + a * I0.x; 00180 00181 tmp3_re = I4.z - a * I1.w; 00182 tmp3_im = I4.w + a * I1.z; 00183 00184 I0.x = b * tmp0_re; 00185 I0.y = b * tmp0_im; 00186 I1.z = b * tmp1_re; 00187 I1.w = b * tmp1_im; 00188 I3.x = b * tmp2_re; 00189 I3.y = b * tmp2_im; 00190 I4.z = b * tmp3_re; 00191 I4.w = b * tmp3_im; 00192 00193 //get the 2nd color component:(I0.zw, I2.xy, I3.zw, I5.xy) 00194 00195 tmp0_re = I0.z - a * I3.w; 00196 tmp0_im = I0.w + a * I3.z; 00197 00198 tmp1_re = I2.x - a * I5.y; 00199 tmp1_im = I2.y + a * I5.x; 00200 00201 tmp2_re = I3.z - a * I0.w; 00202 tmp2_im = I3.w + a * I0.z; 00203 00204 tmp3_re = I5.x - a * I2.y; 00205 tmp3_im = I5.y + a * I2.x; 00206 00207 I0.z = b * tmp0_re; 00208 I0.w = b * tmp0_im; 00209 I2.x = b * tmp1_re; 00210 I2.y = b * tmp1_im; 00211 I3.z = b * tmp2_re; 00212 I3.w = b * tmp2_im; 00213 I5.x = b * tmp3_re; 00214 I5.y = b * tmp3_im; 00215 00216 //get the 3d color component:(I1.xy, I2.zw, I4.xy, I5.zw) 00217 00218 tmp0_re = I1.x - a * I4.y; 00219 tmp0_im = I1.y + a * I4.x; 00220 00221 tmp1_re = I2.z - a * I5.w; 00222 tmp1_im = I2.w + a * I5.z; 00223 00224 tmp2_re = I4.x - a * I1.y; 00225 tmp2_im = I4.y + a * I1.x; 00226 00227 tmp3_re = I5.z - a * I2.w; 00228 tmp3_im = I5.w + a * I2.z; 00229 00230 I1.x = b * tmp0_re; 00231 I1.y = b * tmp0_im; 00232 I2.z = b * tmp1_re; 00233 I2.w = b * tmp1_im; 00234 I4.x = b * tmp2_re; 00235 I4.y = b * tmp2_im; 00236 I5.z = b * tmp3_re; 00237 I5.w = b * tmp3_im; 00238 00239 spinor[sp_idx + 0 * sp_stride] = I0; 00240 spinor[sp_idx + 1 * sp_stride] = I1; 00241 spinor[sp_idx + 2 * sp_stride] = I2; 00242 spinor[sp_idx + 3 * sp_stride] = I3; 00243 spinor[sp_idx + 4 * sp_stride] = I4; 00244 spinor[sp_idx + 5 * sp_stride] = I5; 00245 00246 return; 00247 } 00248 00249 00250 __global__ void twistGamma5Kernel(short4* spinor, float *spinorNorm, float a, float b) 00251 { 00252 int sp_idx = blockIdx.x * blockDim.x + threadIdx.x; 00253 00254 float4 I0 = tex1Dfetch(spinorTexHalf, sp_idx + 0 * sp_stride); 00255 float4 I1 = tex1Dfetch(spinorTexHalf, sp_idx + 1 * sp_stride); 00256 float4 I2 = tex1Dfetch(spinorTexHalf, sp_idx + 2 * sp_stride); 00257 float4 I3 = tex1Dfetch(spinorTexHalf, sp_idx + 3 * sp_stride); 00258 float4 I4 = tex1Dfetch(spinorTexHalf, sp_idx + 4 * sp_stride); 00259 float4 I5 = tex1Dfetch(spinorTexHalf, sp_idx + 5 * sp_stride); 00260 00261 float C = tex1Dfetch(spinorTexNorm, sp_idx); 00262 00263 I0 = C * I0; 00264 I1 = C * I1; 00265 I2 = C * I2; 00266 I3 = C * I3; 00267 I4 = C * I4; 00268 I5 = C * I5; 00269 00270 volatile float4 tmp0, tmp1; 00271 00272 //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor) 00273 00274 //get the 1st color component:(I0.xy, I1.zw, I3.xy, I4.zw) 00275 00276 tmp0_re = I0.x - a * I3.y; 00277 tmp0_im = I0.y + a * I3.x; 00278 00279 tmp1_re = I1.z - a * I4.w; 00280 tmp1_im = I1.w + a * I4.z; 00281 00282 tmp2_re = I3.x - a * I0.y; 00283 tmp2_im = I3.y + a * I0.x; 00284 00285 tmp3_re = I4.z - a * I1.w; 00286 tmp3_im = I4.w + a * I1.z; 00287 00288 I0.x = b * tmp0_re; 00289 I0.y = b * tmp0_im; 00290 I1.z = b * tmp1_re; 00291 I1.w = b * tmp1_im; 00292 I3.x = b * tmp2_re; 00293 I3.y = b * tmp2_im; 00294 I4.z = b * tmp3_re; 00295 I4.w = b * tmp3_im; 00296 00297 //get the 2nd color component:(I0.zw, I2.xy, I3.zw, I5.xy) 00298 00299 tmp0_re = I0.z - a * I3.w; 00300 tmp0_im = I0.w + a * I3.z; 00301 00302 tmp1_re = I2.x - a * I5.y; 00303 tmp1_im = I2.y + a * I5.x; 00304 00305 tmp2_re = I3.z - a * I0.w; 00306 tmp2_im = I3.w + a * I0.z; 00307 00308 tmp3_re = I5.x - a * I2.y; 00309 tmp3_im = I5.y + a * I2.x; 00310 00311 I0.z = b * tmp0_re; 00312 I0.w = b * tmp0_im; 00313 I2.x = b * tmp1_re; 00314 I2.y = b * tmp1_im; 00315 I3.z = b * tmp2_re; 00316 I3.w = b * tmp2_im; 00317 I5.x = b * tmp3_re; 00318 I5.y = b * tmp3_im; 00319 00320 //get the 3d color component:(I1.xy, I2.zw, I4.xy, I5.zw) 00321 00322 tmp0_re = I1.x - a * I4.y; 00323 tmp0_im = I1.y + a * I4.x; 00324 00325 tmp1_re = I2.z - a * I5.w; 00326 tmp1_im = I2.w + a * I5.z; 00327 00328 tmp2_re = I4.x - a * I1.y; 00329 tmp2_im = I4.y + a * I1.x; 00330 00331 tmp3_re = I5.z - a * I2.w; 00332 tmp3_im = I5.w + a * I2.z; 00333 00334 I1.x = b * tmp0_re; 00335 I1.y = b * tmp0_im; 00336 I2.z = b * tmp1_re; 00337 I2.w = b * tmp1_im; 00338 I4.x = b * tmp2_re; 00339 I4.y = b * tmp2_im; 00340 I5.z = b * tmp3_re; 00341 I5.w = b * tmp3_im; 00342 00343 00344 float c0 = fmaxf(fabsf(I0.x), fabsf(I0.y)); 00345 float c1 = fmaxf(fabsf(I0.z), fabsf(I0.w)); 00346 float c2 = fmaxf(fabsf(I1.x), fabsf(I1.y)); 00347 float c3 = fmaxf(fabsf(I1.z), fabsf(I1.w)); 00348 float c4 = fmaxf(fabsf(I2.x), fabsf(I2.y)); 00349 float c5 = fmaxf(fabsf(I2.z), fabsf(I2.w)); 00350 float c6 = fmaxf(fabsf(I3.x), fabsf(I3.y)); 00351 float c7 = fmaxf(fabsf(I3.z), fabsf(I3.w)); 00352 float c8 = fmaxf(fabsf(I4.x), fabsf(I4.y)); 00353 float c9 = fmaxf(fabsf(I4.z), fabsf(I4.w)); 00354 float c10 = fmaxf(fabsf(I5.x), fabsf(I5.y)); 00355 float c11 = fmaxf(fabsf(I5.z), fabsf(I5.w)); 00356 c0 = fmaxf(c0, c1); 00357 c1 = fmaxf(c2, c3); 00358 c2 = fmaxf(c4, c5); 00359 c3 = fmaxf(c6, c7); 00360 c4 = fmaxf(c8, c9); 00361 c5 = fmaxf(c10, c11); 00362 c0 = fmaxf(c0, c1); 00363 c1 = fmaxf(c2, c3); 00364 c2 = fmaxf(c4, c5); 00365 c0 = fmaxf(c0, c1); 00366 c0 = fmaxf(c0, c2); 00367 spinorNorm[sp_idx] = c0; 00368 float scale = __fdividef(MAX_SHORT, c0); 00369 00370 I0 = scale * I0; 00371 I1 = scale * I1; 00372 I2 = scale * I2; 00373 I3 = scale * I3; 00374 I4 = scale * I4; 00375 I5 = scale * I5; 00376 00377 spinor[sp_idx+0*(sp_stride)] = make_short4((short)I0.x, (short)I0.y, (short)I0.z, (short)I0.w); 00378 spinor[sp_idx+1*(sp_stride)] = make_short4((short)I1.x, (short)I1.y, (short)I1.z, (short)I1.w); 00379 spinor[sp_idx+2*(sp_stride)] = make_short4((short)I2.x, (short)I2.y, (short)I2.z, (short)I2.w); 00380 spinor[sp_idx+3*(sp_stride)] = make_short4((short)I3.x, (short)I3.y, (short)I3.z, (short)I3.w); 00381 spinor[sp_idx+4*(sp_stride)] = make_short4((short)I4.x, (short)I4.y, (short)I4.z, (short)I4.w); 00382 spinor[sp_idx+5*(sp_stride)] = make_short4((short)I5.x, (short)I5.y, (short)I5.z, (short)I5.w); 00383 00384 return; 00385 } 00386 00387 #undef tmp0_re 00388 #undef tmp0_im 00389 #undef tmp1_re 00390 #undef tmp1_im 00391 #undef tmp2_re 00392 #undef tmp2_im 00393 #undef tmp3_re 00394 #undef tmp3_im 00395 00396 #endif //_TWIST_QUDA_CUH 00397 00398
1.7.3