QUDA v0.4.0
A library for QCD on GPUs
|
00001 #ifndef _TWIST_QUDA_CUH 00002 #define _TWIST_QUDA_CUH 00003 00004 //action of the operator b*(1 + i*a*gamma5) 00005 //used also macros from io_spinor.h 00006 00007 __device__ float4 operator*(const float &x, const float4 &y) 00008 { 00009 float4 res; 00010 00011 res.x = x * y.x; 00012 res.y = x * y.y; 00013 res.z = x * y.z; 00014 res.w = x * y.w; 00015 00016 return res; 00017 } 00018 00019 00020 #define tmp0_re tmp0.x 00021 #define tmp0_im tmp0.y 00022 #define tmp1_re tmp1.x 00023 #define tmp1_im tmp1.y 00024 #define tmp2_re tmp2.x 00025 #define tmp2_im tmp2.y 00026 #define tmp3_re tmp3.x 00027 #define tmp3_im tmp3.y 00028 00029 #if (__COMPUTE_CAPABILITY__ >= 130) 00030 __global__ void twistGamma5Kernel(double2 *spinor, float *null, double a, double b, 00031 const double2 *in, const float *null2, DslashParam param) 00032 { 00033 00034 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00035 if (sid >= param.threads) return; 00036 00037 #ifndef FERMI_NO_DBLE_TEX 00038 double2 I0 = fetch_double2(spinorTexDouble, sid + 0 * sp_stride); 00039 double2 I1 = fetch_double2(spinorTexDouble, sid + 1 * sp_stride); 00040 double2 I2 = fetch_double2(spinorTexDouble, sid + 2 * sp_stride); 00041 double2 I3 = fetch_double2(spinorTexDouble, sid + 3 * sp_stride); 00042 double2 I4 = fetch_double2(spinorTexDouble, sid + 4 * sp_stride); 00043 double2 I5 = fetch_double2(spinorTexDouble, sid + 5 * sp_stride); 00044 double2 I6 = fetch_double2(spinorTexDouble, sid + 6 * sp_stride); 00045 double2 I7 = fetch_double2(spinorTexDouble, sid + 7 * sp_stride); 00046 double2 I8 = fetch_double2(spinorTexDouble, sid + 8 * sp_stride); 00047 double2 I9 = fetch_double2(spinorTexDouble, sid + 9 * sp_stride); 00048 double2 I10 = fetch_double2(spinorTexDouble, sid + 10 * sp_stride); 00049 double2 I11 = fetch_double2(spinorTexDouble, sid + 11 * sp_stride); 00050 #else 00051 double2 I0 = in[sid + 0 * sp_stride]; 00052 double2 I1 = in[sid + 1 * sp_stride]; 00053 double2 I2 = in[sid + 2 * sp_stride]; 00054 double2 I3 = in[sid + 3 * sp_stride]; 00055 double2 I4 = in[sid + 4 * sp_stride]; 00056 double2 I5 = in[sid + 5 * sp_stride]; 00057 double2 I6 = in[sid + 6 * sp_stride]; 00058 double2 I7 = in[sid + 7 * sp_stride]; 00059 double2 I8 = in[sid + 8 * sp_stride]; 00060 double2 I9 = in[sid + 9 * sp_stride]; 00061 double2 I10 = in[sid + 10 * sp_stride]; 00062 double2 I11 = in[sid + 11 * sp_stride]; 00063 #endif 00064 00065 volatile double2 tmp0, tmp1, tmp2, tmp3; 00066 00067 //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor) 00068 00069 //get the 1st color component: 00070 00071 tmp0_re = I0.x - a * I6.y; 00072 tmp0_im = I0.y + a * I6.x; 00073 00074 tmp2_re = I6.x - a * I0.y; 00075 tmp2_im = I6.y + a * I0.x; 00076 00077 tmp1_re = I3.x - a * I9.y; 00078 tmp1_im = I3.y + a * I9.x; 00079 00080 tmp3_re = I9.x - a * I3.y; 00081 tmp3_im = I9.y + a * I3.x; 00082 00083 I0.x = b * tmp0_re; 00084 I0.y = b * tmp0_im; 00085 I3.x = b * tmp1_re; 00086 I3.y = b * tmp1_im; 00087 I6.x = b * tmp2_re; 00088 I6.y = b * tmp2_im; 00089 I9.x = b * tmp3_re; 00090 I9.y = b * tmp3_im; 00091 00092 //get the 2nd color component: 00093 00094 tmp0_re = I1.x - a * I7.y; 00095 tmp0_im = I1.y + a * I7.x; 00096 00097 tmp2_re = I7.x - a * I1.y; 00098 tmp2_im = I7.y + a * I1.x; 00099 00100 tmp1_re = I4.x - a * I10.y; 00101 tmp1_im = I4.y + a * I10.x; 00102 00103 tmp3_re = I10.x - a * I4.y; 00104 tmp3_im = I10.y + a * I4.x; 00105 00106 I1.x = b * tmp0_re; 00107 I1.y = b * tmp0_im; 00108 I4.x = b * tmp1_re; 00109 I4.y = b * tmp1_im; 00110 I7.x = b * tmp2_re; 00111 I7.y = b * tmp2_im; 00112 I10.x = b * tmp3_re; 00113 I10.y = b * tmp3_im; 00114 00115 //get the 3d color component: 00116 00117 tmp0_re = I2.x - a* I8.y; 00118 tmp0_im = I2.y + a* I8.x; 00119 00120 tmp2_re = I8.x - a* I2.y; 00121 tmp2_im = I8.y + a* I2.x; 00122 00123 tmp1_re = I5.x - a* I11.y; 00124 tmp1_im = I5.y + a* I11.x; 00125 00126 tmp3_re = I11.x - a* I5.y; 00127 tmp3_im = I11.y + a* I5.x; 00128 00129 I2.x = b * tmp0_re; 00130 I2.y = b * tmp0_im; 00131 I5.x = b * tmp1_re; 00132 I5.y = b * tmp1_im; 00133 I8.x = b * tmp2_re; 00134 I8.y = b * tmp2_im; 00135 I11.x = b * tmp3_re; 00136 I11.y = b * tmp3_im; 00137 00138 spinor[sid + 0 * sp_stride] = I0; 00139 spinor[sid + 1 * sp_stride] = I1; 00140 spinor[sid + 2 * sp_stride] = I2; 00141 spinor[sid + 3 * sp_stride] = I3; 00142 spinor[sid + 4 * sp_stride] = I4; 00143 spinor[sid + 5 * sp_stride] = I5; 00144 spinor[sid + 6 * sp_stride] = I6; 00145 spinor[sid + 7 * sp_stride] = I7; 00146 spinor[sid + 8 * sp_stride] = I8; 00147 spinor[sid + 9 * sp_stride] = I9; 00148 spinor[sid + 10 * sp_stride] = I10; 00149 spinor[sid + 11 * sp_stride] = I11; 00150 00151 return; 00152 } 00153 #endif // (__COMPUTE_CAPABILITY__ >= 130) 00154 00155 #undef tmp0_re 00156 #undef tmp0_im 00157 #undef tmp1_re 00158 #undef tmp1_im 00159 #undef tmp2_re 00160 #undef tmp2_im 00161 #undef tmp3_re 00162 #undef tmp3_im 00163 00164 #define tmp0_re tmp0.x 00165 #define tmp0_im tmp0.y 00166 #define tmp1_re tmp0.z 00167 #define tmp1_im tmp0.w 00168 #define tmp2_re tmp1.x 00169 #define tmp2_im tmp1.y 00170 #define tmp3_re tmp1.z 00171 #define tmp3_im tmp1.w 00172 00173 __global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b, 00174 const float4 *in, const float *null2, DslashParam param) 00175 { 00176 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00177 if (sid >= param.threads) return; 00178 00179 float4 I0 = tex1Dfetch(spinorTexSingle, sid + 0 * sp_stride); 00180 float4 I1 = tex1Dfetch(spinorTexSingle, sid + 1 * sp_stride); 00181 float4 I2 = tex1Dfetch(spinorTexSingle, sid + 2 * sp_stride); 00182 float4 I3 = tex1Dfetch(spinorTexSingle, sid + 3 * sp_stride); 00183 float4 I4 = tex1Dfetch(spinorTexSingle, sid + 4 * sp_stride); 00184 float4 I5 = tex1Dfetch(spinorTexSingle, sid + 5 * sp_stride); 00185 00186 volatile float4 tmp0, tmp1; 00187 00188 //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor) 00189 00190 //get the 1st color component:(I0.xy, I1.zw, I3.xy, I4.zw) 00191 00192 tmp0_re = I0.x - a * I3.y; 00193 tmp0_im = I0.y + a * I3.x; 00194 00195 tmp1_re = I1.z - a * I4.w; 00196 tmp1_im = I1.w + a * I4.z; 00197 00198 tmp2_re = I3.x - a * I0.y; 00199 tmp2_im = I3.y + a * I0.x; 00200 00201 tmp3_re = I4.z - a * I1.w; 00202 tmp3_im = I4.w + a * I1.z; 00203 00204 I0.x = b * tmp0_re; 00205 I0.y = b * tmp0_im; 00206 I1.z = b * tmp1_re; 00207 I1.w = b * tmp1_im; 00208 I3.x = b * tmp2_re; 00209 I3.y = b * tmp2_im; 00210 I4.z = b * tmp3_re; 00211 I4.w = b * tmp3_im; 00212 00213 //get the 2nd color component:(I0.zw, I2.xy, I3.zw, I5.xy) 00214 00215 tmp0_re = I0.z - a * I3.w; 00216 tmp0_im = I0.w + a * I3.z; 00217 00218 tmp1_re = I2.x - a * I5.y; 00219 tmp1_im = I2.y + a * I5.x; 00220 00221 tmp2_re = I3.z - a * I0.w; 00222 tmp2_im = I3.w + a * I0.z; 00223 00224 tmp3_re = I5.x - a * I2.y; 00225 tmp3_im = I5.y + a * I2.x; 00226 00227 I0.z = b * tmp0_re; 00228 I0.w = b * tmp0_im; 00229 I2.x = b * tmp1_re; 00230 I2.y = b * tmp1_im; 00231 I3.z = b * tmp2_re; 00232 I3.w = b * tmp2_im; 00233 I5.x = b * tmp3_re; 00234 I5.y = b * tmp3_im; 00235 00236 //get the 3d color component:(I1.xy, I2.zw, I4.xy, I5.zw) 00237 00238 tmp0_re = I1.x - a * I4.y; 00239 tmp0_im = I1.y + a * I4.x; 00240 00241 tmp1_re = I2.z - a * I5.w; 00242 tmp1_im = I2.w + a * I5.z; 00243 00244 tmp2_re = I4.x - a * I1.y; 00245 tmp2_im = I4.y + a * I1.x; 00246 00247 tmp3_re = I5.z - a * I2.w; 00248 tmp3_im = I5.w + a * I2.z; 00249 00250 I1.x = b * tmp0_re; 00251 I1.y = b * tmp0_im; 00252 I2.z = b * tmp1_re; 00253 I2.w = b * tmp1_im; 00254 I4.x = b * tmp2_re; 00255 I4.y = b * tmp2_im; 00256 I5.z = b * tmp3_re; 00257 I5.w = b * tmp3_im; 00258 00259 spinor[sid + 0 * sp_stride] = I0; 00260 spinor[sid + 1 * sp_stride] = I1; 00261 spinor[sid + 2 * sp_stride] = I2; 00262 spinor[sid + 3 * sp_stride] = I3; 00263 spinor[sid + 4 * sp_stride] = I4; 00264 spinor[sid + 5 * sp_stride] = I5; 00265 00266 return; 00267 } 00268 00269 00270 __global__ void twistGamma5Kernel(short4* spinor, float *spinorNorm, float a, float b, 00271 const short4 *in, const float *inNorm, DslashParam param) 00272 { 00273 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00274 if (sid >= param.threads) return; 00275 00276 float4 I0 = tex1Dfetch(spinorTexHalf, sid + 0 * sp_stride); 00277 float4 I1 = tex1Dfetch(spinorTexHalf, sid + 1 * sp_stride); 00278 float4 I2 = tex1Dfetch(spinorTexHalf, sid + 2 * sp_stride); 00279 float4 I3 = tex1Dfetch(spinorTexHalf, sid + 3 * sp_stride); 00280 float4 I4 = tex1Dfetch(spinorTexHalf, sid + 4 * sp_stride); 00281 float4 I5 = tex1Dfetch(spinorTexHalf, sid + 5 * sp_stride); 00282 00283 float C = tex1Dfetch(spinorTexHalfNorm, sid); 00284 00285 I0 = C * I0; 00286 I1 = C * I1; 00287 I2 = C * I2; 00288 I3 = C * I3; 00289 I4 = C * I4; 00290 I5 = C * I5; 00291 00292 volatile float4 tmp0, tmp1; 00293 00294 //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor) 00295 00296 //get the 1st color component:(I0.xy, I1.zw, I3.xy, I4.zw) 00297 00298 tmp0_re = I0.x - a * I3.y; 00299 tmp0_im = I0.y + a * I3.x; 00300 00301 tmp1_re = I1.z - a * I4.w; 00302 tmp1_im = I1.w + a * I4.z; 00303 00304 tmp2_re = I3.x - a * I0.y; 00305 tmp2_im = I3.y + a * I0.x; 00306 00307 tmp3_re = I4.z - a * I1.w; 00308 tmp3_im = I4.w + a * I1.z; 00309 00310 I0.x = b * tmp0_re; 00311 I0.y = b * tmp0_im; 00312 I1.z = b * tmp1_re; 00313 I1.w = b * tmp1_im; 00314 I3.x = b * tmp2_re; 00315 I3.y = b * tmp2_im; 00316 I4.z = b * tmp3_re; 00317 I4.w = b * tmp3_im; 00318 00319 //get the 2nd color component:(I0.zw, I2.xy, I3.zw, I5.xy) 00320 00321 tmp0_re = I0.z - a * I3.w; 00322 tmp0_im = I0.w + a * I3.z; 00323 00324 tmp1_re = I2.x - a * I5.y; 00325 tmp1_im = I2.y + a * I5.x; 00326 00327 tmp2_re = I3.z - a * I0.w; 00328 tmp2_im = I3.w + a * I0.z; 00329 00330 tmp3_re = I5.x - a * I2.y; 00331 tmp3_im = I5.y + a * I2.x; 00332 00333 I0.z = b * tmp0_re; 00334 I0.w = b * tmp0_im; 00335 I2.x = b * tmp1_re; 00336 I2.y = b * tmp1_im; 00337 I3.z = b * tmp2_re; 00338 I3.w = b * tmp2_im; 00339 I5.x = b * tmp3_re; 00340 I5.y = b * tmp3_im; 00341 00342 //get the 3d color component:(I1.xy, I2.zw, I4.xy, I5.zw) 00343 00344 tmp0_re = I1.x - a * I4.y; 00345 tmp0_im = I1.y + a * I4.x; 00346 00347 tmp1_re = I2.z - a * I5.w; 00348 tmp1_im = I2.w + a * I5.z; 00349 00350 tmp2_re = I4.x - a * I1.y; 00351 tmp2_im = I4.y + a * I1.x; 00352 00353 tmp3_re = I5.z - a * I2.w; 00354 tmp3_im = I5.w + a * I2.z; 00355 00356 I1.x = b * tmp0_re; 00357 I1.y = b * tmp0_im; 00358 I2.z = b * tmp1_re; 00359 I2.w = b * tmp1_im; 00360 I4.x = b * tmp2_re; 00361 I4.y = b * tmp2_im; 00362 I5.z = b * tmp3_re; 00363 I5.w = b * tmp3_im; 00364 00365 00366 float c0 = fmaxf(fabsf(I0.x), fabsf(I0.y)); 00367 float c1 = fmaxf(fabsf(I0.z), fabsf(I0.w)); 00368 float c2 = fmaxf(fabsf(I1.x), fabsf(I1.y)); 00369 float c3 = fmaxf(fabsf(I1.z), fabsf(I1.w)); 00370 float c4 = fmaxf(fabsf(I2.x), fabsf(I2.y)); 00371 float c5 = fmaxf(fabsf(I2.z), fabsf(I2.w)); 00372 float c6 = fmaxf(fabsf(I3.x), fabsf(I3.y)); 00373 float c7 = fmaxf(fabsf(I3.z), fabsf(I3.w)); 00374 float c8 = fmaxf(fabsf(I4.x), fabsf(I4.y)); 00375 float c9 = fmaxf(fabsf(I4.z), fabsf(I4.w)); 00376 float c10 = fmaxf(fabsf(I5.x), fabsf(I5.y)); 00377 float c11 = fmaxf(fabsf(I5.z), fabsf(I5.w)); 00378 c0 = fmaxf(c0, c1); 00379 c1 = fmaxf(c2, c3); 00380 c2 = fmaxf(c4, c5); 00381 c3 = fmaxf(c6, c7); 00382 c4 = fmaxf(c8, c9); 00383 c5 = fmaxf(c10, c11); 00384 c0 = fmaxf(c0, c1); 00385 c1 = fmaxf(c2, c3); 00386 c2 = fmaxf(c4, c5); 00387 c0 = fmaxf(c0, c1); 00388 c0 = fmaxf(c0, c2); 00389 spinorNorm[sid] = c0; 00390 float scale = __fdividef(MAX_SHORT, c0); 00391 00392 I0 = scale * I0; 00393 I1 = scale * I1; 00394 I2 = scale * I2; 00395 I3 = scale * I3; 00396 I4 = scale * I4; 00397 I5 = scale * I5; 00398 00399 spinor[sid+0*(sp_stride)] = make_short4((short)I0.x, (short)I0.y, (short)I0.z, (short)I0.w); 00400 spinor[sid+1*(sp_stride)] = make_short4((short)I1.x, (short)I1.y, (short)I1.z, (short)I1.w); 00401 spinor[sid+2*(sp_stride)] = make_short4((short)I2.x, (short)I2.y, (short)I2.z, (short)I2.w); 00402 spinor[sid+3*(sp_stride)] = make_short4((short)I3.x, (short)I3.y, (short)I3.z, (short)I3.w); 00403 spinor[sid+4*(sp_stride)] = make_short4((short)I4.x, (short)I4.y, (short)I4.z, (short)I4.w); 00404 spinor[sid+5*(sp_stride)] = make_short4((short)I5.x, (short)I5.y, (short)I5.z, (short)I5.w); 00405 00406 return; 00407 } 00408 00409 #undef tmp0_re 00410 #undef tmp0_im 00411 #undef tmp1_re 00412 #undef tmp1_im 00413 #undef tmp2_re 00414 #undef tmp2_im 00415 #undef tmp3_re 00416 #undef tmp3_im 00417 00418 #endif //_TWIST_QUDA_CUH 00419 00420