/*
  QUDA v0.3.2
  A library for QCD on GPUs
*/
#include <stdlib.h>
#include <stdio.h>

#include <quda_internal.h>
#include <blas_quda.h>
#include <color_spinor_field.h>

#include <cuComplex.h>

// Maximum number of thread blocks used by the reduction kernels; this also
// sizes the device/host partial-result buffers allocated in initBlas().
#define REDUCE_MAX_BLOCKS 65536

#define REDUCE_DOUBLE 64
#define REDUCE_KAHAN 32

// On sm_13 and newer, reductions accumulate in double precision; older
// devices fall back to single precision with Kahan-style summation.
#if (__CUDA_ARCH__ >= 130)
#define REDUCE_TYPE REDUCE_DOUBLE
#define QudaSumFloat double
#define QudaSumComplex cuDoubleComplex
#define QudaSumFloat3 double3
#else
#define REDUCE_TYPE REDUCE_KAHAN
#define QudaSumFloat float
#define QudaSumComplex cuComplex
#define QudaSumFloat3 float3
#endif

// Required for the reduction kernels (no-op barrier outside emulation mode)
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif

// Device-side partial-result buffers used by the reduction kernels
static QudaSumFloat *d_reduceFloat=0;
static QudaSumComplex *d_reduceComplex=0;
static QudaSumFloat3 *d_reduceFloat3=0;

// Pinned host-side mirrors of the buffers above
static QudaSumFloat *h_reduceFloat=0;
static QudaSumComplex *h_reduceComplex=0;
static QudaSumFloat3 *h_reduceFloat3=0;

// Global flop/byte counters incremented by the blas routines
unsigned long long blas_quda_flops;
unsigned long long blas_quda_bytes;

// Launch configuration shared by all blas kernels; set via setBlock()
static dim3 blasBlock;
static dim3 blasGrid;

// generated by blas_test
#include <blas_param.h>

double2 operator+(const double2& x, const double2 &y) {
  return make_double2(x.x + y.x, x.y + y.y);
}

double3 operator+(const double3& x, const double3 &y) {
  double3 z;
  z.x = x.x + y.x; z.y = x.y + y.y; z.z = x.z + y.z;
  return z;
}

__device__ float2 operator*(const float a, const float2 x) {
  float2 y;
  y.x = a*x.x;
  y.y = a*x.y;
  return y;
}

// Component-wise operators for two-component vector types (float2/double2)

template <typename Float2>
__device__ Float2 operator+(const Float2 x, const Float2 y) {
  Float2 z;
  z.x = x.x + y.x;
  z.y = x.y + y.y;
  return z;
}

template <typename Float2>
__device__ Float2 operator+=(Float2 &x, const Float2 y) {
  x.x += y.x;
  x.y += y.y;
  return x;
}

template <typename Float2>
__device__ Float2 operator-=(Float2 &x, const Float2 y) {
  x.x -= y.x;
  x.y -= y.y;
  return x;
}

template <typename Float, typename Float2>
__device__ Float2 operator*=(Float2 &x, const Float a) {
  x.x *= a;
  x.y *= a;
  return x;
}

template <typename Float>
__device__ float4 operator*=(float4 &a, const Float &b) {
  a.x *= b;
  a.y *= b;
  a.z *= b;
  a.w *= b;
  return a;
}

// Zero a color-spinor field on the device
void zeroCuda(cudaColorSpinorField &a) { a.zero(); }

// Allocate the device and pinned-host scratch buffers used by the reduction
// kernels.  Idempotent: buffers that are already allocated are left alone.
void initBlas(void)
{
  // Compare against cudaSuccess so that ANY allocation failure is caught,
  // not just cudaErrorMemoryAllocation (the original check silently ignored
  // other error codes).
  if (!d_reduceFloat) {
    if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) != cudaSuccess) {
      errorQuda("Error allocating device reduction array");
    }
  }

  if (!d_reduceComplex) {
    if (cudaMalloc((void**) &d_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) != cudaSuccess) {
      errorQuda("Error allocating device reduction array");
    }
  }

  if (!d_reduceFloat3) {
    if (cudaMalloc((void**) &d_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) != cudaSuccess) {
      errorQuda("Error allocating device reduction array");
    }
  }

  if (!h_reduceFloat) {
    if (cudaMallocHost((void**) &h_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) != cudaSuccess) {
      errorQuda("Error allocating host reduction array");
    }
  }

  if (!h_reduceComplex) {
    if (cudaMallocHost((void**) &h_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) != cudaSuccess) {
      errorQuda("Error allocating host reduction array");
    }
  }

  if (!h_reduceFloat3) {
    if (cudaMallocHost((void**) &h_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) != cudaSuccess) {
      errorQuda("Error allocating host reduction array");
    }
  }
}

// Release the reduction buffers allocated by initBlas().  Pointers are reset
// to zero so a subsequent initBlas() reallocates instead of reusing freed
// memory (the original left them dangling).
void endBlas(void)
{
  if (d_reduceFloat) { cudaFree(d_reduceFloat); d_reduceFloat = 0; }
  if (d_reduceComplex) { cudaFree(d_reduceComplex); d_reduceComplex = 0; }
  if (d_reduceFloat3) { cudaFree(d_reduceFloat3); d_reduceFloat3 = 0; }
  if (h_reduceFloat) { cudaFreeHost(h_reduceFloat); h_reduceFloat = 0; }
  if (h_reduceComplex) { cudaFreeHost(h_reduceComplex); h_reduceComplex = 0; }
  if (h_reduceFloat3) { cudaFreeHost(h_reduceFloat3); h_reduceFloat3 = 0; }
}

// blasTuning = 1 turns off error checking
static int blasTuning = 0;

void setBlasTuning(int tuning)
{
  blasTuning = tuning;
}

// Record a tuned (threads, blocks) pair for the given kernel/precision slot
// in the tables generated by blas_test (see blas_param.h).
void setBlasParam(int kernel, int prec, int threads, int blocks)
{
  blas_threads[kernel][prec] = threads;
  blas_blocks[kernel][prec] = blocks;
}

// Set the shared launch configuration (blasBlock/blasGrid) for the given
// kernel index and problem length, using the tuned tables indexed by
// precision (0 = half, 1 = single, 2 = double).
void setBlock(int kernel, int length, QudaPrecision precision)
{
  int prec;
  switch(precision) {
  case QUDA_HALF_PRECISION:
    prec = 0;
    break;
  case QUDA_SINGLE_PRECISION:
    prec = 1;
    break;
  case QUDA_DOUBLE_PRECISION:
    prec = 2;
    break;
  default:
    // Previously `prec` was left uninitialized here (undefined behavior).
    errorQuda("Unsupported precision %d", (int)precision);
    prec = 0; // unreachable if errorQuda() aborts; silences warnings
    break;
  }

  int blocks = min(blas_blocks[kernel][prec], max(length/blas_threads[kernel][prec], 1));
  blasBlock.x = blas_threads[kernel][prec];
  blasBlock.y = 1;
  blasBlock.z = 1;

  blasGrid.x = blocks;
  blasGrid.y = 1;
  blasGrid.z = 1;
}

// Fetch a double2 through an int4 texture (textures cannot hold doubles
// directly); requires sm_13+ for __hiloint2double.
#if (__CUDA_ARCH__ >= 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t,i);
  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#else
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  // do nothing -- double precision unsupported below sm_13
  return make_double2(0.0, 0.0);
}
#endif

// Load the i-th complex pair from a strided field (overloaded on precision).
float2 __device__ read_Float2(float2 *x, int i) {
  return make_float2(x[i].x, x[i].y);
}

double2 __device__ read_Float2(double2 *x, int i) {
  return make_double2(x[i].x, x[i].y);
}

// Texture reads: double2 fields go through the int4 texture x##TexDouble2
// (see fetch_double2 above), float2 fields through x##TexSingle2.
#define READ_DOUBLE2_TEXTURE(x, i) \
  fetch_double2(x##TexDouble2, i)

#define READ_FLOAT2_TEXTURE(x, i) \
  tex1Dfetch(x##TexSingle2, i)

// Copy-construct a two-component vector of matching precision.
float2 __device__ make_Float2(float2 x) {
  return make_float2(x.x, x.y);
}

double2 __device__ make_Float2(double2 x) {
  return make_double2(x.x, x.y);
}

// Load a half-precision 4-spinor (6 float4 chunks) from texture `texHalf`
// and rescale each chunk by its per-site norm from `texNorm`.  Declares
// locals a##c and a##0..a##5; assumes a loop index `i` is in scope.
#define RECONSTRUCT_HALF_SPINOR(a, texHalf, texNorm, length)		\
  float a##c = tex1Dfetch(texNorm, i);					\
  float4 a##0 = tex1Dfetch(texHalf, i + 0*length);			\
  float4 a##1 = tex1Dfetch(texHalf, i + 1*length);			\
  float4 a##2 = tex1Dfetch(texHalf, i + 2*length);			\
  float4 a##3 = tex1Dfetch(texHalf, i + 3*length);			\
  float4 a##4 = tex1Dfetch(texHalf, i + 4*length);			\
  float4 a##5 = tex1Dfetch(texHalf, i + 5*length);			\
  a##0 *= a##c;								\
  a##1 *= a##c;								\
  a##2 *= a##c;								\
  a##3 *= a##c;								\
  a##4 *= a##c;								\
  a##5 *= a##c;

// Staggered variant: 3 float2 chunks per site instead of 6 float4.
#define RECONSTRUCT_HALF_SPINOR_ST(a, texHalf, texNorm, length)		\
  float a##c = tex1Dfetch(texNorm, i);					\
  float2 a##0 = tex1Dfetch(texHalf, i + 0*length);			\
  float2 a##1 = tex1Dfetch(texHalf, i + 1*length);			\
  float2 a##2 = tex1Dfetch(texHalf, i + 2*length);			\
  (a##0) *= a##c;							\
  (a##1) *= a##c;							\
  (a##2) *= a##c;


// Some musings on how to clean up the blas code using Boost
/*#define BOOST_RECONSTRUCT_HALF_SPINOR(z, j, a, texHalf, length)	\
  float4 a##k tex1Dfetch(texHalf, i + j*length);			\
  a##k *= a##c;

#define RECONSTRUCT_HALF_SPINOR(a, texHalf, texNorm, length)		\
  BOOST_PP_REPEAT(6, BOOST_RECONSTRUCT_HALF_SPINOR, a, texHalf, length) \
*/

// Like RECONSTRUCT_HALF_SPINOR but without the rescale: reads the raw
// (normalized-float) texture values plus the norm a##c.
// NOTE(review): the final backslash deliberately continues onto the blank
// line below -- do not insert anything between it and the next #define.
#define READ_HALF_SPINOR_TEX(a, tex, texNorm, length)			\
  float a##c = tex1Dfetch(texNorm, i);					\
  float4 a##0 = tex1Dfetch(tex, i + 0*length);				\
  float4 a##1 = tex1Dfetch(tex, i + 1*length);				\
  float4 a##2 = tex1Dfetch(tex, i + 2*length);				\
  float4 a##3 = tex1Dfetch(tex, i + 3*length);				\
  float4 a##4 = tex1Dfetch(tex, i + 4*length);				\
  float4 a##5 = tex1Dfetch(tex, i + 5*length);				\

// Variant that reads the norm from a bound array a##N rather than a texture.
#define READ_HALF_SPINOR(a, tex, length)				\
  float4 a##0 = tex1Dfetch(tex, i + 0*length);				\
  float4 a##1 = tex1Dfetch(tex, i + 1*length);				\
  float4 a##2 = tex1Dfetch(tex, i + 2*length);				\
  float4 a##3 = tex1Dfetch(tex, i + 3*length);				\
  float4 a##4 = tex1Dfetch(tex, i + 4*length);				\
  float4 a##5 = tex1Dfetch(tex, i + 5*length);				\
  float a##c = a##N[i];

#define READ_HALF_SPINOR_ST(a, tex, length)				\
  float2 a##0 = tex1Dfetch(tex, i + 0*length);				\
  float2 a##1 = tex1Dfetch(tex, i + 1*length);				\
  float2 a##2 = tex1Dfetch(tex, i + 2*length);				\
  float a##c = a##N[i];

// Scale/shift constants mapping floats into the 16-bit short range.
#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) * 0.5)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))

// Quantize float a (with norm factor c) to a short.
// NOTE(review): not an exact inverse of short2float below (the norm factor
// and shift ordering differ) -- confirm against the callers before changing.
__device__ short float2short(float c, float a) {
  //return (short)(a*MAX_SHORT);
  short rtn = (short)((a+SHIFT_FLOAT)*SCALE_FLOAT*c);
  return rtn;
}

__device__ float short2float(short a) {
  return (float)a/SCALE_FLOAT - SHIFT_FLOAT;
}

__device__ short4 float42short4(float c, float4 a) {
  return make_short4(float2short(c, a.x), float2short(c, a.y), float2short(c, a.z), float2short(c, a.w));
}

// NOTE(review): these two macros carry trailing semicolons, so they only
// work in statement/initializer position as used below.
#define FAST_ABS_MAX(a, b) fmaxf(fabsf(a), fabsf(b));
#define FAST_MAX(a, b) fmaxf(a, b);

// Maximum absolute value over the four components of a float4.
__device__ float fast_abs_max(float4 a) {
  float c0 = FAST_ABS_MAX(a.x, a.y);
  float c1 = FAST_ABS_MAX(a.z, a.w);
  return FAST_MAX(c0, c1);
}

// Quantize a single-precision 4-spinor (6 float4 chunks a##0..a##5) into
// half precision: store the per-site max-abs in n[i], then write the
// rescaled shorts to h.  MAX_SHORT is defined elsewhere in the project.
#define CONSTRUCT_HALF_SPINOR_FROM_SINGLE(h, n, a, length) {		\
    float c0 = fast_abs_max(a##0);					\
    float c1 = fast_abs_max(a##1);					\
    c0 = FAST_MAX(c0, c1);						\
    float c2 = fast_abs_max(a##2);					\
    float c3 = fast_abs_max(a##3);					\
    c1 = FAST_MAX(c2, c3);						\
    c0 = FAST_MAX(c0, c1);						\
    c2 = fast_abs_max(a##4);						\
    c3 = fast_abs_max(a##5);						\
    c1 = FAST_MAX(c2, c3);						\
    c0 = FAST_MAX(c0, c1);						\
    n[i] = c0;								\
    float C = __fdividef(MAX_SHORT, c0);				\
    h[i+0*length] = make_short4((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y), \
				(short)(C*(float)(a##0).z), (short)(C*(float)(a##0).w)); \
    h[i+1*length] = make_short4((short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y), \
				(short)(C*(float)(a##1).z), (short)(C*(float)(a##1).w)); \
    h[i+2*length] = make_short4((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y), \
				(short)(C*(float)(a##2).z), (short)(C*(float)(a##2).w)); \
    h[i+3*length] = make_short4((short)(C*(float)(a##3).x), (short)(C*(float)(a##3).y), \
				(short)(C*(float)(a##3).z), (short)(C*(float)(a##3).w)); \
    h[i+4*length] = make_short4((short)(C*(float)(a##4).x), (short)(C*(float)(a##4).y), \
				(short)(C*(float)(a##4).z), (short)(C*(float)(a##4).w)); \
    h[i+5*length] = make_short4((short)(C*(float)(a##5).x), (short)(C*(float)(a##5).y), \
				(short)(C*(float)(a##5).z), (short)(C*(float)(a##5).w));}

// Same quantization from a double-precision spinor (12 double2 chunks
// a##0..a##11 packed pairwise into 6 short4 chunks).
#define CONSTRUCT_HALF_SPINOR_FROM_DOUBLE(h, n, a, length)		\
  {float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y));			\
    float c1 = fmaxf(fabsf((a##1).x), fabsf((a##1).y));			\
    float c2 = fmaxf(fabsf((a##2).x), fabsf((a##2).y));			\
    float c3 = fmaxf(fabsf((a##3).x), fabsf((a##3).y));			\
    float c4 = fmaxf(fabsf((a##4).x), fabsf((a##4).y));			\
    float c5 = fmaxf(fabsf((a##5).x), fabsf((a##5).y));			\
    float c6 = fmaxf(fabsf((a##6).x), fabsf((a##6).y));			\
    float c7 = fmaxf(fabsf((a##7).x), fabsf((a##7).y));			\
    float c8 = fmaxf(fabsf((a##8).x), fabsf((a##8).y));			\
    float c9 = fmaxf(fabsf((a##9).x), fabsf((a##9).y));			\
    float c10 = fmaxf(fabsf((a##10).x), fabsf((a##10).y));		\
    float c11 = fmaxf(fabsf((a##11).x), fabsf((a##11).y));		\
    c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); c3 = fmaxf(c6, c7); \
    c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); \
    c2 = fmaxf(c4, c5); c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2);		\
    n[i] = c0;								\
    float C = __fdividef(MAX_SHORT, c0);				\
    h[i+0*length] = make_short4((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y), \
				(short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y)); \
    h[i+1*length] = make_short4((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y), \
				(short)(C*(float)(a##3).x), (short)(C*(float)(a##3).y)); \
    h[i+2*length] = make_short4((short)(C*(float)(a##4).x), (short)(C*(float)(a##4).y), \
				(short)(C*(float)(a##5).x), (short)(C*(float)(a##5).y)); \
    h[i+3*length] = make_short4((short)(C*(float)(a##6).x), (short)(C*(float)(a##6).y), \
				(short)(C*(float)(a##7).x), (short)(C*(float)(a##7).y)); \
    h[i+4*length] = make_short4((short)(C*(float)(a##8).x), (short)(C*(float)(a##8).y), \
				(short)(C*(float)(a##9).x), (short)(C*(float)(a##9).y)); \
    h[i+5*length] = make_short4((short)(C*(float)(a##10).x), (short)(C*(float)(a##10).y), \
				(short)(C*(float)(a##11).x), (short)(C*(float)(a##11).y));}

// Staggered variants: 3 complex pairs per site.
#define CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(h, n, a, length)		\
  {float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y));			\
    float c1 = fmaxf(fabsf((a##1).x), fabsf((a##1).y));			\
    float c2 = fmaxf(fabsf((a##2).x), fabsf((a##2).y));			\
    c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2);				\
    n[i] = c0;								\
    float C = __fdividef(MAX_SHORT, c0);				\
    h[i+0*length] = make_short2((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y)); \
    h[i+1*length] = make_short2((short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y)); \
    h[i+2*length] = make_short2((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y));}

#define CONSTRUCT_HALF_SPINOR_FROM_DOUBLE_ST(h, n, a, length)		\
  {float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y));			\
    float c1 = fmaxf(fabsf((a##1).x), fabsf((a##1).y));			\
    float c2 = fmaxf(fabsf((a##2).x), fabsf((a##2).y));			\
    c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2);				\
    n[i] = c0;								\
    float C = __fdividef(MAX_SHORT, c0);				\
    h[i+0*length] = make_short2((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y)); \
    h[i+1*length] = make_short2((short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y)); \
    h[i+2*length] = make_short2((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y));}


// Component sums.
#define SUM_FLOAT4(sum, a)			\
  float sum = a.x + a.y + a.z + a.w;

#define SUM_FLOAT2(sum, a)			\
  float sum = a.x + a.y;

// Real part of the complex dot product of two float4 (two complex pairs).
// Fermi (sm_20+) uses explicit fmaf to match the compiler's fused behavior.
#if (__CUDA_ARCH__ < 200)
#define REAL_DOT_FLOAT4(dot, a, b)		\
  float dot = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
#else
#define REAL_DOT_FLOAT4(dot, a, b)		\
  float dot = fmaf(a.x, b.x, 0.0f);		\
  dot = fmaf(a.y, b.y, dot);			\
  dot = fmaf(a.z, b.z, dot);			\
  dot = fmaf(a.w, b.w, dot)
#endif

#define REAL_DOT_FLOAT2(dot, a, b)		\
  float dot = a.x*b.x + a.y*b.y;

// Imaginary part of the complex dot product conj(a).b.
#if (__CUDA_ARCH__ < 200)
#define IMAG_DOT_FLOAT4(dot, a, b)		\
  float dot = a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z;
#else
#define IMAG_DOT_FLOAT4(dot, a, b)		\
  float dot = fmaf(a.x, b.y, 0.0f);		\
  dot = fmaf(-a.y, b.x, dot);			\
  dot = fmaf(a.z, b.w, dot);			\
  dot = fmaf(-a.w, b.z, dot)
#endif

#define IMAG_DOT_FLOAT2(dot, a, b)		\
  float dot = a.x*b.y - a.y*b.x;

// X *= a
#define AX_FLOAT4(a, X)				\
  X.x *= a; X.y *= a; X.z *= a; X.w *= a;

#define AX_FLOAT2(a, X)				\
  X.x *= a; X.y *= a;

// Y += X
#define XPY_FLOAT4(X, Y)			\
  Y.x += X.x; Y.y += X.y; Y.z += X.z; Y.w += X.w;

#define XPY_FLOAT2(X, Y)			\
  Y.x += X.x; Y.y += X.y;

// Y = X - Y
#define XMY_FLOAT4(X, Y)			\
  Y.x = X.x - Y.x; Y.y = X.y - Y.y; Y.z = X.z - Y.z; Y.w = X.w - Y.w;

#define XMY_FLOAT2(X, Y)			\
  Y.x = X.x - Y.x; Y.y = X.y - Y.y;

// Y -= X
#define MXPY_FLOAT4(X, Y)			\
  Y.x -= X.x; Y.y -= X.y; Y.z -= X.z; Y.w -= X.w;

#define MXPY_FLOAT2(X, Y)			\
  Y.x -= X.x; Y.y -= X.y;

// Y += a*X (real a)
#if (__CUDA_ARCH__ < 200)
#define AXPY_FLOAT4(a, X, Y)			\
  Y.x += a*X.x; Y.y += a*X.y;			\
  Y.z += a*X.z; Y.w += a*X.w;
#else
#define AXPY_FLOAT4(a, X, Y)			\
  Y.x = fmaf(a, X.x, Y.x); Y.y = fmaf(a, X.y, Y.y); \
  Y.z = fmaf(a, X.z, Y.z); Y.w = fmaf(a, X.w, Y.w);
#endif

#define AXPY_FLOAT2(a, X, Y)			\
  Y.x += a*X.x; Y.y += a*X.y;

// Y = a*X + b*Y (real a, b)
#define AXPBY_FLOAT4(a, X, b, Y)		\
  Y.x = b*Y.x; Y.x += a*X.x; Y.y = b*Y.y; Y.y += a*X.y; \
  Y.z = b*Y.z; Y.z += a*X.z; Y.w = b*Y.w; Y.w += a*X.w;

// NOTE(review): the trailing backslash here continues onto the blank line
// below, as in the original -- leave the blank line in place.
#define AXPBY_FLOAT2(a, X, b, Y)		\
  Y.x = b*Y.x; Y.x += a*X.x; Y.y = b*Y.y; Y.y += a*X.y; \

// Y = X + a*Y (real a)
#if (__CUDA_ARCH__ < 200)
#define XPAY_FLOAT4(X, a, Y)			\
  Y.x = X.x + a*Y.x; Y.y = X.y + a*Y.y;		\
  Y.z = X.z + a*Y.z; Y.w = X.w + a*Y.w;
#else
#define XPAY_FLOAT4(X, a, Y)			\
  Y.x = fmaf(a, Y.x, X.x); Y.y = fmaf(a, Y.y, X.y); \
  Y.z = fmaf(a, Y.z, X.z); Y.w = fmaf(a, Y.w, X.w);
#endif

#define XPAY_FLOAT2(X, a, Y)			\
  Y.x = X.x + a*Y.x; Y.y = X.y + a*Y.y;

// Y += a*X (complex a; X, Y hold complex pairs in .x/.y and .z/.w)
#if (__CUDA_ARCH__ < 200)
#define CAXPY_FLOAT4(a, X, Y)			\
  Y.x += a.x*X.x; Y.x -= a.y*X.y;		\
  Y.y += a.y*X.x; Y.y += a.x*X.y;		\
  Y.z += a.x*X.z; Y.z -= a.y*X.w;		\
  Y.w += a.y*X.z; Y.w += a.x*X.w;
#else
#define CAXPY_FLOAT4(a, X, Y)			\
  Y.x = fmaf(a.x, X.x, Y.x); Y.x = fmaf(-a.y, X.y, Y.x); \
  Y.y = fmaf(a.y, X.x, Y.y); Y.y = fmaf( a.x, X.y, Y.y); \
  Y.z = fmaf(a.x, X.z, Y.z); Y.z = fmaf(-a.y, X.w, Y.z); \
  Y.w = fmaf(a.y, X.z, Y.w); Y.w = fmaf( a.x, X.w, Y.w);
#endif // (__CUDA_ARCH__ < 200)

#if (__CUDA_ARCH__ < 200)
#define CAXPY_FLOAT2(a, X, Y)			\
  Y.x += a.x*X.x; Y.x -= a.y*X.y;		\
  Y.y += a.y*X.x; Y.y += a.x*X.y;
#else
#define CAXPY_FLOAT2(a, X, Y)			\
  Y.x = fmaf(a.x, X.x, Y.x); Y.x = fmaf(-a.y, X.y, Y.x); \
  Y.y = fmaf(a.y, X.x, Y.y); Y.y = fmaf( a.x, X.y, Y.y);
#endif // (__CUDA_ARCH__ < 200)

// Y -= a*X (complex a)
#define CMAXPY_FLOAT4(a, X, Y)			\
  Y.x -= a.x*X.x; Y.x += a.y*X.y;		\
  Y.y -= a.y*X.x; Y.y -= a.x*X.y;		\
  Y.z -= a.x*X.z; Y.z += a.y*X.w;		\
  Y.w -= a.y*X.z; Y.w -= a.x*X.w;

// Y = a*X + b*Y (complex a, b); temporary y avoids clobbering Y mid-update.
#define CAXPBY_FLOAT4(a, X, b, Y)					\
  { float2 y;								\
    y.x = a.x*X.x; y.x -= a.y*X.y; y.x += b.x*Y.x; y.x -= b.y*Y.y;	\
    y.y = a.y*X.x; y.y += a.x*X.y; y.y += b.y*Y.x; y.y += b.x*Y.y;	\
    Y.x = y.x; Y.y = y.y;						\
    y.x = a.x*X.z; y.x -= a.y*X.w; y.x += b.x*Y.z; y.x -= b.y*Y.w;	\
    y.y = a.y*X.z; y.y += a.x*X.w; y.y += b.y*Y.z; y.y += b.x*Y.w;	\
    Y.z = y.x; Y.w = y.y;}

#define CAXPBY_FLOAT2(a, X, b, Y)					\
  { float2 y;								\
    y.x = a.x*X.x; y.x -= a.y*X.y; y.x += b.x*Y.x; y.x -= b.y*Y.y;	\
    y.y = a.y*X.x; y.y += a.x*X.y; y.y += b.y*Y.x; y.y += b.x*Y.y;	\
    Y.x = y.x; Y.y = y.y;}


// Z = X + a*Y + b*Z (complex a, b)
#define CXPAYPBZ_FLOAT4(X, a, Y, b, Z)					\
  {float2 z;								\
    z.x = X.x + a.x*Y.x; z.x -= a.y*Y.y; z.x += b.x*Z.x; z.x -= b.y*Z.y; \
    z.y = X.y + a.y*Y.x; z.y += a.x*Y.y; z.y += b.y*Z.x; z.y += b.x*Z.y; \
    Z.x = z.x; Z.y = z.y;						\
    z.x = X.z + a.x*Y.z; z.x -= a.y*Y.w; z.x += b.x*Z.z; z.x -= b.y*Z.w; \
    z.y = X.w + a.y*Y.z; z.y += a.x*Y.w; z.y += b.y*Z.z; z.y += b.x*Z.w; \
    Z.z = z.x; Z.w = z.y;}

#define CXPAYPBZ_FLOAT2(X, a, Y, b, Z)					\
  {float2 z;								\
    z.x = X.x + a.x*Y.x; z.x -= a.y*Y.y; z.x += b.x*Z.x; z.x -= b.y*Z.y; \
    z.y = X.y + a.y*Y.x; z.y += a.x*Y.y; z.y += b.y*Z.x; z.y += b.x*Z.y; \
    Z.x = z.x; Z.y = z.y;}

// Z += a*X + b*Y (complex a, b)
#if (__CUDA_ARCH__ < 200)
#define CAXPBYPZ_FLOAT4(a, X, b, Y, Z)			\
  Z.x += a.x*X.x - a.y*X.y + b.x*Y.x - b.y*Y.y;		\
  Z.y += a.y*X.x + a.x*X.y + b.y*Y.x + b.x*Y.y;		\
  Z.z += a.x*X.z - a.y*X.w + b.x*Y.z - b.y*Y.w;		\
  Z.w += a.y*X.z + a.x*X.w + b.y*Y.z + b.x*Y.w;
#else
#define CAXPBYPZ_FLOAT4(a, X, b, Y, Z)					\
  Z.x = fmaf(a.x, X.x, Z.x); Z.x = fmaf(-a.y, X.y, Z.x); Z.x = fmaf(b.x, Y.x, Z.x); Z.x = fmaf(-b.y, Y.y, Z.x); \
  Z.y = fmaf(a.y, X.x, Z.y); Z.y = fmaf( a.x, X.y, Z.y); Z.y = fmaf(b.y, Y.x, Z.y); Z.y = fmaf( b.x, Y.y, Z.y); \
  Z.z = fmaf(a.x, X.z, Z.z); Z.z = fmaf(-a.y, X.w, Z.z); Z.z = fmaf(b.x, Y.z, Z.z); Z.z = fmaf(-b.y, Y.w, Z.z); \
  Z.w = fmaf(a.y, X.z, Z.w); Z.w = fmaf( a.x, X.w, Z.w); Z.w = fmaf(b.y, Y.z, Z.w); Z.w = fmaf( b.x, Y.w, Z.w);
#endif // (__CUDA_ARCH__ < 200)

#if (__CUDA_ARCH__ < 200)
#define CAXPBYPZ_FLOAT2(a, X, b, Y, Z)			\
  Z.x += a.x*X.x - a.y*X.y + b.x*Y.x - b.y*Y.y;		\
  Z.y += a.y*X.x + a.x*X.y + b.y*Y.x + b.x*Y.y;
#else
#define CAXPBYPZ_FLOAT2(a, X, b, Y, Z)					\
  Z.x = fmaf(a.x, X.x, Z.x); Z.x = fmaf(-a.y, X.y, Z.x); Z.x = fmaf(b.x, Y.x, Z.x); Z.x = fmaf(-b.y, Y.y, Z.x); \
  Z.y = fmaf(a.y, X.x, Z.y); Z.y = fmaf( a.x, X.y, Z.y); Z.y = fmaf(b.y, Y.x, Z.y); Z.y = fmaf( b.x, Y.y, Z.y);
#endif // (__CUDA_ARCH__ < 200)

// Double precision input spinor field (doubles fetched via int4 textures)
texture<int4, 1> xTexDouble2;
texture<int4, 1> yTexDouble2;
texture<int4, 1> zTexDouble2;
texture<int4, 1> wTexDouble2;
texture<int4, 1> uTexDouble2;

// Single precision input spinor field
texture<float2, 1> xTexSingle2;
texture<float2, 1> yTexSingle2;

texture<float4, 1> xTexSingle4;

// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> texHalf1;
texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt1;
texture<float, 1, cudaReadModeElementType> texNorm1;

// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> texHalf2;
texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt2;
texture<float, 1, cudaReadModeElementType> texNorm2;

// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> texHalf3;
texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt3;
texture<float, 1, cudaReadModeElementType> texNorm3;

// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> texHalf4;
texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt4;
texture<float, 1, cudaReadModeElementType> texNorm4;

// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> texHalf5;
texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt5;
texture<float, 1, cudaReadModeElementType> texNorm5;

// Abort unless two fields agree in precision, length and stride.
#define checkSpinor(a, b)						\
  {									\
    if (a.Precision() != b.Precision())					\
      errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
    if (a.Length() != b.Length())					\
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length());	\
    if (a.Stride() != b.Stride())					\
      errorQuda("strides do not match: %d %d", a.Stride(), b.Stride());	\
  }

// For kernels with precision conversion built in.
// Fixed: the outer brace was never closed and the message read "engths".
#define checkSpinorLength(a, b)						\
  {									\
    if (a.Length() != b.Length()) {					\
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
    }									\
  }

// Single -> double conversion, nSpin=4 layout (6 float4 -> 12 double2 per
// site).  Grid-stride loop over sites; `length` is the field stride.
__global__ void convertDSKernel(double2 *dst, float4 *src, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    for (int k=0; k<6; k++) {
      dst[2*k*length+i].x = src[k*length+i].x;
      dst[2*k*length+i].y = src[k*length+i].y;
      dst[(2*k+1)*length+i].x = src[k*length+i].z;
      dst[(2*k+1)*length+i].y = src[k*length+i].w;
    }
    i += gridSize;
  }
}

// Single -> double conversion, nSpin=1 (staggered) layout: 3 float2 per site.
__global__ void convertDSKernel(double2 *dst, float2 *src, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    for (int k=0; k<3; k++) {
      dst[k*length+i].x = src[k*length+i].x;
      dst[k*length+i].y = src[k*length+i].y;
    }
    i += gridSize;
  }
}

// Double -> single conversion, nSpin=4 layout (12 double2 -> 6 float4).
__global__ void convertSDKernel(float4 *dst, double2 *src, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    for (int k=0; k<6; k++) {
      dst[k*length+i].x = src[2*k*length+i].x;
      dst[k*length+i].y = src[2*k*length+i].y;
      dst[k*length+i].z = src[(2*k+1)*length+i].x;
      dst[k*length+i].w = src[(2*k+1)*length+i].y;
    }
    i += gridSize;
  }
}

// Double -> single conversion, nSpin=1 layout.
__global__ void convertSDKernel(float2 *dst, double2 *src, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    for (int k=0; k<3; k++) {
      dst[k*length+i].x = src[k*length+i].x;
      dst[k*length+i].y = src[k*length+i].y;
    }
    i += gridSize;
  }
}

// Single -> half conversion, nSpin=4: reads via xTexSingle4 (bound by the
// caller) and writes quantized shorts plus per-site norms.
__global__ void convertHSKernel(short4 *h, float *norm, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while(i < real_length) {
    float4 F0 = tex1Dfetch(xTexSingle4, i + 0*length);
    float4 F1 = tex1Dfetch(xTexSingle4, i + 1*length);
    float4 F2 = tex1Dfetch(xTexSingle4, i + 2*length);
    float4 F3 = tex1Dfetch(xTexSingle4, i + 3*length);
    float4 F4 = tex1Dfetch(xTexSingle4, i + 4*length);
    float4 F5 = tex1Dfetch(xTexSingle4, i + 5*length);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(h, norm, F, length);
    i += gridSize;
  }

}

// Single -> half conversion, nSpin=1: reads via xTexSingle2.
__global__ void convertHSKernel(short2 *h, float *norm, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while(i < real_length) {
    float2 F0 = tex1Dfetch(xTexSingle2, i + 0*length);
    float2 F1 = tex1Dfetch(xTexSingle2, i + 1*length);
    float2 F2 = tex1Dfetch(xTexSingle2, i + 2*length);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(h, norm, F, length);
    i += gridSize;
  }

}

// Half -> single conversion, nSpin=4: reads via texHalf1/texNorm1 (bound by
// the caller) and writes reconstructed float4 chunks.
__global__ void convertSHKernel(float4 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while (i<real_length) {
    RECONSTRUCT_HALF_SPINOR(I, texHalf1, texNorm1, length);
    res[0*length+i] = I0;
    res[1*length+i] = I1;
    res[2*length+i] = I2;
    res[3*length+i] = I3;
    res[4*length+i] = I4;
    res[5*length+i] = I5;
    i += gridSize;
  }
}

// Half -> single conversion, nSpin=1: reads via texHalfSt1/texNorm1.
__global__ void convertSHKernel(float2 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while (i<real_length) {
    RECONSTRUCT_HALF_SPINOR_ST(I, texHalfSt1, texNorm1, length);
    res[0*length+i] = I0;
    res[1*length+i] = I1;
    res[2*length+i] = I2;
    i += gridSize;
  }
}

// Double -> half conversion, nSpin=4: reads 12 double2 via xTexDouble2.
__global__ void convertHDKernel(short4 *h, float *norm, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while(i < real_length) {
    double2 F0 = fetch_double2(xTexDouble2, i+0*length);
    double2 F1 = fetch_double2(xTexDouble2, i+1*length);
    double2 F2 = fetch_double2(xTexDouble2, i+2*length);
    double2 F3 = fetch_double2(xTexDouble2, i+3*length);
    double2 F4 = fetch_double2(xTexDouble2, i+4*length);
    double2 F5 = fetch_double2(xTexDouble2, i+5*length);
    double2 F6 = fetch_double2(xTexDouble2, i+6*length);
    double2 F7 = fetch_double2(xTexDouble2, i+7*length);
    double2 F8 = fetch_double2(xTexDouble2, i+8*length);
    double2 F9 = fetch_double2(xTexDouble2, i+9*length);
    double2 F10 = fetch_double2(xTexDouble2, i+10*length);
    double2 F11 = fetch_double2(xTexDouble2, i+11*length);
    CONSTRUCT_HALF_SPINOR_FROM_DOUBLE(h, norm, F, length);
    i += gridSize;
  }
}

// Double -> half conversion, nSpin=1.
__global__ void convertHDKernel(short2 *h, float *norm, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while(i < real_length) {
    double2 F0 = fetch_double2(xTexDouble2, i+0*length);
    double2 F1 = fetch_double2(xTexDouble2, i+1*length);
    double2 F2 = fetch_double2(xTexDouble2, i+2*length);
    CONSTRUCT_HALF_SPINOR_FROM_DOUBLE_ST(h, norm, F, length);
    i += gridSize;
  }
}

// Half -> double conversion, nSpin=4: reconstruct then widen to double2.
__global__ void convertDHKernel(double2 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while(i < real_length) {
    RECONSTRUCT_HALF_SPINOR(I, texHalf1, texNorm1, length);
    res[0*length+i] = make_double2(I0.x, I0.y);
    res[1*length+i] = make_double2(I0.z, I0.w);
    res[2*length+i] = make_double2(I1.x, I1.y);
    res[3*length+i] = make_double2(I1.z, I1.w);
    res[4*length+i] = make_double2(I2.x, I2.y);
    res[5*length+i] = make_double2(I2.z, I2.w);
    res[6*length+i] = make_double2(I3.x, I3.y);
    res[7*length+i] = make_double2(I3.z, I3.w);
    res[8*length+i] = make_double2(I4.x, I4.y);
    res[9*length+i] = make_double2(I4.z, I4.w);
    res[10*length+i] = make_double2(I5.x, I5.y);
    res[11*length+i] = make_double2(I5.z, I5.w);
    i += gridSize;
  }

}

// Half -> double conversion, nSpin=1 (staggered).
__global__ void convertDHKernelSt(double2 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while(i < real_length) {
    RECONSTRUCT_HALF_SPINOR_ST(I, texHalfSt1, texNorm1, length);
    res[0*length+i] = make_double2(I0.x, I0.y);
    res[1*length+i] = make_double2(I1.x, I1.y);
    res[2*length+i] = make_double2(I2.x, I2.y);
    i += gridSize;
  }

}

// Copy src into dst, converting precision on the fly when they differ.
// Half-precision full fields are handled parity-by-parity via recursion.
// Updates blas_quda_bytes and synchronizes before returning.
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src) {

  if (src.nSpin != 1 && src.nSpin != 4){
    errorQuda("nSpin(%d) not supported in function %s, line %d\n", src.nSpin, __FUNCTION__, __LINE__);
  }
  if ((dst.precision == QUDA_HALF_PRECISION || src.precision == QUDA_HALF_PRECISION) &&
      (dst.siteSubset == QUDA_FULL_SITE_SUBSET || src.siteSubset == QUDA_FULL_SITE_SUBSET)) {
    copyCuda(dst.Even(), src.Even());
    copyCuda(dst.Odd(), src.Odd());
    return;
  }

  // For a given dst precision, there are two non-trivial possibilities for the
  // src precision.  The higher one corresponds to kernel index 0 (in the table
  // of block and grid dimensions), while the lower one corresponds to index 1.
  // Parentheses added for clarity; `&&` already bound tighter than `||`.
  int id;
  if (src.Precision() == QUDA_DOUBLE_PRECISION ||
      (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION)) {
    id = 0;
  } else {
    id = 1;
  }
  setBlock(id, dst.stride, dst.Precision());

  blas_quda_bytes += src.real_length*((int)src.precision + (int)dst.precision);

  if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
    if (src.nSpin == 4){
      convertDSKernel<<<blasGrid, blasBlock>>>((double2*)dst.v, (float4*)src.v, src.stride);
    }else{ //src.nSpin == 1
      convertDSKernel<<<blasGrid, blasBlock>>>((double2*)dst.v, (float2*)src.v, src.stride);
    }

  } else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
    if (src.nSpin == 4){
      convertSDKernel<<<blasGrid, blasBlock>>>((float4*)dst.v, (double2*)src.v, src.stride);
    }else{ //src.nSpin ==1
      convertSDKernel<<<blasGrid, blasBlock>>>((float2*)dst.v, (double2*)src.v, src.stride);
    }
  } else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
    blas_quda_bytes += src.volume*sizeof(float);
    int spinor_bytes = src.length*sizeof(short);
    if (src.nSpin == 4){
      cudaBindTexture(0, texHalf1, src.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/12);
      convertSHKernel<<<blasGrid, blasBlock>>>((float4*)dst.v, src.stride, src.volume);
      cudaUnbindTexture(texHalf1);
      cudaUnbindTexture(texNorm1);
    }else{ //nSpin== 1;
      cudaBindTexture(0, texHalfSt1, src.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/3);
      convertSHKernel<<<blasGrid, blasBlock>>>((float2*)dst.v, src.stride, src.volume);
      cudaUnbindTexture(texHalfSt1);
      cudaUnbindTexture(texNorm1);
    }
  } else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
    blas_quda_bytes += dst.volume*sizeof(float);
    int spinor_bytes = src.length*sizeof(float);
    if (src.nSpin == 4){
      cudaBindTexture(0, xTexSingle4, src.v, spinor_bytes);
      convertHSKernel<<<blasGrid, blasBlock>>>((short4*)dst.v, (float*)dst.norm, src.stride, src.volume);
      cudaUnbindTexture(xTexSingle4);
    }else{ //nSpinr == 1
      cudaBindTexture(0, xTexSingle2, src.v, spinor_bytes);
      convertHSKernel<<<blasGrid, blasBlock>>>((short2*)dst.v, (float*)dst.norm, src.stride, src.volume);
      cudaUnbindTexture(xTexSingle2);
    }
  } else if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
    blas_quda_bytes += src.volume*sizeof(float);
    int spinor_bytes = src.length*sizeof(short);
    if (src.nSpin == 4){
      cudaBindTexture(0, texHalf1, src.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/12);
      convertDHKernel<<<blasGrid, blasBlock>>>((double2*)dst.v, src.stride, src.volume);
      cudaUnbindTexture(texHalf1);
      cudaUnbindTexture(texNorm1);
    }else{//nSpinr == 1
      cudaBindTexture(0, texHalfSt1, src.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/3);
      convertDHKernelSt<<<blasGrid, blasBlock>>>((double2*)dst.v, src.stride, src.volume);
      cudaUnbindTexture(texHalfSt1);
      cudaUnbindTexture(texNorm1);
    }
  } else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
    blas_quda_bytes += dst.volume*sizeof(float);
    int spinor_bytes = src.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, src.v, spinor_bytes);
    if (src.nSpin == 4){
      convertHDKernel<<<blasGrid, blasBlock>>>((short4*)dst.v, (float*)dst.norm, src.stride, src.volume);
    }else{ //nSpinr == 1
      convertHDKernel<<<blasGrid, blasBlock>>>((short2*)dst.v, (float*)dst.norm, src.stride, src.volume);
    }
    cudaUnbindTexture(xTexDouble2);
  } else {
    // Same precision: plain device-to-device copy (plus norms for half).
    cudaMemcpy(dst.v, src.v, dst.bytes, cudaMemcpyDeviceToDevice);
    if (dst.precision == QUDA_HALF_PRECISION) {
      cudaMemcpy(dst.norm, src.norm, dst.bytes/(dst.nColor*dst.nSpin), cudaMemcpyDeviceToDevice);
      blas_quda_bytes += 2*dst.real_length*sizeof(float);
    }
  }

  cudaThreadSynchronize();
  if (!blasTuning) checkCudaError();

}

// y = a*x + b*y over a generic-precision field, grid-stride loop.
template <typename Float, typename Float2>
__global__ void axpbyKernel(Float a, Float2 *x, Float b, Float2 *y, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    y[i] = a*x[i] + b*y[i];
    i += gridSize;
  }
}

// Half-precision y = a*x + b*y, nSpin=4: x via texHalf1/texNorm1 and y via
// texHalf2/texNorm2 (bound by the caller); result requantized into yH/yN.
__global__ void axpbyHKernel(float a, float b, short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    AXPBY_FLOAT4(a, x0, b, y0);
    AXPBY_FLOAT4(a, x1, b, y1);
    AXPBY_FLOAT4(a, x2, b, y2);
    AXPBY_FLOAT4(a, x3, b, y3);
    AXPBY_FLOAT4(a, x4, b, y4);
    AXPBY_FLOAT4(a, x5, b, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }

}

// Half-precision y = a*x + b*y, nSpin=1 (staggered layout).
__global__ void axpbyHKernel(float a, float b, short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    AXPBY_FLOAT2(a, x0, b, y0);
    AXPBY_FLOAT2(a, x1, b, y1);
    AXPBY_FLOAT2(a, x2, b, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }

}


// performs the operation y[i] = a*x[i] + b*y[i]
//
// Dispatches on precision: double and single launch axpbyKernel directly;
// half precision recurses on parities for full fields, then binds the
// spinor/norm textures and launches the Wilson (nSpin=4) or staggered
// (nSpin=1) axpbyHKernel.  Updates the global flop/byte counters.
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y) {
  // Validate the pair before configuring the launch geometry.  The original
  // called setBlock() first; every sibling wrapper (xpyCuda, axpyCuda,
  // xpayCuda, mxpyCuda) checks first, so do the same here.
  checkSpinor(x, y);
  setBlock(2, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    axpbyKernel<<<blasGrid, blasBlock>>>(a, (double*)x.v, b, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    // single precision works on float2 pairs, hence length/2
    axpbyKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, (float)b, (float2*)y.v, x.length/2);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half-precision kernels are single-parity: recurse on even/odd
      axpbyCuda(a, x.Even(), b, y.Even());
      axpbyCuda(a, x.Odd(), b, y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      axpbyHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short4*)y.v,
                                            (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1) {//staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      axpbyHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short2*)y.v,
                                            (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm traffic (2 reads + 1 write)
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 3*x.real_length;

  if (!blasTuning) checkCudaError();
}

// y[i] = x[i] + y[i] (double or single precision), grid-stride loop.
template <typename Float>
__global__ void xpyKernel(Float *x, Float *y, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    y[i] += x[i];
    i += gridSize;
  }
}

// Half-precision (Wilson) y += x, via the texHalf1/2, texNorm1/2 textures;
// result re-normalized into (yH, yN).
__global__ void xpyHKernel(short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    XPY_FLOAT4(x0, y0);
    XPY_FLOAT4(x1, y1);
    XPY_FLOAT4(x2, y2);
    XPY_FLOAT4(x3, y3);
    XPY_FLOAT4(x4, y4);
    XPY_FLOAT4(x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }

}

// Half-precision (staggered) y += x.
__global__ void xpyHKernel(short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    XPY_FLOAT2(x0, y0);
    XPY_FLOAT2(x1, y1);
    XPY_FLOAT2(x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }

}

// performs the operation y[i] = x[i] + y[i]
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(3, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    xpyKernel<<<blasGrid, blasBlock>>>((double*)x.v, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    xpyKernel<<<blasGrid, blasBlock>>>((float2*)x.v, (float2*)y.v, x.length/2);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      xpyCuda(x.Even(), y.Even());
      xpyCuda(x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){
//wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      xpyHKernel<<<blasGrid, blasBlock>>>((short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      xpyHKernel<<<blasGrid, blasBlock>>>((short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm traffic for half precision
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += x.real_length;

  if (!blasTuning) checkCudaError();
}

// y[i] += a*x[i] for a real scalar a (double or single precision).
template <typename Float, typename Float2>
__global__ void axpyKernel(Float a, Float2 *x, Float2 *y, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    y[i] += a*x[i];
    i += gridSize;
  }
}

// Half-precision (Wilson) y += a*x, via texHalf1/2, texNorm1/2; result
// re-normalized into (yH, yN).
__global__ void axpyHKernel(float a, short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    AXPY_FLOAT4(a, x0, y0);
    AXPY_FLOAT4(a, x1, y1);
    AXPY_FLOAT4(a, x2, y2);
    AXPY_FLOAT4(a, x3, y3);
    AXPY_FLOAT4(a, x4, y4);
    AXPY_FLOAT4(a, x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }

}


// Half-precision (staggered) y += a*x.
__global__ void axpyHKernel(float a, short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    AXPY_FLOAT2(a, x0, y0);
    AXPY_FLOAT2(a, x1, y1);
    AXPY_FLOAT2(a, x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }

}

// performs the operation y[i] = a*x[i] + y[i]
// Precision dispatch follows the same pattern as axpbyCuda/xpyCuda above.
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(4, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    axpyKernel<<<blasGrid, blasBlock>>>(a, (double*)x.v, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpyKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, (float2*)y.v, x.length/2);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half precision is handled per parity
      axpyCuda(a, x.Even(), y.Even());
      axpyCuda(a, x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      axpyHKernel<<<blasGrid, blasBlock>>>((float)a, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      axpyHKernel<<<blasGrid, blasBlock>>>((float)a, (short2*)y.v, (float*)y.norm, y.stride,
                                           y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 2*x.real_length;

  if (!blasTuning) checkCudaError();
}

// y[i] = x[i] + a*y[i] for a real scalar a.
template <typename Float, typename Float2>
__global__ void xpayKernel(const Float2 *x, Float a, Float2 *y, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    y[i] = x[i] + a*y[i];
    i += gridSize;
  }
}

// Half-precision (Wilson) y = x + a*y.
__global__ void xpayHKernel(float a, short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    XPAY_FLOAT4(x0, a, y0);
    XPAY_FLOAT4(x1, a, y1);
    XPAY_FLOAT4(x2, a, y2);
    XPAY_FLOAT4(x3, a, y3);
    XPAY_FLOAT4(x4, a, y4);
    XPAY_FLOAT4(x5, a, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }
}

// Half-precision (staggered) y = x + a*y.
__global__ void xpayHKernel(float a, short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    XPAY_FLOAT2(x0, a, y0);
    XPAY_FLOAT2(x1, a, y1);
    XPAY_FLOAT2(x2, a, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }
}


// performs the operation y[i] = x[i] + a*y[i]
void xpayCuda(const cudaColorSpinorField &x, const double &a,
cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(5, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    xpayKernel<<<blasGrid, blasBlock>>>((double*)x.v, a, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    xpayKernel<<<blasGrid, blasBlock>>>((float2*)x.v, (float)a, (float2*)y.v, x.length/2);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half precision is handled per parity
      xpayCuda(x.Even(), a, y.Even());
      xpayCuda(x.Odd(), a, y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      xpayHKernel<<<blasGrid, blasBlock>>>((float)a, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin ==1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      xpayHKernel<<<blasGrid, blasBlock>>>((float)a, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 2*x.real_length;

  if (!blasTuning) checkCudaError();
}

// y[i] -= x[i] (double or single precision).
template <typename Float>
__global__ void mxpyKernel(Float *x, Float *y, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    y[i] -= x[i];
    i += gridSize;
  }
}

// Half-precision (Wilson) y -= x, via texHalf1/2, texNorm1/2.
__global__ void mxpyHKernel(short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    MXPY_FLOAT4(x0, y0);
    MXPY_FLOAT4(x1, y1);
    MXPY_FLOAT4(x2, y2);
    MXPY_FLOAT4(x3, y3);
    MXPY_FLOAT4(x4, y4);
    MXPY_FLOAT4(x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }

}

// Half-precision (staggered) y -= x.
__global__ void mxpyHKernel(short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    MXPY_FLOAT2(x0, y0);
    MXPY_FLOAT2(x1, y1);
    MXPY_FLOAT2(x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }

}


// performs the operation y[i] -= x[i] (minus x plus y)
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(6, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    mxpyKernel<<<blasGrid, blasBlock>>>((double*)x.v, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    mxpyKernel<<<blasGrid, blasBlock>>>((float2*)x.v, (float2*)y.v, x.length/2);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half precision is handled per parity
      mxpyCuda(x.Even(), y.Even());
      mxpyCuda(x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      mxpyHKernel<<<blasGrid, blasBlock>>>((short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1) { //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      mxpyHKernel<<<blasGrid, blasBlock>>>((short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += x.real_length;

  if (!blasTuning) checkCudaError();
}

// x[i] *= a for a real scalar a (double or single precision).
template <typename Float, typename Float2>
__global__ void axKernel(Float a, Float2 *x, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    x[i] *= a;
    i += gridSize;
  }
}

// Half-precision (Wilson) x *= a, read through texHalf1/texNorm1 and
// written back re-normalized to (xH, xN).
__global__ void axHKernel(float a, short4 *xH, float *xN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    AX_FLOAT4(a, x0); AX_FLOAT4(a, x1); AX_FLOAT4(a, x2);
    AX_FLOAT4(a, x3); AX_FLOAT4(a, x4); AX_FLOAT4(a, x5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(xH, xN, x, stride);
    i += gridSize;
  }

}

// Half-precision (staggered) x *= a.
__global__ void axHKernel(float a, short2 *xH, float *xN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    AX_FLOAT2(a, x0); AX_FLOAT2(a, x1);
AX_FLOAT2(a, x2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(xH, xN, x, stride);
    i += gridSize;
  }

}

// performs the operation x[i] = a*x[i]
// Note: in-place, so the half-precision path binds x itself to the texture
// and overwrites (x.v, x.norm).
void axCuda(const double &a, cudaColorSpinorField &x) {
  setBlock(7, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    axKernel<<<blasGrid, blasBlock>>>(a, (double*)x.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, x.length/2);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half precision is handled per parity
      axCuda(a, x.Even());
      axCuda(a, x.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      axHKernel<<<blasGrid, blasBlock>>>((float)a, (short4*)x.v, (float*)x.norm, x.stride, x.volume);
    }else if (x.nSpin ==1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      axHKernel<<<blasGrid, blasBlock>>>((float)a, (short2*)x.v, (float*)x.norm, x.stride, x.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 2*x.volume*sizeof(float);
  }
  blas_quda_bytes += 2*x.real_length*x.precision;
  blas_quda_flops += x.real_length;

  if (!blasTuning) checkCudaError();
}

// Complex y[i] += a*x[i], double precision; x is read through the
// xTexDouble2 texture (READ_DOUBLE2_TEXTURE), elements are (re,im) pairs.
template <typename Float2>
__global__ void caxpyDKernel(Float2 a, Float2 *x, Float2 *y, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 Z = READ_DOUBLE2_TEXTURE(x, i);
    // complex multiply-accumulate: y += a*Z
    y[i].x += a.x*Z.x - a.y*Z.y;
    y[i].y += a.y*Z.x + a.x*Z.y;
    i += gridSize;
  }

}

// Complex y[i] += a*x[i], single precision (plain global-memory reads).
template <typename Float2>
__global__ void caxpySKernel(Float2 a, Float2 *x, Float2 *y, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 Z = read_Float2(x, i);
    y[i].x += a.x*Z.x - a.y*Z.y;
    y[i].y += a.y*Z.x + a.x*Z.y;
    i += gridSize;
  }

}

// Half-precision (Wilson) complex y += a*x, via texHalf1/2, texNorm1/2.
__global__ void caxpyHKernel(float2 a, short4 *yH, float *yN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    CAXPY_FLOAT4(a, x0, y0);
    CAXPY_FLOAT4(a, x1, y1);
    CAXPY_FLOAT4(a, x2, y2);
    CAXPY_FLOAT4(a, x3, y3);
    CAXPY_FLOAT4(a, x4, y4);
    CAXPY_FLOAT4(a, x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }

}

// Half-precision (staggered) complex y += a*x.
__global__ void caxpyHKernel(float2 a, short2 *yH, float *yN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    CAXPY_FLOAT2(a, x0, y0);
    CAXPY_FLOAT2(a, x1, y1);
    CAXPY_FLOAT2(a, x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }

}

// performs the operation y[i] += a*x[i] (complex scalar a)
// length = x.length/2 because the kernels work on complex (2-real) elements.
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  int length = x.length/2;
  setBlock(8, length, x.precision);
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 4*x.real_length;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes);
    double2 a2 = make_double2(real(a), imag(a));
    caxpyDKernel<<<blasGrid, blasBlock>>>(a2, (double2*)x.v, (double2*)y.v, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    caxpySKernel<<<blasGrid, blasBlock>>>(a2, (float2*)x.v, (float2*)y.v, length);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half precision is handled per parity
      caxpyCuda(a, x.Even(), y.Even());
      caxpyCuda(a, x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      float2 a2 = make_float2(real(a), imag(a));
      caxpyHKernel<<<blasGrid, blasBlock>>>(a2, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      float2 a2 = make_float2(real(a), imag(a));
      caxpyHKernel<<<blasGrid, blasBlock>>>(a2, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);
  }

  if (!blasTuning) checkCudaError();
}

// Complex y[i] = a*x[i] + b*y[i], double precision; both operands read
// through the xTexDouble2/yTexDouble2 textures.
template <typename Float2>
__global__ void caxpbyDKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while
 (i < len) {
    Float2 Z1 = READ_DOUBLE2_TEXTURE(x, i);
    Float2 Z2 = READ_DOUBLE2_TEXTURE(y, i);
    // complex y = a*Z1 + b*Z2
    y[i].x = a.x*Z1.x + b.x*Z2.x - a.y*Z1.y - b.y*Z2.y;
    y[i].y = a.y*Z1.x + b.y*Z2.x + a.x*Z1.y + b.x*Z2.y;
    i += gridSize;
  }
}

// Complex y[i] = a*x[i] + b*y[i], single precision.
template <typename Float2>
__global__ void caxpbySKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 Z1 = read_Float2(x, i);
    Float2 Z2 = read_Float2(y, i);
    y[i].x = a.x*Z1.x + b.x*Z2.x - a.y*Z1.y - b.y*Z2.y;
    y[i].y = a.y*Z1.x + b.y*Z2.x + a.x*Z1.y + b.x*Z2.y;
    i += gridSize;
  }
}

// Half-precision (Wilson) complex y = a*x + b*y, via texHalf1/2, texNorm1/2.
__global__ void caxpbyHKernel(float2 a, float2 b, short4 *yH, float *yN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    CAXPBY_FLOAT4(a, x0, b, y0);
    CAXPBY_FLOAT4(a, x1, b, y1);
    CAXPBY_FLOAT4(a, x2, b, y2);
    CAXPBY_FLOAT4(a, x3, b, y3);
    CAXPBY_FLOAT4(a, x4, b, y4);
    CAXPBY_FLOAT4(a, x5, b, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }
}

// Half-precision (staggered) complex y = a*x + b*y.
__global__ void caxpbyHKernel(float2 a, float2 b, short2 *yH, float *yN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    CAXPBY_FLOAT2(a, x0, b, y0);
    CAXPBY_FLOAT2(a, x1, b, y1);
    CAXPBY_FLOAT2(a, x2, b, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }
}


// performs the operation y[i] = a*x[i] + b*y[i] (complex scalars a, b)
// (the original comment said "c*x[i]"; a is the scalar actually applied to x)
void caxpbyCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  int length = x.length/2;  // complex elements
  setBlock(9, length, x.precision);
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 7*x.real_length;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes);
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    caxpbyDKernel<<<blasGrid, blasBlock>>>(a2, (double2*)x.v, b2, (double2*)y.v, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    caxpbySKernel<<<blasGrid, blasBlock>>>(a2, (float2*)x.v, b2, (float2*)y.v, length);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      // half precision is handled per parity
      caxpbyCuda(a, x.Even(), b, y.Even());
      caxpbyCuda(a, x.Odd(), b, y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      caxpbyHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      caxpbyHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);
  }

  if (!blasTuning) checkCudaError();
}

// Complex z[i] = x[i] + a*y[i] + b*z[i], double precision; x and y are read
// through textures, z from global memory.
template <typename Float2>
__global__ void cxpaypbzDKernel(Float2 *x, Float2 a, Float2 *y, Float2 b, Float2 *z, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 T1 = READ_DOUBLE2_TEXTURE(x, i);
    Float2 T2 = READ_DOUBLE2_TEXTURE(y, i);
    Float2 T3 = read_Float2(z, i);

    // accumulate a*T2 and b*T3 into T1 = x[i]
    T1.x += a.x*T2.x - a.y*T2.y;
    T1.y += a.y*T2.x + a.x*T2.y;
    T1.x += b.x*T3.x - b.y*T3.y;
    T1.y += b.y*T3.x + b.x*T3.y;

    z[i] = make_Float2(T1);
    i += gridSize;
  }

}

// Complex z[i] = x[i] + a*y[i] + b*z[i], single precision.
template <typename Float2>
__global__ void cxpaypbzSKernel(Float2 *x, Float2 a, Float2 *y, Float2 b, Float2 *z, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 T1 = read_Float2(x, i);
    Float2 T2 = read_Float2(y, i);
    Float2 T3 = read_Float2(z, i);

    T1.x += a.x*T2.x - a.y*T2.y;
    T1.y += a.y*T2.x + a.x*T2.y;
    T1.x += b.x*T3.x - b.y*T3.y;
    T1.y += b.y*T3.x + b.x*T3.y;

    z[i] = make_Float2(T1);
    i += gridSize;
  }

}

// Half-precision (Wilson) z = x + a*y + b*z, via texHalf1/2/3, texNorm1/2/3.
__global__ void cxpaypbzHKernel(float2 a, float2 b, short4 *zH, float *zN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride); 01679 RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride); 01680 RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride); 01681 CXPAYPBZ_FLOAT4(x0, a, y0, b, z0); 01682 CXPAYPBZ_FLOAT4(x1, a, y1, b, z1); 01683 CXPAYPBZ_FLOAT4(x2, a, y2, b, z2); 01684 CXPAYPBZ_FLOAT4(x3, a, y3, b, z3); 01685 CXPAYPBZ_FLOAT4(x4, a, y4, b, z4); 01686 CXPAYPBZ_FLOAT4(x5, a, y5, b, z5); 01687 CONSTRUCT_HALF_SPINOR_FROM_SINGLE(zH, zN, z, stride); 01688 i += gridSize; 01689 } 01690 } 01691 01692 01693 __global__ void cxpaypbzHKernel(float2 a, float2 b, short2 *zH, float *zN, int stride, int length) { 01694 01695 unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x; 01696 unsigned int gridSize = gridDim.x*blockDim.x; 01697 while (i < length) { 01698 RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride); 01699 RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride); 01700 RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride); 01701 CXPAYPBZ_FLOAT2(x0, a, y0, b, z0); 01702 CXPAYPBZ_FLOAT2(x1, a, y1, b, z1); 01703 CXPAYPBZ_FLOAT2(x2, a, y2, b, z2); 01704 CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(zH, zN, z, stride); 01705 i += gridSize; 01706 } 01707 } 01708 01709 01710 // performs the operation z[i] = x[i] + a*y[i] + b*z[i] 01711 void cxpaypbzCuda(cudaColorSpinorField &x, const Complex &a, cudaColorSpinorField &y, 01712 const Complex &b, cudaColorSpinorField &z) { 01713 checkSpinor(x,y); 01714 checkSpinor(x,z); 01715 int length = x.length/2; 01716 setBlock(10, length, x.precision); 01717 blas_quda_bytes += 4*x.real_length*x.precision; 01718 blas_quda_flops += 8*x.real_length; 01719 if (x.precision == QUDA_DOUBLE_PRECISION) { 01720 int spinor_bytes = x.length*sizeof(double); 01721 cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 01722 cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 01723 double2 a2 = make_double2(real(a), imag(a)); 01724 double2 b2 = make_double2(real(b), imag(b)); 01725 cxpaypbzDKernel<<<blasGrid, 
blasBlock>>>((double2*)x.v, a2, (double2*)y.v, b2, (double2*)z.v, length); 01726 } else if (x.precision == QUDA_SINGLE_PRECISION) { 01727 float2 a2 = make_float2(real(a), imag(a)); 01728 float2 b2 = make_float2(real(b), imag(b)); 01729 cxpaypbzSKernel<<<blasGrid, blasBlock>>>((float2*)x.v, a2, (float2*)y.v, b2, (float2*)z.v, length); 01730 } else { 01731 if (x.siteSubset == QUDA_FULL_SITE_SUBSET) { 01732 cxpaypbzCuda(x.Even(), a, y.Even(), b, z.Even()); 01733 cxpaypbzCuda(x.Odd(), a, y.Odd(), b, z.Odd()); 01734 return; 01735 } 01736 int spinor_bytes = x.length*sizeof(short); 01737 blas_quda_bytes += 4*x.volume*sizeof(float); 01738 if (x.nSpin ==4 ){//wilson 01739 cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 01740 cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); 01741 cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 01742 cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12); 01743 cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 01744 cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12); 01745 float2 a2 = make_float2(real(a), imag(a)); 01746 float2 b2 = make_float2(real(b), imag(b)); 01747 cxpaypbzHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short4*)z.v, (float*)z.norm, z.stride, z.volume); 01748 } else if (x.nSpin ==1 ){//staggered 01749 cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 01750 cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3); 01751 cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 01752 cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3); 01753 cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes); 01754 cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3); 01755 float2 a2 = make_float2(real(a), imag(a)); 01756 float2 b2 = make_float2(real(b), imag(b)); 01757 cxpaypbzHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short2*)z.v, (float*)z.norm, z.stride, z.volume); 01758 }else{ 01759 errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin); 01760 } 01761 } 01762 01763 if (!blasTuning) checkCudaError(); 01764 } 01765 01766 01767 template 
<typename Float, typename Float2>  // continues the "template" keyword on the previous file line
__global__ void axpyBzpcxDKernel(Float a, Float2 *x, Float2 *y, Float b, Float2 *z, Float c, int len)
{
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    // x and z are read through double2 textures
    Float2 x_i = READ_DOUBLE2_TEXTURE(x, i);
    Float2 z_i = READ_DOUBLE2_TEXTURE(z, i);

    // y += a*x
    y[i].x += a*x_i.x;
    y[i].y += a*x_i.y;

    // x = b*z + c*x
    x[i].x = b*z_i.x + c*x_i.x;
    x[i].y = b*z_i.y + c*x_i.y;

    i += gridSize;
  }
}


// Single-precision version; operands read directly from global memory.
template <typename Float, typename Float2>
__global__ void axpyBzpcxSKernel(Float a, Float2 *x, Float2 *y, Float b, Float2 *z, Float c, int len)
{
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 x_i = read_Float2(x, i);
    Float2 z_i = read_Float2(z, i);

    y[i].x += a*x_i.x;
    y[i].y += a*x_i.y;

    x[i].x = b*z_i.x + c*x_i.x;
    x[i].y = b*z_i.y + c*x_i.y;

    i += gridSize;
  }
}

// Half precision, Wilson (short4): reconstruct x/y/z from the half/norm
// textures, update in single precision, re-pack x and y to half.
__global__ void axpyBzpcxHKernel(float a, float b, float c, short4 *xH, float *xN, short4 *yH, float *yN, int stride, int length)
{

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    // interleaved per-component updates: y_k += a*x_k, then x_k = b*z_k + c*x_k
    AXPY_FLOAT4(a, x0, y0);
    AXPBY_FLOAT4(b, z0, c, x0);
    AXPY_FLOAT4(a, x1, y1);
    AXPBY_FLOAT4(b, z1, c, x1);
    AXPY_FLOAT4(a, x2, y2);
    AXPBY_FLOAT4(b, z2, c, x2);
    AXPY_FLOAT4(a, x3, y3);
    AXPBY_FLOAT4(b, z3, c, x3);
    AXPY_FLOAT4(a, x4, y4);
    AXPBY_FLOAT4(b, z4, c, x4);
    AXPY_FLOAT4(a, x5, y5);
    AXPBY_FLOAT4(b, z5, c, x5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(xH, xN, x, stride);
    i += gridSize;
  }
}


// Half precision, staggered (short2) variant of the kernel above.
__global__ void axpyBzpcxHKernel(float a, float b, float c, short2 *xH, float *xN, short2 *yH, float *yN, int stride, int length)
{

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    AXPY_FLOAT2(a, x0, y0);
    AXPBY_FLOAT2(b, z0, c, x0);
    AXPY_FLOAT2(a, x1, y1);
    AXPBY_FLOAT2(b, z1, c, x1);
    AXPY_FLOAT2(a, x2, y2);
    AXPBY_FLOAT2(b, z2, c, x2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(xH, xN, x, stride);
    i += gridSize;
  }
}

// performs the operations: {y[i] = a*x[i] + y[i]; x[i] = b*z[i] + c*x[i]}
// (function continues on the next file line)
void axpyBzpcxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y, const double &b,
		   cudaColorSpinorField& z, const double &c)
{
  checkSpinor(x,y);
  checkSpinor(x,z);
  setBlock(11, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes);
    axpyBzpcxDKernel<<<blasGrid, blasBlock>>>(a, (double2*)x.v, (double2*)y.v, b, (double2*)z.v, c, x.length/2);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpyBzpcxSKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, (float2*)y.v, (float)b, (float2*)z.v, (float)c, x.length/2);
  } else {
    // half precision: a full field is processed as its two parity halves
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET){
      axpyBzpcxCuda(a, x.Even(), y.Even(), b, z.Even(), c);
// (continuation of axpyBzpcxCuda: odd-parity recursion and half-precision path)
      axpyBzpcxCuda(a, x.Odd(), y.Odd(), b, z.Odd(), c);
      return ;  // accounting below is done by the recursive calls only
    }

    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);
      axpyBzpcxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (float)c, (short4*)x.v, (float*)x.norm,
						(short4*)y.v, (float*)y.norm, z.stride, z.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);
      axpyBzpcxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (float)c, (short2*)x.v, (float*)x.norm,
						(short2*)y.v, (float*)y.norm, z.stride, z.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 5*x.volume*sizeof(float);  // norm-field traffic
  }
  blas_quda_bytes += 5*x.real_length*x.precision;
  blas_quda_flops += 10*x.real_length;

  if (!blasTuning) checkCudaError();
}



// Double precision: y += a*x; x = z + b*x (x and z read via double2 textures).
// (statement split across file lines; continues on the next one)
template <typename Float, typename Float2>
__global__ void axpyZpbxDKernel(Float a, Float2 *x, Float2 *y, Float2 *z, Float b, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 x_i = READ_DOUBLE2_TEXTURE(x, i);
    Float2 z_i = READ_DOUBLE2_TEXTURE(z, i);
    y[i].x += a*x_i.x;
    y[i].y += a*x_i.y;
    x[i].x
// (continuation of axpyZpbxDKernel: x = z + b*x)
= z_i.x + b*x_i.x;
    x[i].y = z_i.y + b*x_i.y;
    i += gridSize;
  }
}

// Single-precision version; operands read directly.
template <typename Float, typename Float2>
__global__ void axpyZpbxSKernel(Float a, Float2 *x, Float2 *y, Float2 *z, Float b, int len) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 x_i = read_Float2(x, i);
    Float2 z_i = read_Float2(z, i);
    y[i].x += a*x_i.x;
    y[i].y += a*x_i.y;
    x[i].x = z_i.x + b*x_i.x;
    x[i].y = z_i.y + b*x_i.y;
    i += gridSize;
  }
}

// Half precision, Wilson (short4): y += a*x first (and store y), then
// reconstruct z and compute x = z + b*x before storing x.
__global__ void axpyZpbxHKernel(float a, float b, short4 *xH, float *xN, short4 *yH, float *yN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    AXPY_FLOAT4(a, x0, y0);
    AXPY_FLOAT4(a, x1, y1);
    AXPY_FLOAT4(a, x2, y2);
    AXPY_FLOAT4(a, x3, y3);
    AXPY_FLOAT4(a, x4, y4);
    AXPY_FLOAT4(a, x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    XPAY_FLOAT4(z0, b, x0);
    XPAY_FLOAT4(z1, b, x1);
    XPAY_FLOAT4(z2, b, x2);
    XPAY_FLOAT4(z3, b, x3);
    XPAY_FLOAT4(z4, b, x4);
    XPAY_FLOAT4(z5, b, x5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(xH, xN, x, stride);
    i += gridSize;
  }
}

// Half precision, staggered (short2) variant.
__global__ void axpyZpbxHKernel(float a, float b, short2 *xH, float *xN, short2 *yH, float *yN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    AXPY_FLOAT2(a, x0, y0);
    XPAY_FLOAT2(z0, b, x0);
    AXPY_FLOAT2(a, x1, y1);
    XPAY_FLOAT2(z1, b, x1);
    AXPY_FLOAT2(a, x2, y2);
    XPAY_FLOAT2(z2, b, x2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(xH, xN, x, stride);
    i += gridSize;
  }
}


// performs the operations: {y[i] = a*x[i] + y[i]; x[i] = z[i] + b*x[i]}
// (function continues on the next file line)
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y,
		  cudaColorSpinorField &z, const double &b) {
  checkSpinor(x,y);
  checkSpinor(x,z);
  setBlock(12, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes);
    axpyZpbxDKernel<<<blasGrid, blasBlock>>>
      (a, (double2*)x.v, (double2*)y.v, (double2*)z.v, b, x.length/2);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpyZpbxSKernel<<<blasGrid, blasBlock>>>
      ((float)a, (float2*)x.v, (float2*)y.v, (float2*)z.v, (float)b, x.length/2);
  } else {
    // half precision: a full field recurses into its two parity halves
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      axpyZpbxCuda(a, x.Even(), y.Even(), z.Even(), b);
      axpyZpbxCuda(a, x.Odd(), y.Odd(), z.Odd(), b);
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin ==4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);
      axpyZpbxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short4*)x.v, (float*)x.norm,
// (continuation of the axpyZpbxHKernel launch argument list)
(short4*)y.v, (float*)y.norm, z.stride, z.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);
      axpyZpbxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short2*)x.v, (float*)x.norm,
					       (short2*)y.v, (float*)y.norm, z.stride, z.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 5*x.volume*sizeof(float);  // norm-field traffic
  }
  blas_quda_bytes += 5*x.real_length*x.precision;
  blas_quda_flops += 8*x.real_length;

  if (!blasTuning) checkCudaError();
}

// Double precision: z += a*x + b*y, then y -= b*w (complex scalars a, b).
// x and y are read through textures; z and w directly.
template <typename Float2>
__global__ void caxpbypzYmbwDKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 X = READ_DOUBLE2_TEXTURE(x, i);
    Float2 Z = read_Float2(z, i);

    // Z += a*X
    Z.x += a.x*X.x - a.y*X.y;
    Z.y += a.y*X.x + a.x*X.y;

    // Z += b*Y
    Float2 Y = READ_DOUBLE2_TEXTURE(y, i);
    Z.x += b.x*Y.x - b.y*Y.y;
    Z.y += b.y*Y.x + b.x*Y.y;
    z[i] = make_Float2(Z);

    // Y -= b*W
    Float2 W = read_Float2(w, i);

    Y.x -= b.x*W.x - b.y*W.y;
    Y.y -= b.y*W.x + b.x*W.y;

    y[i] = make_Float2(Y);
    i += gridSize;
  }
}

// Single-precision version; all operands read directly.
template <typename Float2>
__global__ void caxpbypzYmbwSKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, int len) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < len) {
    Float2 X = read_Float2(x, i);
    Float2 Z = read_Float2(z, i);

    Z.x += a.x*X.x - a.y*X.y;
    Z.y += a.y*X.x + a.x*X.y;

    Float2 Y = read_Float2(y, i);
    Z.x += b.x*Y.x - b.y*Y.y;
    Z.y += b.y*Y.x + b.x*Y.y;
    z[i] = make_Float2(Z);

    Float2 W = read_Float2(w, i);

    Y.x -= b.x*W.x - b.y*W.y;
    Y.y -= b.y*W.x + b.x*W.y;

    y[i] = make_Float2(Y);
    i += gridSize;
  }
}

// Half precision, Wilson (short4): reconstruct x/y/z, update z, then apply
// y -= b*w using the raw half-spinor w scaled by its norm wc (b2 = -wc*b).
__global__ void caxpbypzYmbwHKernel(float2 a, float2 b, float *xN, short4 *yH, float *yN,
				    short4 *zH, float *zN, float *wN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    CAXPBYPZ_FLOAT4(a, x0, b, y0, z0);
    CAXPBYPZ_FLOAT4(a, x1, b, y1, z1);
    CAXPBYPZ_FLOAT4(a, x2, b, y2, z2);
    CAXPBYPZ_FLOAT4(a, x3, b, y3, z3);
    CAXPBYPZ_FLOAT4(a, x4, b, y4, z4);
    CAXPBYPZ_FLOAT4(a, x5, b, y5, z5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(zH, zN, z, stride);
    READ_HALF_SPINOR(w, texHalf4, stride);
    float2 b2 = -wc*b;
    CAXPY_FLOAT4(b2, w0, y0);
    CAXPY_FLOAT4(b2, w1, y1);
    CAXPY_FLOAT4(b2, w2, y2);
    CAXPY_FLOAT4(b2, w3, y3);
    CAXPY_FLOAT4(b2, w4, y4);
    CAXPY_FLOAT4(b2, w5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }
}

// Half precision, staggered (short2) variant.
__global__ void caxpbypzYmbwHKernel(float2 a, float2 b, float *xN, short2 *yH, float *yN,
				    short2 *zH, float *zN, float *wN, int stride, int length) {

  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    CAXPBYPZ_FLOAT2(a, x0, b, y0, z0);
    CAXPBYPZ_FLOAT2(a, x1, b, y1, z1);
    CAXPBYPZ_FLOAT2(a, x2, b, y2, z2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(zH, zN, z, stride);
    READ_HALF_SPINOR_ST(w, texHalfSt4, stride);
    float2 b2 = -wc*b;
    CAXPY_FLOAT2(b2, w0, y0);
    CAXPY_FLOAT2(b2, w1, y1);
    CAXPY_FLOAT2(b2, w2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }
}

// performs the operation z[i] = a*x[i] + b*y[i] + z[i] and y[i] -= b*w[i]
// (function continues on the next file line)
void caxpbypzYmbwCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y,
		      cudaColorSpinorField &z, cudaColorSpinorField &w) {
  checkSpinor(x,y);
  checkSpinor(x,z);
  checkSpinor(x,w);
  int length = x.length/2;  // number of complex elements
  setBlock(13, length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes);
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes);
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    caxpbypzYmbwDKernel<<<blasGrid, blasBlock>>>(a2, (double2*)x.v, b2, (double2*)y.v,
						 (double2*)z.v, (double2*)w.v, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    caxpbypzYmbwSKernel<<<blasGrid, blasBlock>>>(a2, (float2*)x.v, b2, (float2*)y.v,
						 (float2*)z.v, (float2*)w.v, length);
  } else {
    // half precision: a full field recurses into its two parity halves
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      caxpbypzYmbwCuda(a, x.Even(), b, y.Even(), z.Even(), w.Even());
      caxpbypzYmbwCuda(a, x.Odd(), b, y.Odd(), z.Odd(), w.Odd());
      return;
// (continuation of caxpbypzYmbwCuda: half-precision launch path)
    }
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 6*x.volume*sizeof(float);  // norm-field traffic
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf4, w.v, spinor_bytes);
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/12);
      caxpbypzYmbwHKernel<<<blasGrid, blasBlock>>>(a2, b2, (float*)x.norm, (short4*)y.v, (float*)y.norm,
						   (short4*)z.v, (float*)z.norm, (float*)w.norm,
						   z.stride, z.volume);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt4, w.v, spinor_bytes);
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/3);
      caxpbypzYmbwHKernel<<<blasGrid, blasBlock>>>(a2, b2, (float*)x.norm, (short2*)y.v, (float*)y.norm,
						   (short2*)z.v, (float*)z.norm, (float*)w.norm,
						   z.stride, z.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
  }
  // accounting is done after the parity recursion (which returns early above)
  blas_quda_bytes += 6*x.real_length*x.precision;
  blas_quda_flops += 12*x.real_length;

  if (!blasTuning) checkCudaError();
}


// Computes c = a + b in "double single" precision.
// Computes c = a + b in "double single" (compensated) precision using Knuth's
// two-sum: c0/c1 receive the high/low words of the result, a0/a1 and b0/b1 are
// the high/low words of the operands.
// FIX: b0/b1 were declared as plain `float`, which silently truncates the
// addend when QudaSumFloat is double (REDUCE_DOUBLE builds); use QudaSumFloat
// for consistency with zcadd()/dsadd3() below.  float callers still work.
__device__ void dsadd(volatile QudaSumFloat &c0, volatile QudaSumFloat &c1, const volatile QudaSumFloat &a0,
		      const volatile QudaSumFloat &a1, const QudaSumFloat b0, const QudaSumFloat b1) {
  // Compute dsa + dsb using Knuth's trick.
  QudaSumFloat t1 = a0 + b0;
  QudaSumFloat e = t1 - a0;
  QudaSumFloat t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;
  // The result is t1 + t2, after normalization.
  c0 = e = t1 + t2;
  c1 = t2 - (e - t1);
}

// Computes c = a + b in "double single" precision (complex version):
// the real and imaginary parts are each summed with Knuth's two-sum.
__device__ void zcadd(volatile QudaSumComplex &c0, volatile QudaSumComplex &c1, const volatile QudaSumComplex &a0,
		      const volatile QudaSumComplex &a1, const volatile QudaSumComplex &b0, const volatile QudaSumComplex &b1) {
  // Compute dsa + dsb using Knuth's trick (real part).
  QudaSumFloat t1 = a0.x + b0.x;
  QudaSumFloat e = t1 - a0.x;
  QudaSumFloat t2 = ((b0.x - e) + (a0.x - (t1 - e))) + a1.x + b1.x;
  // The result is t1 + t2, after normalization.
  c0.x = e = t1 + t2;
  c1.x = t2 - (e - t1);

  // Compute dsa + dsb using Knuth's trick (imaginary part).
  t1 = a0.y + b0.y;
  e = t1 - a0.y;
  t2 = ((b0.y - e) + (a0.y - (t1 - e))) + a1.y + b1.y;
  // The result is t1 + t2, after normalization.
  c0.y = e = t1 + t2;
  c1.y = t2 - (e - t1);
}

// Computes c = a + b in "double single" precision (float3 version);
// the three components are summed independently.  (continues on next file line)
__device__ void dsadd3(volatile QudaSumFloat3 &c0, volatile QudaSumFloat3 &c1, const volatile QudaSumFloat3 &a0,
		       const volatile QudaSumFloat3 &a1, const volatile QudaSumFloat3 &b0, const volatile QudaSumFloat3 &b1) {
  // Compute dsa + dsb using Knuth's trick (x component).
  QudaSumFloat t1 = a0.x + b0.x;
  QudaSumFloat e = t1 - a0.x;
  QudaSumFloat t2 = ((b0.x - e) + (a0.x - (t1 - e))) + a1.x + b1.x;
  // The result is t1 + t2, after normalization.
// (continuation of dsadd3: normalize x, then handle y and z components)
  c0.x = e = t1 + t2;
  c1.x = t2 - (e - t1);

  // Compute dsa + dsb using Knuth's trick (y component).
  t1 = a0.y + b0.y;
  e = t1 - a0.y;
  t2 = ((b0.y - e) + (a0.y - (t1 - e))) + a1.y + b1.y;
  // The result is t1 + t2, after normalization.
  c0.y = e = t1 + t2;
  c1.y = t2 - (e - t1);

  // Compute dsa + dsb using Knuth's trick (z component).
  t1 = a0.z + b0.z;
  e = t1 - a0.z;
  t2 = ((b0.z - e) + (a0.z - (t1 - e))) + a1.z + b1.z;
  // The result is t1 + t2, after normalization.
  c0.z = e = t1 + t2;
  c1.z = t2 - (e - t1);
}

//
// double sumCuda(float *a, int n) {}
//
// Each of the following sections instantiates a reduction kernel family by
// defining the REDUCE_* macros and including reduce_core.h.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumD##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) a[i]
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// single precision: each float2 element contributes both components
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumS##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) a[i].x + a[i].y
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// half precision (Wilson): sum the reconstructed single-precision components
// (macro definition continues on the next file line)
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumH##suffix
#define REDUCE_TYPES Float *aN, int stride
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i) \
  READ_HALF_SPINOR(a, texHalf1, stride); \
  SUM_FLOAT4(s0, a0); \
  SUM_FLOAT4(s1, a1); \
  SUM_FLOAT4(s2, a2); \
  SUM_FLOAT4(s3, a3); \
  SUM_FLOAT4(s4, a4); \
  SUM_FLOAT4(s5, a5); \
  s0 += \
s1; s2 += s3; s4 += s5; s0 += s2; s0 += s4;
#define REDUCE_OPERATION(i) (ac*s0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// half precision (staggered): same sum over three float2 components
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumHSt##suffix
#define REDUCE_TYPES Float *aN, int stride
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i) \
  READ_HALF_SPINOR_ST(a, texHalfSt1, stride); \
  SUM_FLOAT2(s0, a0); \
  SUM_FLOAT2(s1, a1); \
  SUM_FLOAT2(s2, a2); \
  s0 += s1; s0 += s2;
#define REDUCE_OPERATION(i) (ac*s0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// Global sum over all real elements of a, dispatched on precision.
//
// FIX: for half precision a full field is summed as even + odd halves; the
// flop/byte counters used to be incremented before that recursion, so full
// half-precision fields were counted twice (parent plus both children).  The
// recursion is now taken before any accounting.
double sumCuda(cudaColorSpinorField &a) {
  const int id = 14;
  if (a.siteSubset == QUDA_FULL_SITE_SUBSET &&
      a.precision != QUDA_DOUBLE_PRECISION && a.precision != QUDA_SINGLE_PRECISION)
    return sumCuda(a.Even()) + sumCuda(a.Odd());
  blas_quda_flops += a.real_length;
  blas_quda_bytes += a.real_length*a.precision;
  if (a.precision == QUDA_DOUBLE_PRECISION) {
    return sumDCuda((double*)a.v, a.length, id, a.precision);
  } else if (a.precision == QUDA_SINGLE_PRECISION) {
    return sumSCuda((float2*)a.v, a.length/2, id, a.precision);
  } else {
    int spinor_bytes = a.length*sizeof(short);
    blas_quda_bytes += a.volume*sizeof(float);  // norm-field traffic
    if (a.nSpin ==4) { // wilson
      cudaBindTexture(0, texHalf1, a.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/12);
      return sumHCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    } else if (a.nSpin ==1) { // staggered
      cudaBindTexture(0, texHalfSt1, a.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/3);
      return sumHStCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    } else {
      errorQuda("ERROR: nSpin=%d is not supported\n", a.nSpin);
      return 0;
    }
  }
}

//
// double normCuda(float *a, int n) {}
//
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normD##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i]*a[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// single precision: |a|^2 accumulated per complex element
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normS##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i].x*a[i].x + a[i].y*a[i].y)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

//
// double normHCuda(char *, int n) {}
//
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normH##suffix
#define REDUCE_TYPES Float *aN, int stride // dummy type
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i) \
  READ_HALF_SPINOR(a, texHalf1, stride); \
  REAL_DOT_FLOAT4(norm0, a0, a0); \
  REAL_DOT_FLOAT4(norm1, a1, a1); \
  REAL_DOT_FLOAT4(norm2, a2, a2); \
  REAL_DOT_FLOAT4(norm3, a3, a3); \
  REAL_DOT_FLOAT4(norm4, a4, a4); \
  REAL_DOT_FLOAT4(norm5, a5, a5); \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2, norm0 += norm4;
#define REDUCE_OPERATION(i) (ac*ac*norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

template
<unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normHSt##suffix
#define REDUCE_TYPES Float *aN, int stride // dummy type
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i) \
  READ_HALF_SPINOR_ST(a, texHalfSt1, stride); \
  REAL_DOT_FLOAT2(norm0, a0, a0); \
  REAL_DOT_FLOAT2(norm1, a1, a1); \
  REAL_DOT_FLOAT2(norm2, a2, a2); \
  norm0 += norm1; norm0 += norm2;
#define REDUCE_OPERATION(i) (ac*ac*norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// Returns the squared L2 norm of a, dispatched on precision.
//
// FIX: for half precision a full field is reduced as even + odd halves; the
// flop/byte counters used to be incremented before that recursion, double
// counting full half-precision fields.  The recursion is now taken first.
double normCuda(const cudaColorSpinorField &a) {
  const int id = 15;
  if (a.siteSubset == QUDA_FULL_SITE_SUBSET &&
      a.precision != QUDA_DOUBLE_PRECISION && a.precision != QUDA_SINGLE_PRECISION)
    return normCuda(a.Even()) + normCuda(a.Odd());
  blas_quda_flops += 2*a.real_length;
  blas_quda_bytes += a.real_length*a.precision;
  if (a.precision == QUDA_DOUBLE_PRECISION) {
    return normDCuda((double*)a.v, a.length, id, a.precision);
  } else if (a.precision == QUDA_SINGLE_PRECISION) {
    return normSCuda((float2*)a.v, a.length/2, id, a.precision);
  } else {
    int spinor_bytes = a.length*sizeof(short);
    // bytes of norm data per spinor: (colors*spins*2 shorts) per float
    int half_norm_ratio = (a.nColor*a.nSpin*2*sizeof(short))/sizeof(float);
    blas_quda_bytes += (a.real_length*a.precision) / (a.nColor * a.nSpin);
    cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/half_norm_ratio);
    if (a.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, a.v, spinor_bytes);
      return normHCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    }else if (a.nSpin == 1) { //staggered
      cudaBindTexture(0, texHalfSt1, a.v, spinor_bytes);
      return normHStCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, a.nSpin);
      return 0;
    }
  }

}

//
// double
reDotProductFCuda(float *a, float *b, int n) {}
//
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductD##suffix
#define REDUCE_TYPES Float *a, Float *b
#define REDUCE_PARAMS a, b
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i]*b[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// single precision: real part of the inner product, both components per element
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductS##suffix
#define REDUCE_TYPES Float *a, Float *b
#define REDUCE_PARAMS a, b
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

//
// double reDotProductHCuda(float *a, float *b, int n) {}
//
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductH##suffix
#define REDUCE_TYPES Float *aN, Float *bN, int stride
#define REDUCE_PARAMS aN, bN, stride
#define REDUCE_AUXILIARY(i) \
  READ_HALF_SPINOR(a, texHalf1, stride); \
  READ_HALF_SPINOR(b, texHalf2, stride); \
  REAL_DOT_FLOAT4(rdot0, a0, b0); \
  REAL_DOT_FLOAT4(rdot1, a1, b1); \
  REAL_DOT_FLOAT4(rdot2, a2, b2); \
  REAL_DOT_FLOAT4(rdot3, a3, b3); \
  REAL_DOT_FLOAT4(rdot4, a4, b4); \
  REAL_DOT_FLOAT4(rdot5, a5, b5); \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_OPERATION(i) (ac*bc*rdot0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// Half-precision staggered (nSpin==1) real dot product: reads two
// staggered half spinors (3 float2 components each, plus norm scales
// ac/bc) from textures texHalfSt1/texHalfSt2 and reduces their real dot
// product, rescaled by ac*bc.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductHSt##suffix
#define REDUCE_TYPES Float *aN, Float *bN, int stride
#define REDUCE_PARAMS aN, bN, stride
#define REDUCE_AUXILIARY(i) \
  READ_HALF_SPINOR_ST(a, texHalfSt1, stride); \
  READ_HALF_SPINOR_ST(b, texHalfSt2, stride); \
  REAL_DOT_FLOAT2(rdot0, a0, b0); \
  REAL_DOT_FLOAT2(rdot1, a1, b1); \
  REAL_DOT_FLOAT2(rdot2, a2, b2); \
  rdot0 += rdot1; rdot0 += rdot2;
#define REDUCE_OPERATION(i) (ac*bc*rdot0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION

// Returns the real part of the inner product (a,b).  Dispatches on
// precision, and for half precision on nSpin (4 = Wilson, 1 = staggered),
// binding the spinor/norm textures the generated kernels read.  Full
// fields recurse on their even/odd halves.
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b) {
  const int id = 16;  // kernel id for the blas tuning tables
  blas_quda_flops += 2*a.real_length;  // multiply + add per real component
  checkSpinor(a, b);
  blas_quda_bytes += 2*a.real_length*a.precision;
  if (a.precision == QUDA_DOUBLE_PRECISION) {
    return reDotProductDCuda((double*)a.v, (double*)b.v, a.length, id, a.precision);
  } else if (a.precision == QUDA_SINGLE_PRECISION) {
    return reDotProductSCuda((float2*)a.v, (float2*)b.v, a.length/2, id, a.precision);
  } else {
    if (a.siteSubset == QUDA_FULL_SITE_SUBSET) {
      return reDotProductCuda(a.Even(), b.Even()) + reDotProductCuda(a.Odd(), b.Odd());
    }
    blas_quda_bytes += 2*a.volume*sizeof(float);  // the per-site norm arrays read by the kernel
    int spinor_bytes = a.length*sizeof(short);
    if (a.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, a.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, b.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, b.norm, spinor_bytes/12);
      return reDotProductHCuda((float*)a.norm, (float*)b.norm, a.stride, a.volume, id, a.precision);
    }else if (a.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, a.v, spinor_bytes);
cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/3); 02560 cudaBindTexture(0, texHalfSt2, b.v, spinor_bytes); 02561 cudaBindTexture(0, texNorm2, b.norm, spinor_bytes/3); 02562 return reDotProductHStCuda((float*)a.norm, (float*)b.norm, a.stride, a.volume, id, a.precision); 02563 }else{ 02564 errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, a.nSpin); 02565 return 0; 02566 } 02567 } 02568 02569 } 02570 02571 // 02572 // double axpyNormCuda(float a, float *x, float *y, n){} 02573 // 02574 // First performs the operation y[i] = a*x[i] + y[i] 02575 // Second returns the norm of y 02576 // 02577 02578 template <unsigned int reduce_threads, typename Float> 02579 #define REDUCE_FUNC_NAME(suffix) axpyNormF##suffix 02580 #define REDUCE_TYPES Float a, Float *x, Float *y 02581 #define REDUCE_PARAMS a, x, y 02582 #define REDUCE_AUXILIARY(i) y[i] = a*x[i] + y[i] 02583 #define REDUCE_OPERATION(i) (y[i]*y[i]) 02584 #include "reduce_core.h" 02585 #undef REDUCE_FUNC_NAME 02586 #undef REDUCE_TYPES 02587 #undef REDUCE_PARAMS 02588 #undef REDUCE_AUXILIARY 02589 #undef REDUCE_OPERATION 02590 02591 template <unsigned int reduce_threads, typename Float> 02592 #define REDUCE_FUNC_NAME(suffix) axpyNormH##suffix 02593 #define REDUCE_TYPES Float a, short4 *yH, float *yN, int stride 02594 #define REDUCE_PARAMS a, yH, yN, stride 02595 #define REDUCE_AUXILIARY(i) \ 02596 RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride); \ 02597 RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride); \ 02598 AXPY_FLOAT4(a, x0, y0); \ 02599 REAL_DOT_FLOAT4(norm0, y0, y0); \ 02600 AXPY_FLOAT4(a, x1, y1); \ 02601 REAL_DOT_FLOAT4(norm1, y1, y1); \ 02602 AXPY_FLOAT4(a, x2, y2); \ 02603 REAL_DOT_FLOAT4(norm2, y2, y2); \ 02604 AXPY_FLOAT4(a, x3, y3); \ 02605 REAL_DOT_FLOAT4(norm3, y3, y3); \ 02606 AXPY_FLOAT4(a, x4, y4); \ 02607 REAL_DOT_FLOAT4(norm4, y4, y4); \ 02608 AXPY_FLOAT4(a, x5, y5); \ 02609 REAL_DOT_FLOAT4(norm5, y5, y5); \ 02610 norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; 
norm0 += norm4; \ 02611 CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride); 02612 #define REDUCE_OPERATION(i) (norm0) 02613 #include "reduce_core.h" 02614 #undef REDUCE_FUNC_NAME 02615 #undef REDUCE_TYPES 02616 #undef REDUCE_PARAMS 02617 #undef REDUCE_AUXILIARY 02618 #undef REDUCE_OPERATION 02619 02620 template <unsigned int reduce_threads, typename Float> 02621 #define REDUCE_FUNC_NAME(suffix) axpyNormH##suffix 02622 #define REDUCE_TYPES Float a, short2 *yH, float *yN, int stride 02623 #define REDUCE_PARAMS a, yH, yN, stride 02624 #define REDUCE_AUXILIARY(i) \ 02625 RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride); \ 02626 RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride); \ 02627 AXPY_FLOAT2(a, x0, y0); \ 02628 REAL_DOT_FLOAT2(norm0, y0, y0); \ 02629 AXPY_FLOAT2(a, x1, y1); \ 02630 REAL_DOT_FLOAT2(norm1, y1, y1); \ 02631 AXPY_FLOAT2(a, x2, y2); \ 02632 REAL_DOT_FLOAT2(norm2, y2, y2); \ 02633 norm0 += norm1; norm0 += norm2; \ 02634 CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride); 02635 #define REDUCE_OPERATION(i) (norm0) 02636 #include "reduce_core.h" 02637 #undef REDUCE_FUNC_NAME 02638 #undef REDUCE_TYPES 02639 #undef REDUCE_PARAMS 02640 #undef REDUCE_AUXILIARY 02641 #undef REDUCE_OPERATION 02642 02643 double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) { 02644 const int id = 17; 02645 blas_quda_flops += 4*x.real_length; 02646 checkSpinor(x,y); 02647 blas_quda_bytes += 3*x.real_length*x.precision; 02648 if (x.precision == QUDA_DOUBLE_PRECISION) { 02649 return axpyNormFCuda(a, (double*)x.v, (double*)y.v, x.length, id, x.precision); 02650 } else if (x.precision == QUDA_SINGLE_PRECISION) { 02651 return axpyNormFCuda((float)a, (float*)x.v, (float*)y.v, x.length, id, x.precision); 02652 } else { 02653 if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 02654 return axpyNormCuda(a, x.Even(), y.Even()) + axpyNormCuda(a, x.Odd(), y.Odd()); 02655 02656 cudaBindTexture(0, texNorm1, x.norm, 
x.bytes/(x.nColor*x.nSpin)); 02657 cudaBindTexture(0, texNorm2, y.norm, x.bytes/(x.nColor*x.nSpin)); 02658 blas_quda_bytes += 3*x.volume*sizeof(float); 02659 02660 if (x.nSpin == 4){ //wilson 02661 cudaBindTexture(0, texHalf1, x.v, x.bytes); 02662 cudaBindTexture(0, texHalf2, y.v, x.bytes); 02663 return axpyNormHCuda((float)a, (short4*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision); 02664 }else if (x.nSpin == 1){ //staggered 02665 cudaBindTexture(0, texHalfSt1, x.v, x.bytes); 02666 cudaBindTexture(0, texHalfSt2, y.v, x.bytes); 02667 return axpyNormHCuda((float)a, (short2*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision); 02668 }else{ 02669 errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin); 02670 return 0; 02671 } 02672 } 02673 02674 } 02675 02676 // 02677 // double xmyNormCuda(float a, float *x, float *y, n){} 02678 // 02679 // First performs the operation y[i] = x[i] - y[i] 02680 // Second returns the norm of y 02681 // 02682 02683 template <unsigned int reduce_threads, typename Float> 02684 #define REDUCE_FUNC_NAME(suffix) xmyNormF##suffix 02685 #define REDUCE_TYPES Float *x, Float *y 02686 #define REDUCE_PARAMS x, y 02687 #define REDUCE_AUXILIARY(i) y[i] = x[i] - y[i] 02688 #define REDUCE_OPERATION(i) (y[i]*y[i]) 02689 #include "reduce_core.h" 02690 #undef REDUCE_FUNC_NAME 02691 #undef REDUCE_TYPES 02692 #undef REDUCE_PARAMS 02693 #undef REDUCE_AUXILIARY 02694 #undef REDUCE_OPERATION 02695 02696 template <unsigned int reduce_threads, typename Float> 02697 #define REDUCE_FUNC_NAME(suffix) xmyNormH##suffix 02698 #define REDUCE_TYPES Float *d1, Float *d2, short4 *yH, float *yN, int stride 02699 #define REDUCE_PARAMS d1, d2, yH, yN, stride 02700 #define REDUCE_AUXILIARY(i) \ 02701 RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride); \ 02702 RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride); \ 02703 XMY_FLOAT4(x0, y0); \ 02704 REAL_DOT_FLOAT4(norm0, y0, y0); \ 02705 XMY_FLOAT4(x1, y1); \ 02706 REAL_DOT_FLOAT4(norm1, y1, 
y1); \ 02707 XMY_FLOAT4(x2, y2); \ 02708 REAL_DOT_FLOAT4(norm2, y2, y2); \ 02709 XMY_FLOAT4(x3, y3); \ 02710 REAL_DOT_FLOAT4(norm3, y3, y3); \ 02711 XMY_FLOAT4(x4, y4); \ 02712 REAL_DOT_FLOAT4(norm4, y4, y4); \ 02713 XMY_FLOAT4(x5, y5); \ 02714 REAL_DOT_FLOAT4(norm5, y5, y5); \ 02715 norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4; \ 02716 CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride); 02717 #define REDUCE_OPERATION(i) (norm0) 02718 #include "reduce_core.h" 02719 #undef REDUCE_FUNC_NAME 02720 #undef REDUCE_TYPES 02721 #undef REDUCE_PARAMS 02722 #undef REDUCE_AUXILIARY 02723 #undef REDUCE_OPERATION 02724 02725 template <unsigned int reduce_threads, typename Float> 02726 #define REDUCE_FUNC_NAME(suffix) xmyNormH##suffix 02727 #define REDUCE_TYPES Float *d1, Float *d2, short2 *yH, float *yN, int stride 02728 #define REDUCE_PARAMS d1, d2, yH, yN, stride 02729 #define REDUCE_AUXILIARY(i) \ 02730 RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride); \ 02731 RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride); \ 02732 XMY_FLOAT2(x0, y0); \ 02733 REAL_DOT_FLOAT2(norm0, y0, y0); \ 02734 XMY_FLOAT2(x1, y1); \ 02735 REAL_DOT_FLOAT2(norm1, y1, y1); \ 02736 XMY_FLOAT2(x2, y2); \ 02737 REAL_DOT_FLOAT2(norm2, y2, y2); \ 02738 norm0 += norm1; norm0 += norm2; \ 02739 CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride); 02740 #define REDUCE_OPERATION(i) (norm0) 02741 #include "reduce_core.h" 02742 #undef REDUCE_FUNC_NAME 02743 #undef REDUCE_TYPES 02744 #undef REDUCE_PARAMS 02745 #undef REDUCE_AUXILIARY 02746 #undef REDUCE_OPERATION 02747 02748 02749 double xmyNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) { 02750 const int id = 18; 02751 blas_quda_flops += 3*x.real_length; 02752 checkSpinor(x,y); 02753 blas_quda_bytes += 3*x.real_length*x.precision; 02754 if (x.precision == QUDA_DOUBLE_PRECISION) { 02755 return xmyNormFCuda((double*)x.v, (double*)y.v, x.length, id, x.precision); 02756 } else if (x.precision == 
QUDA_SINGLE_PRECISION) {
    return xmyNormFCuda((float*)x.v, (float*)y.v, x.length, id, x.precision);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET)
      return xmyNormCuda(x.Even(), y.Even()) + xmyNormCuda(x.Odd(), y.Odd());

    cudaBindTexture(0, texNorm1, x.norm, x.bytes/(x.nColor*x.nSpin));
    cudaBindTexture(0, texNorm2, y.norm, x.bytes/(x.nColor*x.nSpin));
    blas_quda_bytes += 3*x.volume*sizeof(float);

    if (x.nSpin ==4 ){ //wilson
      cudaBindTexture(0, texHalf1, x.v, x.bytes);
      cudaBindTexture(0, texHalf2, y.v, x.bytes);
      return xmyNormHCuda((char*)0, (char*)0, (short4*)y.v, (float*)y.norm, y.stride, y.volume, id, x.precision);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, x.bytes);
      cudaBindTexture(0, texHalfSt2, y.v, x.bytes);
      return xmyNormHCuda((char*)0, (char*)0, (short2*)y.v, (float*)y.norm, y.stride, y.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);
      // Fix: this double-returning function previously fell off the end
      // here (undefined behavior if errorQuda ever returns); match the
      // `return 0;` convention of the sibling dispatchers (e.g. normCuda).
      return 0;
    }
  }

}


//
// double2 cDotProductCuda(float2 *x, float2 *y, int n) {}
//
// Double-precision complex dot product: per element, reads x[i] and y[i]
// through the double2 textures and reduces conj(x)*y — the real pass
// accumulates a.x*b.x + a.y*b.y, the imaginary pass a.x*b.y - a.y*b.x.
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductD##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y, Float c
#define REDUCE_PARAMS x, y, c
#define REDUCE_REAL_AUXILIARY(i) Float2 a = READ_DOUBLE2_TEXTURE(x, i);
#define REDUCE_IMAG_AUXILIARY(i) Float2 b = READ_DOUBLE2_TEXTURE(y, i);
#define REDUCE_REAL_OPERATION(i) (a.x*b.x + a.y*b.y)
#define REDUCE_IMAG_OPERATION(i) (a.x*b.y - a.y*b.x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION

template <unsigned int reduce_threads, typename Float,
typename Float2> 02803 #define REDUCE_FUNC_NAME(suffix) cDotProductS##suffix 02804 #define REDUCE_TYPES Float2 *x, Float2 *y, Float c 02805 #define REDUCE_PARAMS x, y, c 02806 #define REDUCE_REAL_AUXILIARY(i) Float2 a = read_Float2(x, i); 02807 #define REDUCE_IMAG_AUXILIARY(i) Float2 b = read_Float2(y, i); 02808 #define REDUCE_REAL_OPERATION(i) (a.x*b.x + a.y*b.y) 02809 #define REDUCE_IMAG_OPERATION(i) (a.x*b.y - a.y*b.x) 02810 #include "reduce_complex_core.h" 02811 #undef REDUCE_FUNC_NAME 02812 #undef REDUCE_TYPES 02813 #undef REDUCE_PARAMS 02814 #undef REDUCE_REAL_AUXILIARY 02815 #undef REDUCE_IMAG_AUXILIARY 02816 #undef REDUCE_REAL_OPERATION 02817 #undef REDUCE_IMAG_OPERATION 02818 02819 template <unsigned int reduce_threads, typename Float, typename Float2> 02820 #define REDUCE_FUNC_NAME(suffix) cDotProductH##suffix 02821 #define REDUCE_TYPES Float *aN, Float2 *bN, int stride 02822 #define REDUCE_PARAMS aN, bN, stride 02823 #define REDUCE_REAL_AUXILIARY(i) \ 02824 READ_HALF_SPINOR(a, texHalf1, stride); \ 02825 READ_HALF_SPINOR(b, texHalf2, stride); \ 02826 REAL_DOT_FLOAT4(rdot0, a0, b0); \ 02827 REAL_DOT_FLOAT4(rdot1, a1, b1); \ 02828 REAL_DOT_FLOAT4(rdot2, a2, b2); \ 02829 REAL_DOT_FLOAT4(rdot3, a3, b3); \ 02830 REAL_DOT_FLOAT4(rdot4, a4, b4); \ 02831 REAL_DOT_FLOAT4(rdot5, a5, b5); \ 02832 rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4; 02833 #define REDUCE_IMAG_AUXILIARY(i) \ 02834 IMAG_DOT_FLOAT4(idot0, a0, b0); \ 02835 IMAG_DOT_FLOAT4(idot1, a1, b1); \ 02836 IMAG_DOT_FLOAT4(idot2, a2, b2); \ 02837 IMAG_DOT_FLOAT4(idot3, a3, b3); \ 02838 IMAG_DOT_FLOAT4(idot4, a4, b4); \ 02839 IMAG_DOT_FLOAT4(idot5, a5, b5); \ 02840 idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4; 02841 #define REDUCE_REAL_OPERATION(i) (ac*bc*rdot0) 02842 #define REDUCE_IMAG_OPERATION(i) (ac*bc*idot0) 02843 #include "reduce_complex_core.h" 02844 #undef REDUCE_FUNC_NAME 02845 #undef REDUCE_TYPES 02846 #undef REDUCE_PARAMS 02847 
#undef REDUCE_REAL_AUXILIARY 02848 #undef REDUCE_IMAG_AUXILIARY 02849 #undef REDUCE_REAL_OPERATION 02850 #undef REDUCE_IMAG_OPERATION 02851 02852 template <unsigned int reduce_threads, typename Float, typename Float2> 02853 #define REDUCE_FUNC_NAME(suffix) cDotProductHSt##suffix 02854 #define REDUCE_TYPES Float *aN, Float2 *bN, int stride 02855 #define REDUCE_PARAMS aN, bN, stride 02856 #define REDUCE_REAL_AUXILIARY(i) \ 02857 READ_HALF_SPINOR_ST(a, texHalfSt1, stride); \ 02858 READ_HALF_SPINOR_ST(b, texHalfSt2, stride); \ 02859 REAL_DOT_FLOAT2(rdot0, a0, b0); \ 02860 REAL_DOT_FLOAT2(rdot1, a1, b1); \ 02861 REAL_DOT_FLOAT2(rdot2, a2, b2); \ 02862 rdot0 += rdot1; rdot0 += rdot2; 02863 #define REDUCE_IMAG_AUXILIARY(i) \ 02864 IMAG_DOT_FLOAT2(idot0, a0, b0); \ 02865 IMAG_DOT_FLOAT2(idot1, a1, b1); \ 02866 IMAG_DOT_FLOAT2(idot2, a2, b2); \ 02867 idot0 += idot1; idot0 += idot2; 02868 #define REDUCE_REAL_OPERATION(i) (ac*bc*rdot0) 02869 #define REDUCE_IMAG_OPERATION(i) (ac*bc*idot0) 02870 #include "reduce_complex_core.h" 02871 #undef REDUCE_FUNC_NAME 02872 #undef REDUCE_TYPES 02873 #undef REDUCE_PARAMS 02874 #undef REDUCE_REAL_AUXILIARY 02875 #undef REDUCE_IMAG_AUXILIARY 02876 #undef REDUCE_REAL_OPERATION 02877 #undef REDUCE_IMAG_OPERATION 02878 02879 Complex cDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) { 02880 const int id = 19; 02881 blas_quda_flops += 4*x.real_length; 02882 checkSpinor(x,y); 02883 int length = x.length/2; 02884 blas_quda_bytes += 2*x.real_length*x.precision; 02885 double2 dot; 02886 if (x.precision == QUDA_DOUBLE_PRECISION) { 02887 char c = 0; 02888 int spinor_bytes = x.length*sizeof(double); 02889 cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 02890 cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 02891 dot = cDotProductDCuda((double2*)x.v, (double2*)y.v, c, length, id, x.precision); 02892 } else if (x.precision == QUDA_SINGLE_PRECISION) { 02893 char c = 0; 02894 int spinor_bytes = x.length*sizeof(float); 02895 
cudaBindTexture(0, xTexSingle2, x.v, spinor_bytes); 02896 cudaBindTexture(0, yTexSingle2, y.v, spinor_bytes); 02897 dot = cDotProductSCuda((float2*)x.v, (float2*)y.v, c, length, id, x.precision); 02898 } else { 02899 if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 02900 return cDotProductCuda(x.Even(), y.Even()) + cDotProductCuda(x.Odd(), y.Odd()); 02901 int spinor_bytes = x.length*sizeof(short); 02902 blas_quda_bytes += 2*x.volume*sizeof(float); 02903 if (x.nSpin == 4){ 02904 cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 02905 cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); 02906 cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 02907 cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12); 02908 dot = cDotProductHCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision); 02909 } else if (x.nSpin == 1){ 02910 cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 02911 cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3); 02912 cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 02913 cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3); 02914 dot = cDotProductHStCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision); 02915 }else{ 02916 errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin); 02917 } 02918 } 02919 02920 return Complex(dot.x, dot.y); 02921 } 02922 02923 // 02924 // double2 xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int n) {} 02925 // 02926 // First performs the operation y = x + a*y 02927 // Second returns complex dot product (z,y) 02928 // 02929 02930 template <unsigned int reduce_threads, typename Float, typename Float2> 02931 #define REDUCE_FUNC_NAME(suffix) xpaycDotzyD##suffix 02932 #define REDUCE_TYPES Float2 *x, Float a, Float2 *y, Float2 *z 02933 #define REDUCE_PARAMS x, a, y, z 02934 #define REDUCE_REAL_AUXILIARY(i) \ 02935 Float2 X = READ_DOUBLE2_TEXTURE(x, i); \ 02936 Float2 Y = READ_DOUBLE2_TEXTURE(y, i); \ 02937 Float2 Z = READ_DOUBLE2_TEXTURE(z, i); 02938 #define 
REDUCE_IMAG_AUXILIARY(i) y[i].x = X.x + a*Y.x; y[i].y = X.y + a*Y.y 02939 #define REDUCE_REAL_OPERATION(i) (Z.x*y[i].x + Z.y*y[i].y) 02940 #define REDUCE_IMAG_OPERATION(i) (Z.x*y[i].y - Z.y*y[i].x) 02941 #include "reduce_complex_core.h" 02942 #undef REDUCE_FUNC_NAME 02943 #undef REDUCE_TYPES 02944 #undef REDUCE_PARAMS 02945 #undef REDUCE_REAL_AUXILIARY 02946 #undef REDUCE_IMAG_AUXILIARY 02947 #undef REDUCE_REAL_OPERATION 02948 #undef REDUCE_IMAG_OPERATION 02949 02950 template <unsigned int reduce_threads, typename Float, typename Float2> 02951 #define REDUCE_FUNC_NAME(suffix) xpaycDotzyS##suffix 02952 #define REDUCE_TYPES Float2 *x, Float a, Float2 *y, Float2 *z 02953 #define REDUCE_PARAMS x, a, y, z 02954 #define REDUCE_REAL_AUXILIARY(i) y[i].x = x[i].x + a*y[i].x 02955 #define REDUCE_IMAG_AUXILIARY(i) y[i].y = x[i].y + a*y[i].y 02956 #define REDUCE_REAL_OPERATION(i) (z[i].x*y[i].x + z[i].y*y[i].y) 02957 #define REDUCE_IMAG_OPERATION(i) (z[i].x*y[i].y - z[i].y*y[i].x) 02958 #include "reduce_complex_core.h" 02959 #undef REDUCE_FUNC_NAME 02960 #undef REDUCE_TYPES 02961 #undef REDUCE_PARAMS 02962 #undef REDUCE_REAL_AUXILIARY 02963 #undef REDUCE_IMAG_AUXILIARY 02964 #undef REDUCE_REAL_OPERATION 02965 #undef REDUCE_IMAG_OPERATION 02966 02967 template <unsigned int reduce_threads, typename Float, typename Float2> 02968 #define REDUCE_FUNC_NAME(suffix) xpaycDotzyH##suffix 02969 #define REDUCE_TYPES Float a, short4 *yH, Float2 *yN, int stride 02970 #define REDUCE_PARAMS a, yH, yN, stride 02971 #define REDUCE_REAL_AUXILIARY(i) \ 02972 RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride); \ 02973 RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride); \ 02974 RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride); \ 02975 XPAY_FLOAT4(x0, a, y0); \ 02976 XPAY_FLOAT4(x1, a, y1); \ 02977 XPAY_FLOAT4(x2, a, y2); \ 02978 XPAY_FLOAT4(x3, a, y3); \ 02979 XPAY_FLOAT4(x4, a, y4); \ 02980 XPAY_FLOAT4(x5, a, y5); \ 02981 REAL_DOT_FLOAT4(rdot0, z0, y0); \ 02982 REAL_DOT_FLOAT4(rdot1, 
z1, y1); \ 02983 REAL_DOT_FLOAT4(rdot2, z2, y2); \ 02984 REAL_DOT_FLOAT4(rdot3, z3, y3); \ 02985 REAL_DOT_FLOAT4(rdot4, z4, y4); \ 02986 REAL_DOT_FLOAT4(rdot5, z5, y5); \ 02987 rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4; 02988 #define REDUCE_IMAG_AUXILIARY(i) \ 02989 IMAG_DOT_FLOAT4(idot0, z0, y0); \ 02990 IMAG_DOT_FLOAT4(idot1, z1, y1); \ 02991 IMAG_DOT_FLOAT4(idot2, z2, y2); \ 02992 IMAG_DOT_FLOAT4(idot3, z3, y3); \ 02993 IMAG_DOT_FLOAT4(idot4, z4, y4); \ 02994 IMAG_DOT_FLOAT4(idot5, z5, y5); \ 02995 idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4; \ 02996 CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride); 02997 #define REDUCE_REAL_OPERATION(i) (rdot0) 02998 #define REDUCE_IMAG_OPERATION(i) (idot0) 02999 #include "reduce_complex_core.h" 03000 #undef REDUCE_FUNC_NAME 03001 #undef REDUCE_TYPES 03002 #undef REDUCE_PARAMS 03003 #undef REDUCE_REAL_AUXILIARY 03004 #undef REDUCE_IMAG_AUXILIARY 03005 #undef REDUCE_REAL_OPERATION 03006 #undef REDUCE_IMAG_OPERATION 03007 03008 template <unsigned int reduce_threads, typename Float, typename Float2> 03009 #define REDUCE_FUNC_NAME(suffix) xpaycDotzyH##suffix 03010 #define REDUCE_TYPES Float a, short2 *yH, Float2 *yN, int stride 03011 #define REDUCE_PARAMS a, yH, yN, stride 03012 #define REDUCE_REAL_AUXILIARY(i) \ 03013 RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride); \ 03014 RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride); \ 03015 RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride); \ 03016 XPAY_FLOAT2(x0, a, y0); \ 03017 XPAY_FLOAT2(x1, a, y1); \ 03018 XPAY_FLOAT2(x2, a, y2); \ 03019 REAL_DOT_FLOAT2(rdot0, z0, y0); \ 03020 REAL_DOT_FLOAT2(rdot1, z1, y1); \ 03021 REAL_DOT_FLOAT2(rdot2, z2, y2); \ 03022 rdot0 += rdot1; rdot0 += rdot2; 03023 #define REDUCE_IMAG_AUXILIARY(i) \ 03024 IMAG_DOT_FLOAT2(idot0, z0, y0); \ 03025 IMAG_DOT_FLOAT2(idot1, z1, y1); \ 03026 IMAG_DOT_FLOAT2(idot2, z2, y2); \ 03027 idot0 += idot1; idot0 += 
idot2; \ 03028 CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride); 03029 #define REDUCE_REAL_OPERATION(i) (rdot0) 03030 #define REDUCE_IMAG_OPERATION(i) (idot0) 03031 #include "reduce_complex_core.h" 03032 #undef REDUCE_FUNC_NAME 03033 #undef REDUCE_TYPES 03034 #undef REDUCE_PARAMS 03035 #undef REDUCE_REAL_AUXILIARY 03036 #undef REDUCE_IMAG_AUXILIARY 03037 #undef REDUCE_REAL_OPERATION 03038 #undef REDUCE_IMAG_OPERATION 03039 03040 Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y, cudaColorSpinorField &z) { 03041 const int id = 20; 03042 blas_quda_flops += 6*x.real_length; 03043 checkSpinor(x,y); 03044 checkSpinor(x,z); 03045 int length = x.length/2; 03046 blas_quda_bytes += 4*x.real_length*x.precision; 03047 double2 dot; 03048 if (x.precision == QUDA_DOUBLE_PRECISION) { 03049 int spinor_bytes = x.length*sizeof(double); 03050 cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 03051 cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 03052 cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes); 03053 dot = xpaycDotzyDCuda((double2*)x.v, a, (double2*)y.v, (double2*)z.v, length, id, x.precision); 03054 } else if (x.precision == QUDA_SINGLE_PRECISION) { 03055 dot = xpaycDotzySCuda((float2*)x.v, (float)a, (float2*)y.v, (float2*)z.v, length, id, x.precision); 03056 } else { 03057 if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 03058 return xpaycDotzyCuda(x.Even(), a, y.Even(), z.Even()) + xpaycDotzyCuda(x.Odd(), a, y.Odd(), z.Odd()); 03059 int spinor_bytes = x.length*sizeof(short); 03060 blas_quda_bytes += 4*x.volume*sizeof(float); 03061 if (x.nSpin ==4 ){//wilson 03062 cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 03063 cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); 03064 cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 03065 cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12); 03066 cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 03067 cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12); 03068 dot = 
xpaycDotzyHCuda((float)a, (short4*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin ==1 ){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);
      dot = xpaycDotzyHCuda((float)a, (short2*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      // NOTE(review): errorQuda presumably aborts; otherwise `dot` is
      // returned uninitialized below — confirm errorQuda's behavior.
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);
    }
  }
  return Complex(dot.x, dot.y);
}


//
// double3 cDotProductNormACuda(float2 *a, float2 *b, int n) {}
//
// Double-precision fused kernel: one pass computes the complex dot
// product (x,y) in the .x/.y components and the squared norm of x
// (a.x*a.x + a.y*a.y summed) in the .z component of the double3 result.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAD##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y
#define REDUCE_PARAMS x, y
#define REDUCE_X_AUXILIARY(i) Float2 a = READ_DOUBLE2_TEXTURE(x, i);
#define REDUCE_Y_AUXILIARY(i) Float2 b = READ_DOUBLE2_TEXTURE(y, i);
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a.x*b.x + a.y*b.y)
#define REDUCE_Y_OPERATION(i) (a.x*b.y - a.y*b.x)
#define REDUCE_Z_OPERATION(i) (a.x*a.x + a.y*a.y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

// Single-precision variant of the fused dot-product + norm(x) reduction.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAS##suffix
#define REDUCE_TYPES Float2 *a, Float2 *b
#define REDUCE_PARAMS a, b
#define REDUCE_X_AUXILIARY(i)
#define
REDUCE_Y_AUXILIARY(i)
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
#define REDUCE_Z_OPERATION(i) (a[i].x*a[i].x + a[i].y*a[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

// Half-precision Wilson fused kernel: reads half spinors x and y (6 float4
// components each, norm scales xc/yc) and reduces the complex dot product
// (x,y) plus the squared norm of x, each rescaled by the stored norms.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAH##suffix
#define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride
#define REDUCE_PARAMS xN, yN, stride
#define REDUCE_X_AUXILIARY(i) \
  READ_HALF_SPINOR(x, texHalf1, stride); \
  READ_HALF_SPINOR(y, texHalf2, stride); \
  REAL_DOT_FLOAT4(norm0, x0, x0); \
  REAL_DOT_FLOAT4(norm1, x1, x1); \
  REAL_DOT_FLOAT4(norm2, x2, x2); \
  REAL_DOT_FLOAT4(norm3, x3, x3); \
  REAL_DOT_FLOAT4(norm4, x4, x4); \
  REAL_DOT_FLOAT4(norm5, x5, x5); \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4;
#define REDUCE_Y_AUXILIARY(i) \
  REAL_DOT_FLOAT4(rdot0, x0, y0); \
  REAL_DOT_FLOAT4(rdot1, x1, y1); \
  REAL_DOT_FLOAT4(rdot2, x2, y2); \
  REAL_DOT_FLOAT4(rdot3, x3, y3); \
  REAL_DOT_FLOAT4(rdot4, x4, y4); \
  REAL_DOT_FLOAT4(rdot5, x5, y5); \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_Z_AUXILIARY(i) \
  IMAG_DOT_FLOAT4(idot0, x0, y0); \
  IMAG_DOT_FLOAT4(idot1, x1, y1); \
  IMAG_DOT_FLOAT4(idot2, x2, y2); \
  IMAG_DOT_FLOAT4(idot3, x3, y3); \
  IMAG_DOT_FLOAT4(idot4, x4, y4); \
  IMAG_DOT_FLOAT4(idot5, x5, y5); \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;
03160 #define REDUCE_X_OPERATION(i) (xc*yc*rdot0) 03161 #define REDUCE_Y_OPERATION(i) (xc*yc*idot0) 03162 #define REDUCE_Z_OPERATION(i) (xc*xc*norm0) 03163 #include "reduce_triple_core.h" 03164 #undef REDUCE_FUNC_NAME 03165 #undef REDUCE_TYPES 03166 #undef REDUCE_PARAMS 03167 #undef REDUCE_X_AUXILIARY 03168 #undef REDUCE_Y_AUXILIARY 03169 #undef REDUCE_Z_AUXILIARY 03170 #undef REDUCE_X_OPERATION 03171 #undef REDUCE_Y_OPERATION 03172 #undef REDUCE_Z_OPERATION 03173 03174 template <unsigned int reduce_threads, typename Float2> 03175 #define REDUCE_FUNC_NAME(suffix) cDotProductNormAHSt##suffix 03176 #define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride 03177 #define REDUCE_PARAMS xN, yN, stride 03178 #define REDUCE_X_AUXILIARY(i) \ 03179 READ_HALF_SPINOR_ST(x, texHalfSt1, stride); \ 03180 READ_HALF_SPINOR_ST(y, texHalfSt2, stride); \ 03181 REAL_DOT_FLOAT2(norm0, x0, x0); \ 03182 REAL_DOT_FLOAT2(norm1, x1, x1); \ 03183 REAL_DOT_FLOAT2(norm2, x2, x2); \ 03184 norm0 += norm1; norm0 += norm2; 03185 #define REDUCE_Y_AUXILIARY(i) \ 03186 REAL_DOT_FLOAT2(rdot0, x0, y0); \ 03187 REAL_DOT_FLOAT2(rdot1, x1, y1); \ 03188 REAL_DOT_FLOAT2(rdot2, x2, y2); \ 03189 rdot0 += rdot1; rdot0 += rdot2; 03190 #define REDUCE_Z_AUXILIARY(i) \ 03191 IMAG_DOT_FLOAT2(idot0, x0, y0); \ 03192 IMAG_DOT_FLOAT2(idot1, x1, y1); \ 03193 IMAG_DOT_FLOAT2(idot2, x2, y2); \ 03194 idot0 += idot1; idot0 += idot2; 03195 #define REDUCE_X_OPERATION(i) (xc*yc*rdot0) 03196 #define REDUCE_Y_OPERATION(i) (xc*yc*idot0) 03197 #define REDUCE_Z_OPERATION(i) (xc*xc*norm0) 03198 #include "reduce_triple_core.h" 03199 #undef REDUCE_FUNC_NAME 03200 #undef REDUCE_TYPES 03201 #undef REDUCE_PARAMS 03202 #undef REDUCE_X_AUXILIARY 03203 #undef REDUCE_Y_AUXILIARY 03204 #undef REDUCE_Z_AUXILIARY 03205 #undef REDUCE_X_OPERATION 03206 #undef REDUCE_Y_OPERATION 03207 #undef REDUCE_Z_OPERATION 03208 03209 double3 cDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) { 03210 const int id = 21; 03211 blas_quda_flops 
+= 6*x.real_length;
  checkSpinor(x,y);
  int length = x.length/2;
  blas_quda_bytes += 2*x.real_length*x.precision;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes);
    return cDotProductNormADCuda((double2*)x.v, (double2*)y.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    return cDotProductNormASCuda((float2*)x.v, (float2*)y.v, length, id, x.precision);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET)
      return cDotProductNormACuda(x.Even(), y.Even()) + cDotProductNormACuda(x.Odd(), y.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 2*x.volume*sizeof(float);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      return cDotProductNormAHCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      return cDotProductNormAHStCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);
      // Fix: this double3-returning function previously fell off the end
      // here (undefined behavior if errorQuda ever returns).
      return make_double3(0.0, 0.0, 0.0);
    }
  }

}

//
// double3 cDotProductNormBCuda(float2 *a, float2 *b, int n) {}
//
// Double-precision fused kernel: complex dot product (x,y) in .x/.y plus
// the squared norm of y (b.x*b.x + b.y*b.y summed) in .z.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormBD##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y
#define
REDUCE_PARAMS x, y 03253 #define REDUCE_X_AUXILIARY(i) Float2 a = READ_DOUBLE2_TEXTURE(x, i); 03254 #define REDUCE_Y_AUXILIARY(i) Float2 b = READ_DOUBLE2_TEXTURE(y, i); 03255 #define REDUCE_Z_AUXILIARY(i) 03256 #define REDUCE_X_OPERATION(i) (a.x*b.x + a.y*b.y) 03257 #define REDUCE_Y_OPERATION(i) (a.x*b.y - a.y*b.x) 03258 #define REDUCE_Z_OPERATION(i) (b.x*b.x + b.y*b.y) 03259 #include "reduce_triple_core.h" 03260 #undef REDUCE_FUNC_NAME 03261 #undef REDUCE_TYPES 03262 #undef REDUCE_PARAMS 03263 #undef REDUCE_X_AUXILIARY 03264 #undef REDUCE_Y_AUXILIARY 03265 #undef REDUCE_Z_AUXILIARY 03266 #undef REDUCE_X_OPERATION 03267 #undef REDUCE_Y_OPERATION 03268 #undef REDUCE_Z_OPERATION 03269 03270 template <unsigned int reduce_threads, typename Float2> 03271 #define REDUCE_FUNC_NAME(suffix) cDotProductNormBS##suffix 03272 #define REDUCE_TYPES Float2 *a, Float2 *b 03273 #define REDUCE_PARAMS a, b 03274 #define REDUCE_X_AUXILIARY(i) 03275 #define REDUCE_Y_AUXILIARY(i) 03276 #define REDUCE_Z_AUXILIARY(i) 03277 #define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y) 03278 #define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x) 03279 #define REDUCE_Z_OPERATION(i) (b[i].x*b[i].x + b[i].y*b[i].y) 03280 #include "reduce_triple_core.h" 03281 #undef REDUCE_FUNC_NAME 03282 #undef REDUCE_TYPES 03283 #undef REDUCE_PARAMS 03284 #undef REDUCE_X_AUXILIARY 03285 #undef REDUCE_Y_AUXILIARY 03286 #undef REDUCE_Z_AUXILIARY 03287 #undef REDUCE_X_OPERATION 03288 #undef REDUCE_Y_OPERATION 03289 #undef REDUCE_Z_OPERATION 03290 03291 template <unsigned int reduce_threads, typename Float2> 03292 #define REDUCE_FUNC_NAME(suffix) cDotProductNormBH##suffix 03293 #define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride 03294 #define REDUCE_PARAMS xN, yN, stride 03295 #define REDUCE_X_AUXILIARY(i) \ 03296 READ_HALF_SPINOR(x, texHalf1, stride); \ 03297 READ_HALF_SPINOR(y, texHalf2, stride); \ 03298 REAL_DOT_FLOAT4(norm0, y0, y0); \ 03299 REAL_DOT_FLOAT4(norm1, y1, y1); \ 03300 
REAL_DOT_FLOAT4(norm2, y2, y2); \
  REAL_DOT_FLOAT4(norm3, y3, y3); \
  REAL_DOT_FLOAT4(norm4, y4, y4); \
  REAL_DOT_FLOAT4(norm5, y5, y5); \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4;
#define REDUCE_Y_AUXILIARY(i) \
  REAL_DOT_FLOAT4(rdot0, x0, y0); \
  REAL_DOT_FLOAT4(rdot1, x1, y1); \
  REAL_DOT_FLOAT4(rdot2, x2, y2); \
  REAL_DOT_FLOAT4(rdot3, x3, y3); \
  REAL_DOT_FLOAT4(rdot4, x4, y4); \
  REAL_DOT_FLOAT4(rdot5, x5, y5); \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_Z_AUXILIARY(i) \
  IMAG_DOT_FLOAT4(idot0, x0, y0); \
  IMAG_DOT_FLOAT4(idot1, x1, y1); \
  IMAG_DOT_FLOAT4(idot2, x2, y2); \
  IMAG_DOT_FLOAT4(idot3, x3, y3); \
  IMAG_DOT_FLOAT4(idot4, x4, y4); \
  IMAG_DOT_FLOAT4(idot5, x5, y5); \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;
// The partial sums are rescaled by the per-site factors xc/yc from READ_HALF_SPINOR.
#define REDUCE_X_OPERATION(i) (xc*yc*rdot0)
#define REDUCE_Y_OPERATION(i) (xc*yc*idot0)
#define REDUCE_Z_OPERATION(i) (yc*yc*norm0)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

// Half precision (staggered, nSpin=1): three float2 per site (x0..x2 / y0..y2).
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormBHSt##suffix
#define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride
#define REDUCE_PARAMS xN, yN, stride
#define REDUCE_X_AUXILIARY(i) \
  READ_HALF_SPINOR_ST(x, texHalfSt1, stride); \
  READ_HALF_SPINOR_ST(y, texHalfSt2, stride); \
  REAL_DOT_FLOAT2(norm0, y0, y0); \
  REAL_DOT_FLOAT2(norm1, y1, y1); \
  REAL_DOT_FLOAT2(norm2, y2, y2); \
  norm0 += norm1; norm0 += norm2;
#define REDUCE_Y_AUXILIARY(i) \
  REAL_DOT_FLOAT2(rdot0, x0, y0); \
  REAL_DOT_FLOAT2(rdot1, x1, y1); \
  REAL_DOT_FLOAT2(rdot2, x2, y2); \
  rdot0 += rdot1; rdot0 += rdot2;
#define REDUCE_Z_AUXILIARY(i) \
  IMAG_DOT_FLOAT2(idot0, x0, y0); \
  IMAG_DOT_FLOAT2(idot1, x1, y1); \
  IMAG_DOT_FLOAT2(idot2, x2, y2); \
  idot0 += idot1; idot0 += idot2;
#define REDUCE_X_OPERATION(i) (xc*yc*rdot0)
#define REDUCE_Y_OPERATION(i) (xc*yc*idot0)
#define REDUCE_Z_OPERATION(i) (yc*yc*norm0)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

// Host entry: returns (Re((x,y)), Im((x,y)), ||y||^2) in one fused reduction,
// dispatching on field precision (double / single / half).
double3 cDotProductNormBCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int id = 22; // kernel id; presumably indexes the tuning tables in blas_param.h -- confirm
  blas_quda_flops += 6*x.real_length;
  checkSpinor(x,y);
  int length = x.length/2; // number of complex (two-real) elements
  blas_quda_bytes += 2*x.real_length*x.precision;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes);
    return cDotProductNormBDCuda((double2*)x.v, (double2*)y.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    return cDotProductNormBSCuda((float2*)x.v, (float2*)y.v, length, id, x.precision);
  } else {
    // Half precision: a full field is reduced as the sum over its parity halves.
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET)
      return cDotProductNormBCuda(x.Even(), y.Even()) + cDotProductNormBCuda(x.Odd(), y.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 2*x.volume*sizeof(float); // account for the per-site float norm arrays
    if (x.nSpin == 4){ // wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0,
texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      return cDotProductNormBHCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin == 1){ // staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      return cDotProductNormBHStCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);
    }
  }

}


//
// double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y,
//                                            float2 *z, float2 *w, float2 *u, int len)
//
// Fused update + triple reduction: z += a*x + b*y, y -= b*w, then
//   x-component: Re((u,y)),  y-component: Im((u,y)),  z-component: (y,y).
// Generated per precision via multi-inclusion of reduce_triple_core.h.

// Double precision: operands streamed through textures; z read/written directly.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYD##suffix
#define REDUCE_TYPES Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, Float2 *u
#define REDUCE_PARAMS a, x, b, y, z, w, u
#define REDUCE_X_AUXILIARY(i) \
  Float2 X = READ_DOUBLE2_TEXTURE(x, i); \
  Float2 Y = READ_DOUBLE2_TEXTURE(y, i); \
  Float2 W = READ_DOUBLE2_TEXTURE(w, i);
#define REDUCE_Y_AUXILIARY(i) \
  Float2 Z = read_Float2(z, i); \
  Z.x += a.x*X.x - a.y*X.y; \
  Z.y += a.y*X.x + a.x*X.y; \
  Z.x += b.x*Y.x - b.y*Y.y; \
  Z.y += b.y*Y.x + b.x*Y.y; \
  Y.x -= b.x*W.x - b.y*W.y; \
  Y.y -= b.y*W.x + b.x*W.y;
#define REDUCE_Z_AUXILIARY(i) \
  z[i] = make_Float2(Z); \
  y[i] = make_Float2(Y);
#define REDUCE_X_OPERATION(i) (u[i].x*y[i].x + u[i].y*y[i].y)
#define REDUCE_Y_OPERATION(i) (u[i].x*y[i].y - u[i].y*y[i].x)
#define REDUCE_Z_OPERATION(i) (y[i].x*y[i].x + y[i].y*y[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

// Single precision: same update with direct loads instead of texture reads.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYS##suffix
#define REDUCE_TYPES Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, Float2 *u
#define REDUCE_PARAMS a, x, b, y, z, w, u
#define REDUCE_X_AUXILIARY(i) \
  Float2 X = read_Float2(x, i); \
  Float2 Y = read_Float2(y, i); \
  Float2 W = read_Float2(w, i);
#define REDUCE_Y_AUXILIARY(i) \
  Float2 Z = read_Float2(z, i); \
  Z.x += a.x*X.x - a.y*X.y; \
  Z.y += a.y*X.x + a.x*X.y; \
  Z.x += b.x*Y.x - b.y*Y.y; \
  Z.y += b.y*Y.x + b.x*Y.y; \
  Y.x -= b.x*W.x - b.y*W.y; \
  Y.y -= b.y*W.x + b.x*W.y;
#define REDUCE_Z_AUXILIARY(i) \
  z[i] = make_Float2(Z); \
  y[i] = make_Float2(Y);
#define REDUCE_X_OPERATION(i) (u[i].x*y[i].x + u[i].y*y[i].y)
#define REDUCE_Y_OPERATION(i) (u[i].x*y[i].y - u[i].y*y[i].x)
#define REDUCE_Z_OPERATION(i) (y[i].x*y[i].x + y[i].y*y[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

//
// double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y,
//                                            float2 *z, float2 *w, float2 *u, int len)
//
// Half precision (Wilson, nSpin=4): reconstruct singles from short4+norm textures,
// apply the fused update, write back truncated y and z, then reduce (u,y) and (y,y).
// Same REDUCE_FUNC_NAME as the short2 block below: the generated functions are
// overloads distinguished by the short4/short2 parameter types.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYH##suffix
#define REDUCE_TYPES Float2 a, Float2 b, short4 *yH, float *yN, short4 *zH, float *zN, float *wN, float *uN, int stride
#define REDUCE_PARAMS a, b, yH, yN, zH, zN, wN, uN, stride
#define REDUCE_X_AUXILIARY(i) \
  RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride); \
  RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride); \
  RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride); \
  CAXPBYPZ_FLOAT4(a, x0, b, y0, z0); \
  CAXPBYPZ_FLOAT4(a, x1, b, y1, z1); \
  CAXPBYPZ_FLOAT4(a, x2, b, y2, z2); \
  CAXPBYPZ_FLOAT4(a, x3, b, y3, z3); \
  CAXPBYPZ_FLOAT4(a, x4, b, y4, z4); \
  CAXPBYPZ_FLOAT4(a, x5, b, y5, z5); \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(zH, zN, z, stride); \
  READ_HALF_SPINOR(w, texHalf4, stride); \
  float2 bwc = -wc*b; \
  CAXPY_FLOAT4(bwc, w0, y0); \
  CAXPY_FLOAT4(bwc, w1, y1); \
  CAXPY_FLOAT4(bwc, w2, y2); \
  CAXPY_FLOAT4(bwc, w3, y3); \
  CAXPY_FLOAT4(bwc, w4, y4); \
  CAXPY_FLOAT4(bwc, w5, y5); \
  REAL_DOT_FLOAT4(norm0, y0, y0); \
  REAL_DOT_FLOAT4(norm1, y1, y1); \
  REAL_DOT_FLOAT4(norm2, y2, y2); \
  REAL_DOT_FLOAT4(norm3, y3, y3); \
  REAL_DOT_FLOAT4(norm4, y4, y4); \
  REAL_DOT_FLOAT4(norm5, y5, y5); \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
#define REDUCE_Y_AUXILIARY(i) \
  READ_HALF_SPINOR(u, texHalf5, stride); \
  REAL_DOT_FLOAT4(rdot0, u0, y0); \
  REAL_DOT_FLOAT4(rdot1, u1, y1); \
  REAL_DOT_FLOAT4(rdot2, u2, y2); \
  REAL_DOT_FLOAT4(rdot3, u3, y3); \
  REAL_DOT_FLOAT4(rdot4, u4, y4); \
  REAL_DOT_FLOAT4(rdot5, u5, y5); \
  IMAG_DOT_FLOAT4(idot0, u0, y0); \
  IMAG_DOT_FLOAT4(idot1, u1, y1); \
  IMAG_DOT_FLOAT4(idot2, u2, y2); \
  IMAG_DOT_FLOAT4(idot3, u3, y3); \
  IMAG_DOT_FLOAT4(idot4, u4, y4); \
  IMAG_DOT_FLOAT4(idot5, u5, y5);
#define REDUCE_Z_AUXILIARY(i) \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4; \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4; \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;

#define REDUCE_X_OPERATION(i) (uc*rdot0)
#define REDUCE_Y_OPERATION(i) (uc*idot0)
#define REDUCE_Z_OPERATION(i) (norm0)

#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION


// Half precision (staggered, nSpin=1): short2-based overload of the same kernel.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYH##suffix
#define REDUCE_TYPES Float2 a, Float2 b, short2 *yH, float *yN, short2 *zH, float *zN, float *wN, float *uN, int stride
#define REDUCE_PARAMS a, b, yH, yN, zH, zN, wN, uN, stride
#define REDUCE_X_AUXILIARY(i) \
  RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride); \
  RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride); \
  RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride); \
  CAXPBYPZ_FLOAT2(a, x0, b, y0, z0); \
  CAXPBYPZ_FLOAT2(a, x1, b, y1, z1); \
  CAXPBYPZ_FLOAT2(a, x2, b, y2, z2); \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(zH, zN, z, stride); \
  READ_HALF_SPINOR_ST(w, texHalfSt4, stride); \
  float2 bwc = -wc*b; \
  CAXPY_FLOAT2(bwc, w0, y0); \
  CAXPY_FLOAT2(bwc, w1, y1); \
  CAXPY_FLOAT2(bwc, w2, y2); \
  REAL_DOT_FLOAT2(norm0, y0, y0); \
  REAL_DOT_FLOAT2(norm1, y1, y1); \
  REAL_DOT_FLOAT2(norm2, y2, y2); \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
#define REDUCE_Y_AUXILIARY(i) \
  READ_HALF_SPINOR_ST(u, texHalfSt5, stride); \
  REAL_DOT_FLOAT2(rdot0, u0, y0); \
  REAL_DOT_FLOAT2(rdot1, u1, y1); \
  REAL_DOT_FLOAT2(rdot2, u2, y2); \
  IMAG_DOT_FLOAT2(idot0, u0, y0); \
  IMAG_DOT_FLOAT2(idot1, u1, y1); \
IMAG_DOT_FLOAT2(idot2, u2, y2);
#define REDUCE_Z_AUXILIARY(i) \
  norm0 += norm1; norm0 += norm2; \
  rdot0 += rdot1; rdot0 += rdot2; \
  idot0 += idot1; idot0 += idot2;

#define REDUCE_X_OPERATION(i) (uc*rdot0)
#define REDUCE_Y_OPERATION(i) (uc*idot0)
#define REDUCE_Z_OPERATION(i) (norm0)

#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION

// This convoluted kernel does the following: z += a*x + b*y, y -= b*w, norm = (y,y), dot = (u, y)
// Host dispatch: returns (Re((u,y)), Im((u,y)), ||y||^2) after performing the
// fused update above; x is read-only, y and z are modified in place.
double3 caxpbypzYmbwcDotProductWYNormYCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y,
					   cudaColorSpinorField &z, cudaColorSpinorField &w, cudaColorSpinorField &u) {
  const int id = 23; // kernel id; presumably indexes the tuning tables in blas_param.h -- confirm
  blas_quda_flops += 18*x.real_length;
  checkSpinor(x,y);
  checkSpinor(x,z);
  checkSpinor(x,w);
  checkSpinor(x,u);
  int length = x.length/2; // number of complex (two-real) elements
  blas_quda_bytes += 7*x.real_length*x.precision; // 5 reads (x,y,z,w,u) + 2 writes (y,z)
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes);
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes);
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes);
    cudaBindTexture(0, wTexDouble2, w.v, spinor_bytes);
    cudaBindTexture(0, uTexDouble2, u.v, spinor_bytes);
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    return caxpbypzYmbwcDotProductWYNormYDCuda(a2, (double2*)x.v, b2, (double2*)y.v, (double2*)z.v,
					       (double2*)w.v, (double2*)u.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    return caxpbypzYmbwcDotProductWYNormYSCuda(a2, (float2*)x.v, b2, (float2*)y.v, (float2*)z.v,
					       (float2*)w.v, (float2*)u.v, length, id, x.precision);
  } else {
    // fused nSpin=4 kernel is slow on Fermi
    // N.B. this introduces an extra half truncation so will affect convergence
    // NOTE(review): __CUDA_ARCH__ is normally undefined in host-side code, so this
    // condition looks like it was meant to use a build-time capability macro
    // (e.g. one defined in quda_internal.h) -- confirm against the build setup.
    if (!blasTuning && (__CUDA_ARCH__ >= 200) && x.nSpin == 4) {
      caxpbypzYmbwCuda(a, x, b, y, z, w);
      return cDotProductNormBCuda(u, y);
    }

    // Half precision: a full field is reduced as the sum over its parity halves.
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET)
      return caxpbypzYmbwcDotProductWYNormYCuda(a, x.Even(), b, y.Even(), z.Even(), w.Even(), u.Even()) +
	     caxpbypzYmbwcDotProductWYNormYCuda(a, x.Odd(), b, y.Odd(), z.Odd(), w.Odd(), u.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 7*x.volume*sizeof(float); // account for the per-site float norm arrays
    if (x.nSpin == 4) { // wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf4, w.v, spinor_bytes);
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf5, u.v, spinor_bytes);
      cudaBindTexture(0, texNorm5, u.norm, spinor_bytes/12);
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      return caxpbypzYmbwcDotProductWYNormYHCuda(a2, b2, (short4*)y.v, (float*)y.norm,
						 (short4*)z.v, (float*)z.norm, (float*)w.norm, (float*)u.norm,
						 y.stride, y.volume, id, x.precision);
    } else if (x.nSpin == 1){ // staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt4, w.v, spinor_bytes);
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt5, u.v, spinor_bytes);
      cudaBindTexture(0, texNorm5, u.norm, spinor_bytes/3);
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      return caxpbypzYmbwcDotProductWYNormYHCuda(a2, b2, (short2*)y.v, (float*)y.norm,
						 (short2*)z.v, (float*)z.norm, (float*)w.norm, (float*)u.norm,
						 y.stride, y.volume, id, x.precision);
    } else {
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);
    }
  }

}
1.7.3