QUDA v0.3.2
A library for QCD on GPUs

quda/lib/blas_quda.cu

Go to the documentation of this file.
00001 #include <stdlib.h>
00002 #include <stdio.h>
00003 
00004 #include <quda_internal.h>
00005 #include <blas_quda.h>
00006 #include <color_spinor_field.h>
00007 
00008 #include <cuComplex.h>
00009 
00010 #define REDUCE_MAX_BLOCKS 65536
00011 
00012 #define REDUCE_DOUBLE 64
00013 #define REDUCE_KAHAN 32
00014 
00015 #if (__CUDA_ARCH__ >= 130)
00016 #define REDUCE_TYPE REDUCE_DOUBLE
00017 #define QudaSumFloat double
00018 #define QudaSumComplex cuDoubleComplex
00019 #define QudaSumFloat3 double3
00020 #else
00021 #define REDUCE_TYPE REDUCE_KAHAN
00022 #define QudaSumFloat float
00023 #define QudaSumComplex cuComplex
00024 #define QudaSumFloat3 float3
00025 #endif
00026 
00027 // Required for the reduction kernels
00028 #ifdef __DEVICE_EMULATION__
00029 #define EMUSYNC __syncthreads()
00030 #else
00031 #define EMUSYNC
00032 #endif
00033 
00034 // These are used for reduction kernels
00035 static QudaSumFloat *d_reduceFloat=0;
00036 static QudaSumComplex *d_reduceComplex=0;
00037 static QudaSumFloat3 *d_reduceFloat3=0;
00038 
00039 static QudaSumFloat *h_reduceFloat=0;
00040 static QudaSumComplex *h_reduceComplex=0;
00041 static QudaSumFloat3 *h_reduceFloat3=0;
00042 
00043 unsigned long long blas_quda_flops;
00044 unsigned long long blas_quda_bytes;
00045 
00046 static dim3 blasBlock;
00047 static dim3 blasGrid;
00048 
00049 // generated by blas_test
00050 #include <blas_param.h>
00051 
// Component-wise addition of two double2 values (host overload).
double2 operator+(const double2 &x, const double2 &y)
{
  double2 sum;
  sum.x = x.x + y.x;
  sum.y = x.y + y.y;
  return sum;
}
00055 
// Component-wise addition of two double3 values (host overload).
double3 operator+(const double3 &x, const double3 &y)
{
  return make_double3(x.x + y.x, x.y + y.y, x.z + y.z);
}
00061 
// Scale a float2 by a real scalar (device overload).
__device__ float2 operator*(const float a, const float2 x)
{
  return make_float2(a * x.x, a * x.y);
}
00068 
// Generic component-wise addition for any two-component vector type.
template <typename Float2>
__device__ Float2 operator+(const Float2 x, const Float2 y)
{
  Float2 sum = x;
  sum.x += y.x;
  sum.y += y.y;
  return sum;
}
00076 
// Generic in-place addition for any two-component vector type.
template <typename Float2>
__device__ Float2 operator+=(Float2 &x, const Float2 y)
{
  x.x = x.x + y.x;
  x.y = x.y + y.y;
  return x;
}
00083 
// Generic in-place subtraction for any two-component vector type.
template <typename Float2>
__device__ Float2 operator-=(Float2 &x, const Float2 y)
{
  x.x = x.x - y.x;
  x.y = x.y - y.y;
  return x;
}
00090 
// Generic in-place scaling of a two-component vector by a real scalar.
template <typename Float, typename Float2>
__device__ Float2 operator*=(Float2 &x, const Float a)
{
  x.x = x.x * a;
  x.y = x.y * a;
  return x;
}
00097 
// In-place scaling of a float4 by a real scalar.
template <typename Float>
__device__ float4 operator*=(float4 &a, const Float &b)
{
  a.x = a.x * b;
  a.y = a.y * b;
  a.z = a.z * b;
  a.w = a.w * b;
  return a;
}
00106 
00107 
00108 
00109 void zeroCuda(cudaColorSpinorField &a) { a.zero(); }
00110 
// Allocate the device and pinned-host scratch buffers used by the blas
// reduction kernels.  Idempotent: buffers already allocated are left
// untouched, so repeated calls are safe.
void initBlas(void)
{
  // The original code tested only for cudaErrorMemoryAllocation, which
  // silently ignored every other failure mode (invalid context, a
  // sticky error from an earlier call, ...).  Treat anything other
  // than cudaSuccess as fatal.
  if (!d_reduceFloat) {
    if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) != cudaSuccess) {
      errorQuda("Error allocating device reduction array");
    }
  }

  if (!d_reduceComplex) {
    if (cudaMalloc((void**) &d_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) != cudaSuccess) {
      errorQuda("Error allocating device reduction array");
    }
  }

  if (!d_reduceFloat3) {
    if (cudaMalloc((void**) &d_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) != cudaSuccess) {
      errorQuda("Error allocating device reduction array");
    }
  }

  // Host mirrors use pinned memory (cudaMallocHost) so copying the
  // partial reduction results back from the device is fast.
  if (!h_reduceFloat) {
    if (cudaMallocHost((void**) &h_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) != cudaSuccess) {
      errorQuda("Error allocating host reduction array");
    }
  }

  if (!h_reduceComplex) {
    if (cudaMallocHost((void**) &h_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) != cudaSuccess) {
      errorQuda("Error allocating host reduction array");
    }
  }

  if (!h_reduceFloat3) {
    if (cudaMallocHost((void**) &h_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) != cudaSuccess) {
      errorQuda("Error allocating host reduction array");
    }
  }
}
00149 
// Free the reduction buffers allocated by initBlas().  Pointers are
// reset to 0 so a later initBlas() call re-allocates them instead of
// skipping allocation and leaving dangling pointers behind.
void endBlas(void)
{
  if (d_reduceFloat)   { cudaFree(d_reduceFloat);       d_reduceFloat = 0; }
  if (d_reduceComplex) { cudaFree(d_reduceComplex);     d_reduceComplex = 0; }
  if (d_reduceFloat3)  { cudaFree(d_reduceFloat3);      d_reduceFloat3 = 0; }
  if (h_reduceFloat)   { cudaFreeHost(h_reduceFloat);   h_reduceFloat = 0; }
  if (h_reduceComplex) { cudaFreeHost(h_reduceComplex); h_reduceComplex = 0; }
  if (h_reduceFloat3)  { cudaFreeHost(h_reduceFloat3);  h_reduceFloat3 = 0; }
}
00159 
00160 // blasTuning = 1 turns off error checking
00161 static int blasTuning = 0;
00162 
// Set the tuning flag; per the note above, a value of 1 turns off
// error checking while the autotuner is probing configurations.
void setBlasTuning(int tuning)
{
  blasTuning = tuning;
}
00167 
// Record the tuned launch configuration (thread and block counts) for
// the given kernel/precision pair in the tables from blas_param.h.
void setBlasParam(int kernel, int prec, int threads, int blocks)
{
  blas_threads[kernel][prec] = threads;
  blas_blocks[kernel][prec]  = blocks;
}
00173 
// Configure the global launch geometry (blasBlock/blasGrid) for the
// given kernel index, problem length, and precision, using the tuned
// tables from blas_param.h.
void setBlock(int kernel, int length, QudaPrecision precision)
{
  int prec;
  switch (precision) {
  case QUDA_HALF_PRECISION:   prec = 0; break;
  case QUDA_SINGLE_PRECISION: prec = 1; break;
  case QUDA_DOUBLE_PRECISION: prec = 2; break;
  default:
    // Previously `prec` was left uninitialized here, turning an
    // unexpected precision value into an out-of-bounds table index.
    // Fail loudly instead.
    errorQuda("Unsupported precision %d", (int)precision);
    prec = 0; // not reached; silences uninitialized-variable warnings
  }

  int threads = blas_threads[kernel][prec];
  // Cap the grid at the tuned block count, but never launch fewer than
  // one block even when length < threads.
  int blocks = min(blas_blocks[kernel][prec], max(length/threads, 1));

  blasBlock.x = threads;
  blasBlock.y = 1;
  blasBlock.z = 1;

  blasGrid.x = blocks;
  blasGrid.y = 1;
  blasGrid.z = 1;
}
00198 
#if (__CUDA_ARCH__ >= 130)
// Reassemble a double2 from a texture bound as int4: each pair of ints
// carries the lo/hi 32-bit words of one double.
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  int4 raw = tex1Dfetch(t, i);
  double first  = __hiloint2double(raw.y, raw.x);
  double second = __hiloint2double(raw.w, raw.z);
  return make_double2(first, second);
}
#else
// Double precision requires compute capability 1.3+; this stub keeps
// older builds compiling and simply returns zero.
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  return make_double2(0.0, 0.0);
}
#endif
00212 
// Load element i of a float2 array (direct global-memory path, as
// opposed to the texture macros below).
float2 __device__ read_Float2(float2 *x, int i)
{
  float2 v = x[i];
  return make_float2(v.x, v.y);
}
00216 
// Load element i of a double2 array (direct global-memory path).
double2 __device__ read_Float2(double2 *x, int i)
{
  double2 v = x[i];
  return make_double2(v.x, v.y);
}
00220 
00221 #define READ_DOUBLE2_TEXTURE(x, i) \
00222   fetch_double2(x##TexDouble2, i)
00223 
00224 #define READ_FLOAT2_TEXTURE(x, i) \
00225   tex1Dfetch(x##TexSingle2, i)
00226 
// Identity conversion; exists so templated code can call make_Float2
// uniformly for both single and double precision.
float2 __device__ make_Float2(float2 x)
{
  return make_float2(x.x, x.y);
}
00230 
// Identity conversion (double-precision overload of make_Float2).
double2 __device__ make_Float2(double2 x)
{
  return make_double2(x.x, x.y);
}
00234 
00235 #define RECONSTRUCT_HALF_SPINOR(a, texHalf, texNorm, length)            \
00236   float a##c = tex1Dfetch(texNorm, i);                                  \
00237   float4 a##0 = tex1Dfetch(texHalf, i + 0*length);                      \
00238   float4 a##1 = tex1Dfetch(texHalf, i + 1*length);                      \
00239   float4 a##2 = tex1Dfetch(texHalf, i + 2*length);                      \
00240   float4 a##3 = tex1Dfetch(texHalf, i + 3*length);                      \
00241   float4 a##4 = tex1Dfetch(texHalf, i + 4*length);                      \
00242   float4 a##5 = tex1Dfetch(texHalf, i + 5*length);                      \
00243   a##0 *= a##c;                                                         \
00244   a##1 *= a##c;                                                         \
00245   a##2 *= a##c;                                                         \
00246   a##3 *= a##c;                                                         \
00247   a##4 *= a##c;                                                         \
00248   a##5 *= a##c;
00249 
00250 #define RECONSTRUCT_HALF_SPINOR_ST(a, texHalf, texNorm, length)         \
00251   float a##c = tex1Dfetch(texNorm, i);                                  \
00252   float2 a##0 = tex1Dfetch(texHalf, i + 0*length);                      \
00253   float2 a##1 = tex1Dfetch(texHalf, i + 1*length);                      \
00254   float2 a##2 = tex1Dfetch(texHalf, i + 2*length);                      \
00255   (a##0) *= a##c;                                                               \
00256   (a##1) *= a##c;                                                               \
00257   (a##2) *= a##c;
00258 
00259 
00260 // Some musings on how to clean up the blas code using Boost
00261 /*#define BOOST_RECONSTRUCT_HALF_SPINOR(z, j, a, texHalf, length)       \
00262   float4 a##k tex1Dfetch(texHalf, i + j*length);        \
00263   a##k *= a##c;
00264 
00265 #define RECONSTRUCT_HALF_SPINOR(a, texHalf, texNorm, length)            \
00266   BOOST_PP_REPEAT(6, BOOST_RECONSTRUCT_HALF_SPINOR, a, texHalf, length) \
00267 */
00268 
00269 #define READ_HALF_SPINOR_TEX(a, tex, texNorm, length)                   \
00270   float a##c = tex1Dfetch(texNorm, i);                                  \
00271   float4 a##0 = tex1Dfetch(tex, i + 0*length);                          \
00272   float4 a##1 = tex1Dfetch(tex, i + 1*length);                          \
00273   float4 a##2 = tex1Dfetch(tex, i + 2*length);                          \
00274   float4 a##3 = tex1Dfetch(tex, i + 3*length);                          \
00275   float4 a##4 = tex1Dfetch(tex, i + 4*length);                          \
00276   float4 a##5 = tex1Dfetch(tex, i + 5*length);                          \
00277 
00278 #define READ_HALF_SPINOR(a, tex, length)                                \
00279   float4 a##0 = tex1Dfetch(tex, i + 0*length);                          \
00280   float4 a##1 = tex1Dfetch(tex, i + 1*length);                          \
00281   float4 a##2 = tex1Dfetch(tex, i + 2*length);                          \
00282   float4 a##3 = tex1Dfetch(tex, i + 3*length);                          \
00283   float4 a##4 = tex1Dfetch(tex, i + 4*length);                          \
00284   float4 a##5 = tex1Dfetch(tex, i + 5*length);                          \
00285   float a##c = a##N[i];
00286 
00287 #define READ_HALF_SPINOR_ST(a, tex, length)                             \
00288   float2 a##0 = tex1Dfetch(tex, i + 0*length);                          \
00289   float2 a##1 = tex1Dfetch(tex, i + 1*length);                          \
00290   float2 a##2 = tex1Dfetch(tex, i + 2*length);                          \
00291   float a##c = a##N[i];
00292 
00293 #define SHORT_LENGTH 65536
00294 #define SCALE_FLOAT ((SHORT_LENGTH-1) * 0.5)
00295 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
00296 
// Quantize float a (relative to norm factor c) into a 16-bit short
// using the affine SHIFT_FLOAT/SCALE_FLOAT mapping defined above.
__device__ short float2short(float c, float a)
{
  return (short)((a + SHIFT_FLOAT) * SCALE_FLOAT * c);
}
00302 
// Undo the affine quantization of float2short (norm factor excluded).
__device__ float short2float(short a)
{
  float f = (float)a;
  return f / SCALE_FLOAT - SHIFT_FLOAT;
}
00306 
// Quantize all four components of a float4 with norm factor c.
__device__ short4 float42short4(float c, float4 a)
{
  short sx = float2short(c, a.x);
  short sy = float2short(c, a.y);
  short sz = float2short(c, a.z);
  short sw = float2short(c, a.w);
  return make_short4(sx, sy, sz, sw);
}
00310 
00311 #define FAST_ABS_MAX(a, b) fmaxf(fabsf(a), fabsf(b));
00312 #define FAST_MAX(a, b) fmaxf(a, b);
00313 
// Maximum absolute value over the four components of a float4.
__device__ float fast_abs_max(float4 a)
{
  float mxy = fmaxf(fabsf(a.x), fabsf(a.y));
  float mzw = fmaxf(fabsf(a.z), fabsf(a.w));
  return fmaxf(mxy, mzw);
}
00319 
00320 #define CONSTRUCT_HALF_SPINOR_FROM_SINGLE(h, n, a, length) {            \
00321     float c0 = fast_abs_max(a##0);                                      \
00322     float c1 = fast_abs_max(a##1);                                      \
00323     c0 = FAST_MAX(c0, c1);                                              \
00324     float c2 = fast_abs_max(a##2);                                      \
00325     float c3 = fast_abs_max(a##3);                                      \
00326     c1 = FAST_MAX(c2, c3);                                              \
00327     c0 = FAST_MAX(c0, c1);                                              \
00328     c2 = fast_abs_max(a##4);                                            \
00329     c3 = fast_abs_max(a##5);                                            \
00330     c1 = FAST_MAX(c2, c3);                                              \
00331     c0 = FAST_MAX(c0, c1);                                              \
00332     n[i] = c0;                                                          \
00333     float C = __fdividef(MAX_SHORT, c0);                                \
00334     h[i+0*length] = make_short4((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y), \
00335                                 (short)(C*(float)(a##0).z), (short)(C*(float)(a##0).w)); \
00336     h[i+1*length] = make_short4((short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y), \
00337                                 (short)(C*(float)(a##1).z), (short)(C*(float)(a##1).w)); \
00338     h[i+2*length] = make_short4((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y), \
00339                                 (short)(C*(float)(a##2).z), (short)(C*(float)(a##2).w)); \
00340     h[i+3*length] = make_short4((short)(C*(float)(a##3).x), (short)(C*(float)(a##3).y), \
00341                                 (short)(C*(float)(a##3).z), (short)(C*(float)(a##3).w)); \
00342     h[i+4*length] = make_short4((short)(C*(float)(a##4).x), (short)(C*(float)(a##4).y), \
00343                                 (short)(C*(float)(a##4).z), (short)(C*(float)(a##4).w)); \
00344     h[i+5*length] = make_short4((short)(C*(float)(a##5).x), (short)(C*(float)(a##5).y), \
00345                                 (short)(C*(float)(a##5).z), (short)(C*(float)(a##5).w));}
00346 
00347 #define CONSTRUCT_HALF_SPINOR_FROM_DOUBLE(h, n, a, length)              \
00348   {float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y));                  \
00349   float c1 = fmaxf(fabsf((a##1).x), fabsf((a##1).y));                   \
00350   float c2 = fmaxf(fabsf((a##2).x), fabsf((a##2).y));                   \
00351   float c3 = fmaxf(fabsf((a##3).x), fabsf((a##3).y));                   \
00352   float c4 = fmaxf(fabsf((a##4).x), fabsf((a##4).y));                   \
00353   float c5 = fmaxf(fabsf((a##5).x), fabsf((a##5).y));                   \
00354   float c6 = fmaxf(fabsf((a##6).x), fabsf((a##6).y));                   \
00355   float c7 = fmaxf(fabsf((a##7).x), fabsf((a##7).y));                   \
00356   float c8 = fmaxf(fabsf((a##8).x), fabsf((a##8).y));                   \
00357   float c9 = fmaxf(fabsf((a##9).x), fabsf((a##9).y));                   \
00358   float c10 = fmaxf(fabsf((a##10).x), fabsf((a##10).y));                \
00359   float c11 = fmaxf(fabsf((a##11).x), fabsf((a##11).y));                \
00360   c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3);  c2 = fmaxf(c4, c5); c3 = fmaxf(c6, c7); \
00361   c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); \
00362   c2 = fmaxf(c4, c5); c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2);           \
00363   n[i] = c0;                                                            \
00364   float C = __fdividef(MAX_SHORT, c0);                                  \
00365   h[i+0*length] = make_short4((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y), \
00366                               (short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y)); \
00367   h[i+1*length] = make_short4((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y), \
00368                               (short)(C*(float)(a##3).x), (short)(C*(float)(a##3).y)); \
00369   h[i+2*length] = make_short4((short)(C*(float)(a##4).x), (short)(C*(float)(a##4).y), \
00370                               (short)(C*(float)(a##5).x), (short)(C*(float)(a##5).y)); \
00371   h[i+3*length] = make_short4((short)(C*(float)(a##6).x), (short)(C*(float)(a##6).y), \
00372                               (short)(C*(float)(a##7).x), (short)(C*(float)(a##7).y)); \
00373   h[i+4*length] = make_short4((short)(C*(float)(a##8).x), (short)(C*(float)(a##8).y), \
00374                               (short)(C*(float)(a##9).x), (short)(C*(float)(a##9).y)); \
00375   h[i+5*length] = make_short4((short)(C*(float)(a##10).x), (short)(C*(float)(a##10).y), \
00376                               (short)(C*(float)(a##11).x), (short)(C*(float)(a##11).y));}
00377 
00378 #define CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(h, n, a, length)           \
00379     {float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y));                \
00380         float c1 = fmaxf(fabsf((a##1).x), fabsf((a##1).y));             \
00381         float c2 = fmaxf(fabsf((a##2).x), fabsf((a##2).y));             \
00382         c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2);                         \
00383         n[i] = c0;                                                      \
00384         float C = __fdividef(MAX_SHORT, c0);                            \
00385         h[i+0*length] = make_short2((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y)); \
00386         h[i+1*length] = make_short2((short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y)); \
00387         h[i+2*length] = make_short2((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y));}
00388 
00389 #define CONSTRUCT_HALF_SPINOR_FROM_DOUBLE_ST(h, n, a, length)           \
00390     {float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y));                \
00391         float c1 = fmaxf(fabsf((a##1).x), fabsf((a##1).y));             \
00392         float c2 = fmaxf(fabsf((a##2).x), fabsf((a##2).y));             \
00393         c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2);                         \
00394         n[i] = c0;                                                      \
00395         float C = __fdividef(MAX_SHORT, c0);                            \
00396         h[i+0*length] = make_short2((short)(C*(float)(a##0).x), (short)(C*(float)(a##0).y)); \
00397         h[i+1*length] = make_short2((short)(C*(float)(a##1).x), (short)(C*(float)(a##1).y)); \
00398         h[i+2*length] = make_short2((short)(C*(float)(a##2).x), (short)(C*(float)(a##2).y));}
00399 
00400 
00401 #define SUM_FLOAT4(sum, a)                      \
00402   float sum = a.x + a.y + a.z + a.w;
00403 
00404 #define SUM_FLOAT2(sum, a)                      \
00405   float sum = a.x + a.y;
00406 
00407 #if (__CUDA_ARCH__ < 200) 
00408 #define REAL_DOT_FLOAT4(dot, a, b) \
00409   float dot = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
00410 #else
00411 #define REAL_DOT_FLOAT4(dot, a, b)              \
00412   float dot = fmaf(a.x, b.x, 0.0f);             \
00413   dot = fmaf(a.y, b.y, dot);                    \
00414   dot = fmaf(a.z, b.z, dot);                    \
00415   dot = fmaf(a.w, b.w, dot)
00416 #endif
00417 
00418 #define REAL_DOT_FLOAT2(dot, a, b) \
00419   float dot = a.x*b.x + a.y*b.y;
00420 
00421 #if (__CUDA_ARCH__ < 200) 
00422 #define IMAG_DOT_FLOAT4(dot, a, b)                      \
00423   float dot = a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z;
00424 #else
00425 #define IMAG_DOT_FLOAT4(dot, a, b)              \
00426   float dot = fmaf(a.x, b.y, 0.0f);             \
00427   dot = fmaf(-a.y, b.x, dot);                   \
00428   dot = fmaf(a.z, b.w, dot);                    \
00429   dot = fmaf(-a.w, b.z, dot)
00430 #endif
00431 
00432 #define IMAG_DOT_FLOAT2(dot, a, b)                      \
00433   float dot = a.x*b.y - a.y*b.x;
00434 
00435 #define AX_FLOAT4(a, X)                         \
00436   X.x *= a; X.y *= a; X.z *= a; X.w *= a;
00437 
00438 #define AX_FLOAT2(a, X)                         \
00439   X.x *= a; X.y *= a;
00440 
00441 #define XPY_FLOAT4(X, Y)                     \
00442   Y.x += X.x; Y.y += X.y; Y.z += X.z; Y.w += X.w;
00443 
00444 #define XPY_FLOAT2(X, Y)                     \
00445   Y.x += X.x; Y.y += X.y;
00446 
00447 #define XMY_FLOAT4(X, Y)                     \
00448   Y.x = X.x - Y.x; Y.y = X.y - Y.y; Y.z = X.z - Y.z; Y.w = X.w - Y.w;
00449 
00450 #define XMY_FLOAT2(X, Y)                     \
00451   Y.x = X.x - Y.x; Y.y = X.y - Y.y;
00452 
00453 #define MXPY_FLOAT4(X, Y)                    \
00454   Y.x -= X.x; Y.y -= X.y; Y.z -= X.z; Y.w -= X.w;
00455 
00456 #define MXPY_FLOAT2(X, Y)                    \
00457   Y.x -= X.x; Y.y -= X.y; 
00458 
00459 #if (__CUDA_ARCH__ < 200) 
00460 #define AXPY_FLOAT4(a, X, Y)                 \
00461   Y.x += a*X.x; Y.y += a*X.y;                \
00462   Y.z += a*X.z; Y.w += a*X.w;
00463 #else
00464 #define AXPY_FLOAT4(a, X, Y)                            \
00465   Y.x = fmaf(a, X.x, Y.x); Y.y = fmaf(a, X.y, Y.y);     \
00466   Y.z = fmaf(a, X.z, Y.z); Y.w = fmaf(a, X.w, Y.w);
00467 #endif
00468 
00469 #define AXPY_FLOAT2(a, X, Y)                 \
00470   Y.x += a*X.x; Y.y += a*X.y;                
00471 
00472 #define AXPBY_FLOAT4(a, X, b, Y)                                \
00473   Y.x = b*Y.x; Y.x += a*X.x; Y.y = b*Y.y; Y.y += a*X.y;         \
00474   Y.z = b*Y.z; Y.z += a*X.z; Y.w = b*Y.w; Y.w += a*X.w; 
00475 
00476 #define AXPBY_FLOAT2(a, X, b, Y)                        \
00477   Y.x = b*Y.x; Y.x += a*X.x; Y.y = b*Y.y; Y.y += a*X.y;         \
00478 
00479 #if (__CUDA_ARCH__ < 200)
00480 #define XPAY_FLOAT4(X, a, Y)                         \
00481   Y.x = X.x + a*Y.x; Y.y = X.y + a*Y.y;              \
00482   Y.z = X.z + a*Y.z; Y.w = X.w + a*Y.w;
00483 #else
00484 #define XPAY_FLOAT4(X, a, Y)                         \
00485   Y.x = fmaf(a, Y.x, X.x); Y.y = fmaf(a, Y.y, X.y);  \
00486   Y.z = fmaf(a, Y.z, X.z); Y.w = fmaf(a, Y.w, X.w);
00487 #endif
00488 
00489 #define XPAY_FLOAT2(X, a, Y)                         \
00490   Y.x = X.x + a*Y.x; Y.y = X.y + a*Y.y;              
00491 
00492 #if (__CUDA_ARCH__ < 200)
00493 #define CAXPY_FLOAT4(a, X, Y)           \
00494   Y.x += a.x*X.x; Y.x -= a.y*X.y;       \
00495   Y.y += a.y*X.x; Y.y += a.x*X.y;       \
00496   Y.z += a.x*X.z; Y.z -= a.y*X.w;       \
00497   Y.w += a.y*X.z; Y.w += a.x*X.w;
00498 #else
00499 #define CAXPY_FLOAT4(a, X, Y)                                   \
00500   Y.x = fmaf(a.x, X.x, Y.x); Y.x = fmaf(-a.y, X.y, Y.x);        \
00501   Y.y = fmaf(a.y, X.x, Y.y); Y.y = fmaf( a.x, X.y, Y.y);        \
00502   Y.z = fmaf(a.x, X.z, Y.z); Y.z = fmaf(-a.y, X.w, Y.z);        \
00503   Y.w = fmaf(a.y, X.z, Y.w); Y.w = fmaf( a.x, X.w, Y.w);
00504 #endif // (__CUDA_ARCH__ < 200)
00505 
00506 #if (__CUDA_ARCH__ < 200)
00507 #define CAXPY_FLOAT2(a, X, Y)           \
00508   Y.x += a.x*X.x; Y.x -= a.y*X.y;       \
00509   Y.y += a.y*X.x; Y.y += a.x*X.y;
00510 #else
00511 #define CAXPY_FLOAT2(a, X, Y)                                   \
00512   Y.x = fmaf(a.x, X.x, Y.x); Y.x = fmaf(-a.y, X.y, Y.x);        \
00513   Y.y = fmaf(a.y, X.x, Y.y); Y.y = fmaf( a.x, X.y, Y.y);
00514 #endif // (__CUDA_ARCH__ < 200)
00515 
00516 #define CMAXPY_FLOAT4(a, X, Y)                  \
00517   Y.x -= a.x*X.x; Y.x += a.y*X.y;               \
00518   Y.y -= a.y*X.x; Y.y -= a.x*X.y;               \
00519   Y.z -= a.x*X.z; Y.z += a.y*X.w;               \
00520   Y.w -= a.y*X.z; Y.w -= a.x*X.w;
00521 
00522 #define CAXPBY_FLOAT4(a, X, b, Y)                                       \
00523   { float2 y;                                                           \
00524   y.x = a.x*X.x; y.x -= a.y*X.y; y.x += b.x*Y.x; y.x -= b.y*Y.y;        \
00525   y.y = a.y*X.x; y.y += a.x*X.y; y.y += b.y*Y.x; y.y += b.x*Y.y;        \
00526   Y.x = y.x; Y.y = y.y;                                                 \
00527   y.x = a.x*X.z; y.x -= a.y*X.w; y.x += b.x*Y.z; y.x -= b.y*Y.w;        \
00528   y.y = a.y*X.z; y.y += a.x*X.w; y.y += b.y*Y.z; y.y += b.x*Y.w;        \
00529   Y.z = y.x; Y.w = y.y;}
00530 
00531 #define CAXPBY_FLOAT2(a, X, b, Y)                                       \
00532   { float2 y;                                                           \
00533   y.x = a.x*X.x; y.x -= a.y*X.y; y.x += b.x*Y.x; y.x -= b.y*Y.y;        \
00534   y.y = a.y*X.x; y.y += a.x*X.y; y.y += b.y*Y.x; y.y += b.x*Y.y;        \
00535   Y.x = y.x; Y.y = y.y;}
00536 
00537 
00538 #define CXPAYPBZ_FLOAT4(X, a, Y, b, Z)                                 \
00539   {float2 z;                                                           \
00540   z.x = X.x + a.x*Y.x; z.x -= a.y*Y.y; z.x += b.x*Z.x; z.x -= b.y*Z.y; \
00541   z.y = X.y + a.y*Y.x; z.y += a.x*Y.y; z.y += b.y*Z.x; z.y += b.x*Z.y; \
00542   Z.x = z.x; Z.y = z.y;                                                \
00543   z.x = X.z + a.x*Y.z; z.x -= a.y*Y.w; z.x += b.x*Z.z; z.x -= b.y*Z.w; \
00544   z.y = X.w + a.y*Y.z; z.y += a.x*Y.w; z.y += b.y*Z.z; z.y += b.x*Z.w; \
00545   Z.z = z.x; Z.w = z.y;}
00546 
00547 #define CXPAYPBZ_FLOAT2(X, a, Y, b, Z)                                 \
00548   {float2 z;                                                           \
00549   z.x = X.x + a.x*Y.x; z.x -= a.y*Y.y; z.x += b.x*Z.x; z.x -= b.y*Z.y; \
00550   z.y = X.y + a.y*Y.x; z.y += a.x*Y.y; z.y += b.y*Z.x; z.y += b.x*Z.y; \
00551   Z.x = z.x; Z.y = z.y;}
00552 
00553 #if (__CUDA_ARCH__ < 200)
00554 #define CAXPBYPZ_FLOAT4(a, X, b, Y, Z)            \
00555   Z.x += a.x*X.x - a.y*X.y + b.x*Y.x - b.y*Y.y;   \
00556   Z.y += a.y*X.x + a.x*X.y + b.y*Y.x + b.x*Y.y;   \
00557   Z.z += a.x*X.z - a.y*X.w + b.x*Y.z - b.y*Y.w;   \
00558   Z.w += a.y*X.z + a.x*X.w + b.y*Y.z + b.x*Y.w;
00559 #else
00560 #define CAXPBYPZ_FLOAT4(a, X, b, Y, Z)                          \
00561   Z.x = fmaf(a.x, X.x, Z.x); Z.x = fmaf(-a.y, X.y, Z.x); Z.x = fmaf(b.x, Y.x, Z.x); Z.x = fmaf(-b.y, Y.y, Z.x); \
00562   Z.y = fmaf(a.y, X.x, Z.y); Z.y = fmaf( a.x, X.y, Z.y); Z.y = fmaf(b.y, Y.x, Z.y); Z.y = fmaf( b.x, Y.y, Z.y); \
00563   Z.z = fmaf(a.x, X.z, Z.z); Z.z = fmaf(-a.y, X.w, Z.z); Z.z = fmaf(b.x, Y.z, Z.z); Z.z = fmaf(-b.y, Y.w, Z.z); \
00564   Z.w = fmaf(a.y, X.z, Z.w); Z.w = fmaf( a.x, X.w, Z.w); Z.w = fmaf(b.y, Y.z, Z.w); Z.w = fmaf( b.x, Y.w, Z.w);
00565 #endif // (__CUDA_ARCH__ < 200)
00566 
00567 #if (__CUDA_ARCH__ < 200)
00568 #define CAXPBYPZ_FLOAT2(a, X, b, Y, Z)            \
00569   Z.x += a.x*X.x - a.y*X.y + b.x*Y.x - b.y*Y.y;   \
00570   Z.y += a.y*X.x + a.x*X.y + b.y*Y.x + b.x*Y.y;
00571 #else
00572 #define CAXPBYPZ_FLOAT2(a, X, b, Y, Z)                          \
00573   Z.x = fmaf(a.x, X.x, Z.x); Z.x = fmaf(-a.y, X.y, Z.x); Z.x = fmaf(b.x, Y.x, Z.x); Z.x = fmaf(-b.y, Y.y, Z.x); \
00574   Z.y = fmaf(a.y, X.x, Z.y); Z.y = fmaf( a.x, X.y, Z.y); Z.y = fmaf(b.y, Y.x, Z.y); Z.y = fmaf( b.x, Y.y, Z.y);
00575 #endif // (__CUDA_ARCH__ < 200)
00576 
00577 // Double precision input spinor field
00578 texture<int4, 1> xTexDouble2;
00579 texture<int4, 1> yTexDouble2;
00580 texture<int4, 1> zTexDouble2;
00581 texture<int4, 1> wTexDouble2;
00582 texture<int4, 1> uTexDouble2;
00583 
00584 // Single precision input spinor field
00585 texture<float2, 1> xTexSingle2;
00586 texture<float2, 1> yTexSingle2;
00587 
00588 texture<float4, 1> xTexSingle4;
00589 
00590 // Half precision input spinor field
00591 texture<short4, 1, cudaReadModeNormalizedFloat> texHalf1;
00592 texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt1;
00593 texture<float, 1, cudaReadModeElementType> texNorm1;
00594 
00595 // Half precision input spinor field
00596 texture<short4, 1, cudaReadModeNormalizedFloat> texHalf2;
00597 texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt2;
00598 texture<float, 1, cudaReadModeElementType> texNorm2;
00599 
00600 // Half precision input spinor field
00601 texture<short4, 1, cudaReadModeNormalizedFloat> texHalf3;
00602 texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt3;
00603 texture<float, 1, cudaReadModeElementType> texNorm3;
00604 
00605 // Half precision input spinor field
00606 texture<short4, 1, cudaReadModeNormalizedFloat> texHalf4;
00607 texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt4;
00608 texture<float, 1, cudaReadModeElementType> texNorm4;
00609 
00610 // Half precision input spinor field
00611 texture<short4, 1, cudaReadModeNormalizedFloat> texHalf5;
00612 texture<short2, 1, cudaReadModeNormalizedFloat> texHalfSt5;
00613 texture<float, 1, cudaReadModeElementType> texNorm5;
00614 
00615 #define checkSpinor(a, b)                                               \
00616   {                                                                     \
00617     if (a.Precision() != b.Precision())                                 \
00618       errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
00619     if (a.Length() != b.Length())                                       \
00620       errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
00621     if (a.Stride() != b.Stride())                                       \
00622       errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
00623   }
00624 
// For kernels with precision conversion built in: only the lengths
// need to agree, not precision or stride.
// NOTE(review): this macro opens a brace it never closes -- every call
// site must supply the trailing '}'.  Preserved as-is to avoid
// breaking existing callers; balance it in a coordinated change.
#define checkSpinorLength(a, b)                                         \
  {                                                                     \
    if (a.Length() != b.Length()) {                                     \
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
    }                                                                   
00631 
// Promote a single-precision float4 spinor to double precision: each
// float4 element feeds two consecutive double2 elements of dst.
// Grid-stride loop over the first `length` sites of each component.
__global__ void convertDSKernel(double2 *dst, float4 *src, int length)
{
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; i < length; i += stride) {
    for (int k = 0; k < 6; k++) {
      float4 in = src[k*length + i];
      dst[2*k*length + i].x     = in.x;
      dst[2*k*length + i].y     = in.y;
      dst[(2*k+1)*length + i].x = in.z;
      dst[(2*k+1)*length + i].y = in.w;
    }
  }
}
00645 
// Promote a single-precision float2 (staggered layout) spinor to
// double precision, element for element.  Grid-stride loop.
__global__ void convertDSKernel(double2 *dst, float2 *src, int length)
{
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; i < length; i += stride) {
    for (int k = 0; k < 3; k++) {
      float2 in = src[k*length + i];
      dst[k*length + i].x = in.x;
      dst[k*length + i].y = in.y;
    }
  }
}
00657 
// Demote a double-precision spinor to single precision: two double2
// elements are packed into each float4 of dst (inverse of the
// float4-based convertDSKernel).  Grid-stride loop.
__global__ void convertSDKernel(float4 *dst, double2 *src, int length)
{
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; i < length; i += stride) {
    for (int k = 0; k < 6; k++) {
      double2 lo = src[2*k*length + i];
      double2 hi = src[(2*k+1)*length + i];
      dst[k*length + i] = make_float4(lo.x, lo.y, hi.x, hi.y);
    }
  }
}
00671 
// Demote a double-precision spinor to single precision (staggered
// float2 layout), element for element.  Grid-stride loop.
__global__ void convertSDKernel(float2 *dst, double2 *src, int length)
{
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; i < length; i += stride) {
    for (int k = 0; k < 3; k++) {
      double2 in = src[k*length + i];
      dst[k*length + i].x = in.x;
      dst[k*length + i].y = in.y;
    }
  }
}
00683 
// Convert the single-precision spinor bound to xTexSingle4 into a
// half-precision spinor `h` plus the per-site norm array `norm`.
// The CONSTRUCT_* macro token-pastes the names F0..F5 and indexes
// with `i`, so those identifiers must not be renamed.
__global__ void convertHSKernel(short4 *h, float *norm, int length, int real_length)
{
  unsigned int gridSize = gridDim.x*blockDim.x;
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < real_length; i += gridSize) {
    float4 F0 = tex1Dfetch(xTexSingle4, i + 0*length);
    float4 F1 = tex1Dfetch(xTexSingle4, i + 1*length);
    float4 F2 = tex1Dfetch(xTexSingle4, i + 2*length);
    float4 F3 = tex1Dfetch(xTexSingle4, i + 3*length);
    float4 F4 = tex1Dfetch(xTexSingle4, i + 4*length);
    float4 F5 = tex1Dfetch(xTexSingle4, i + 5*length);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(h, norm, F, length);
  }
}
00701 
// Staggered (float2) variant: convert the spinor bound to xTexSingle2
// into half precision plus per-site norms.  The macro token-pastes
// F0..F2 and indexes with `i`; those names are fixed.
__global__ void convertHSKernel(short2 *h, float *norm, int length, int real_length)
{
  unsigned int gridSize = gridDim.x*blockDim.x;
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < real_length; i += gridSize) {
    float2 F0 = tex1Dfetch(xTexSingle2, i + 0*length);
    float2 F1 = tex1Dfetch(xTexSingle2, i + 1*length);
    float2 F2 = tex1Dfetch(xTexSingle2, i + 2*length);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(h, norm, F, length);
  }
}
00716 
00717 
// Convert half precision -> single precision for an nSpin=4 field.
// NOTE(review): RECONSTRUCT_HALF_SPINOR presumably materializes local float4
// registers I0..I5 from the half-precision texture texHalf1 scaled by the
// per-site norm in texNorm1 (macro defined elsewhere); both textures are
// bound by the caller.
__global__ void convertSHKernel(float4 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  // grid-stride loop over unpadded sites; length is the padded stride
  while (i<real_length) {
    RECONSTRUCT_HALF_SPINOR(I, texHalf1, texNorm1, length);
    res[0*length+i] = I0;
    res[1*length+i] = I1;
    res[2*length+i] = I2;
    res[3*length+i] = I3;
    res[4*length+i] = I4;
    res[5*length+i] = I5;
    i += gridSize;
  }
}
00734 
// Convert half precision -> single precision for an nSpin=1 (staggered)
// field: 3 float2 chunks per site, reconstructed from texHalfSt1/texNorm1
// (bound by the caller) via the _ST macro, which defines I0..I2.
__global__ void convertSHKernel(float2 *res, int length, int real_length) {

    int i = blockIdx.x*(blockDim.x) + threadIdx.x;
    unsigned int gridSize = gridDim.x*blockDim.x;
    
    // grid-stride loop over unpadded sites
    while (i<real_length) {
        RECONSTRUCT_HALF_SPINOR_ST(I, texHalfSt1, texNorm1, length);
        res[0*length+i] = I0;
        res[1*length+i] = I1;
        res[2*length+i] = I2;
        i += gridSize;
    }
}
00748 
// Convert double precision -> half precision for an nSpin=4 field.
// Reads 12 double2 chunks per site through the xTexDouble2 texture (bound by
// the caller; fetch_double2 reassembles doubles from int2 texels) and packs
// them into short4 + per-site norm.
// NOTE(review): CONSTRUCT_HALF_SPINOR_FROM_DOUBLE consumes F0..F11 by
// naming convention.
__global__ void convertHDKernel(short4 *h, float *norm, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x; 
  unsigned int gridSize = gridDim.x*blockDim.x;

  // grid-stride loop over unpadded sites; length is the padded stride
  while(i < real_length) {
    double2 F0 = fetch_double2(xTexDouble2, i+0*length);
    double2 F1 = fetch_double2(xTexDouble2, i+1*length);
    double2 F2 = fetch_double2(xTexDouble2, i+2*length);
    double2 F3 = fetch_double2(xTexDouble2, i+3*length);
    double2 F4 = fetch_double2(xTexDouble2, i+4*length);
    double2 F5 = fetch_double2(xTexDouble2, i+5*length);
    double2 F6 = fetch_double2(xTexDouble2, i+6*length);
    double2 F7 = fetch_double2(xTexDouble2, i+7*length);
    double2 F8 = fetch_double2(xTexDouble2, i+8*length);
    double2 F9 = fetch_double2(xTexDouble2, i+9*length);
    double2 F10 = fetch_double2(xTexDouble2, i+10*length);
    double2 F11 = fetch_double2(xTexDouble2, i+11*length);
    CONSTRUCT_HALF_SPINOR_FROM_DOUBLE(h, norm, F, length);
    i += gridSize;
  }
}
00771 
// Convert double precision -> half precision for an nSpin=1 (staggered)
// field: 3 double2 chunks per site fetched through xTexDouble2, packed into
// short2 + norm by the _ST macro (consumes F0..F2 by naming convention).
__global__ void convertHDKernel(short2 *h, float *norm, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x; 
  unsigned int gridSize = gridDim.x*blockDim.x;

  // grid-stride loop over unpadded sites
  while(i < real_length) {
      double2 F0 = fetch_double2(xTexDouble2, i+0*length);
      double2 F1 = fetch_double2(xTexDouble2, i+1*length);
      double2 F2 = fetch_double2(xTexDouble2, i+2*length);
      CONSTRUCT_HALF_SPINOR_FROM_DOUBLE_ST(h, norm, F, length);
      i += gridSize;
  }
}
00785 
// Convert half precision -> double precision for an nSpin=4 field.
// RECONSTRUCT_HALF_SPINOR yields float4 registers I0..I5 from texHalf1 /
// texNorm1 (bound by the caller); each float4 is then split into two double2
// output chunks, promoting float -> double component-wise.
__global__ void convertDHKernel(double2 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x; 
  unsigned int gridSize = gridDim.x*blockDim.x;

  // grid-stride loop over unpadded sites; length is the padded stride
  while(i < real_length) {
    RECONSTRUCT_HALF_SPINOR(I, texHalf1, texNorm1, length);
    res[0*length+i] = make_double2(I0.x, I0.y);
    res[1*length+i] = make_double2(I0.z, I0.w);
    res[2*length+i] = make_double2(I1.x, I1.y);
    res[3*length+i] = make_double2(I1.z, I1.w);
    res[4*length+i] = make_double2(I2.x, I2.y);
    res[5*length+i] = make_double2(I2.z, I2.w);
    res[6*length+i] = make_double2(I3.x, I3.y);
    res[7*length+i] = make_double2(I3.z, I3.w);
    res[8*length+i] = make_double2(I4.x, I4.y);
    res[9*length+i] = make_double2(I4.z, I4.w);
    res[10*length+i] = make_double2(I5.x, I5.y);
    res[11*length+i] = make_double2(I5.z, I5.w);
    i += gridSize;
  }

}
00809 
// Convert half precision -> double precision for an nSpin=1 (staggered)
// field: the _ST macro yields float2 registers I0..I2 from texHalfSt1 /
// texNorm1 (bound by the caller), promoted to double2 on store.
__global__ void convertDHKernelSt(double2 *res, int length, int real_length) {

  int i = blockIdx.x*(blockDim.x) + threadIdx.x; 
  unsigned int gridSize = gridDim.x*blockDim.x;
  
  // grid-stride loop over unpadded sites
  while(i < real_length) {
      RECONSTRUCT_HALF_SPINOR_ST(I, texHalfSt1, texNorm1, length);
      res[0*length+i] = make_double2(I0.x, I0.y);
      res[1*length+i] = make_double2(I1.x, I1.y);
      res[2*length+i] = make_double2(I2.x, I2.y);
      i += gridSize;
  }

}
00824 
00825 
00826 
// Copy src into dst, converting precision (and internal packing) as needed.
// Supports every pairing of double/single/half precision for nSpin=1
// (staggered) and nSpin=4 (Wilson) fields; same-precision copies fall through
// to a plain device-to-device memcpy.  Half-precision fields on a full
// (even+odd) site subset are handled one parity at a time.
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src) {

  if (src.nSpin != 1 && src.nSpin != 4){
    errorQuda("nSpin(%d) not supported in function %s, line %d\n", src.nSpin, __FUNCTION__, __LINE__);  
  }
  // The half-precision conversion kernels operate on a single parity, so
  // recurse on the even/odd halves when a full field is involved.
  if ((dst.precision == QUDA_HALF_PRECISION || src.precision == QUDA_HALF_PRECISION) &&
      (dst.siteSubset == QUDA_FULL_SITE_SUBSET || src.siteSubset == QUDA_FULL_SITE_SUBSET)) {
    copyCuda(dst.Even(), src.Even());
    copyCuda(dst.Odd(), src.Odd());
    return;
  }

  // For a given dst precision, there are two non-trivial possibilities for the
  // src precision.  The higher one corresponds to kernel index 0 (in the table
  // of block and grid dimensions), while the lower one corresponds to index 1.
  // (Parentheses added to make the || / && grouping explicit; && binds more
  // tightly, so this matches the original behavior.)
  int id;
  if (src.Precision() == QUDA_DOUBLE_PRECISION ||
      (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION)) {
    id = 0;
  } else {
    id = 1;
  }
  setBlock(id, dst.stride, dst.Precision());

  blas_quda_bytes += src.real_length*((int)src.precision + (int)dst.precision);

  if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
      if (src.nSpin == 4){
          convertDSKernel<<<blasGrid, blasBlock>>>((double2*)dst.v, (float4*)src.v, src.stride);
      }else{ //src.nSpin == 1
          convertDSKernel<<<blasGrid, blasBlock>>>((double2*)dst.v, (float2*)src.v, src.stride);          
      }

  } else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
      if (src.nSpin == 4){
          convertSDKernel<<<blasGrid, blasBlock>>>((float4*)dst.v, (double2*)src.v, src.stride);
      }else{ //src.nSpin ==1
          convertSDKernel<<<blasGrid, blasBlock>>>((float2*)dst.v, (double2*)src.v, src.stride);
      }
  } else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
      blas_quda_bytes += src.volume*sizeof(float);  // account for the norm array read
      int spinor_bytes = src.length*sizeof(short);
      if (src.nSpin == 4){      
        cudaBindTexture(0, texHalf1, src.v, spinor_bytes); 
        cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/12);
        convertSHKernel<<<blasGrid, blasBlock>>>((float4*)dst.v, src.stride, src.volume);
        cudaUnbindTexture(texHalf1); 
        cudaUnbindTexture(texNorm1);
      }else{ //nSpin== 1;
        cudaBindTexture(0, texHalfSt1, src.v, spinor_bytes); 
        cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/3);
        convertSHKernel<<<blasGrid, blasBlock>>>((float2*)dst.v, src.stride, src.volume);
        cudaUnbindTexture(texHalfSt1); 
        cudaUnbindTexture(texNorm1);
      }
  } else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
    blas_quda_bytes += dst.volume*sizeof(float);  // account for the norm array written
    int spinor_bytes = src.length*sizeof(float);
    if (src.nSpin == 4){
        cudaBindTexture(0, xTexSingle4, src.v, spinor_bytes); 
        convertHSKernel<<<blasGrid, blasBlock>>>((short4*)dst.v, (float*)dst.norm, src.stride, src.volume);
        cudaUnbindTexture(xTexSingle4); 
    }else{ //nSpinr == 1
        cudaBindTexture(0, xTexSingle2, src.v, spinor_bytes); 
        convertHSKernel<<<blasGrid, blasBlock>>>((short2*)dst.v, (float*)dst.norm, src.stride, src.volume);     
        cudaUnbindTexture(xTexSingle2); 
    }
  } else if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
    blas_quda_bytes += src.volume*sizeof(float);
    int spinor_bytes = src.length*sizeof(short);
    if (src.nSpin == 4){
        cudaBindTexture(0, texHalf1, src.v, spinor_bytes); 
        cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/12);
        convertDHKernel<<<blasGrid, blasBlock>>>((double2*)dst.v, src.stride, src.volume);
        cudaUnbindTexture(texHalf1); 
        cudaUnbindTexture(texNorm1);
    }else{//nSpinr == 1
        cudaBindTexture(0, texHalfSt1, src.v, spinor_bytes); 
        cudaBindTexture(0, texNorm1, src.norm, spinor_bytes/3);
        convertDHKernelSt<<<blasGrid, blasBlock>>>((double2*)dst.v, src.stride, src.volume);
        cudaUnbindTexture(texHalfSt1); 
        cudaUnbindTexture(texNorm1);
    }
  } else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
    blas_quda_bytes += dst.volume*sizeof(float);
    int spinor_bytes = src.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, src.v, spinor_bytes); 
    if (src.nSpin == 4){
      convertHDKernel<<<blasGrid, blasBlock>>>((short4*)dst.v, (float*)dst.norm, src.stride, src.volume);
    }else{ //nSpinr == 1
      convertHDKernel<<<blasGrid, blasBlock>>>((short2*)dst.v, (float*)dst.norm, src.stride, src.volume);
    }
    cudaUnbindTexture(xTexDouble2); 
  } else {
    // same precision: straight device-to-device copy (plus norms for half)
    cudaMemcpy(dst.v, src.v, dst.bytes, cudaMemcpyDeviceToDevice);
    if (dst.precision == QUDA_HALF_PRECISION) {
      cudaMemcpy(dst.norm, src.norm, dst.bytes/(dst.nColor*dst.nSpin), cudaMemcpyDeviceToDevice);
      blas_quda_bytes += 2*dst.real_length*sizeof(float);
    }
  }
  
  cudaThreadSynchronize();
  if (!blasTuning) checkCudaError();

}
00932 
00933 
// y <- a*x + b*y, element-wise.  Grid-stride loop, so any launch
// configuration covers the full range.
template <typename Float, typename Float2>
__global__ void axpbyKernel(Float a, Float2 *x, Float b, Float2 *y, int length) {
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < length; idx += stride) {
    y[idx] = a*x[idx] + b*y[idx];
  }
}
00943 
// Half-precision y <- a*x + b*y for nSpin=4 fields.  x and y are read via
// texHalf1/texNorm1 and texHalf2/texNorm2 (bound by the caller); the macros
// define/update registers x0..x5 and y0..y5, and the result is repacked into
// yH plus the norm array yN.
__global__ void axpbyHKernel(float a, float b, short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites; stride is the padded site stride
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    AXPBY_FLOAT4(a, x0, b, y0);
    AXPBY_FLOAT4(a, x1, b, y1);
    AXPBY_FLOAT4(a, x2, b, y2);
    AXPBY_FLOAT4(a, x3, b, y3);
    AXPBY_FLOAT4(a, x4, b, y4);
    AXPBY_FLOAT4(a, x5, b, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
// Half-precision y <- a*x + b*y for nSpin=1 (staggered) fields, using the
// _ST macro variants over texHalfSt1/texHalfSt2 (registers x0..x2, y0..y2).
__global__ void axpbyHKernel(float a, float b, short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    AXPBY_FLOAT2(a, x0, b, y0);
    AXPBY_FLOAT2(a, x1, b, y1);
    AXPBY_FLOAT2(a, x2, b, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
00975 
00976 
// performs the operation y[i] = a*x[i] + b*y[i]
// Dispatches on precision: double/single launch axpbyKernel directly; half
// precision recurses per parity for full fields, then launches the
// texture-based axpbyHKernel.  Fix: validate the spinors (checkSpinor) before
// touching the launch configuration, matching the order used by every
// sibling wrapper (xpyCuda, axpyCuda, xpayCuda, mxpyCuda).
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y) {
  checkSpinor(x, y);
  setBlock(2, x.length, x.precision);   // kernel id 2 in the blas tuning table
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    axpbyKernel<<<blasGrid, blasBlock>>>(a, (double*)x.v, b, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpbyKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, (float)b, (float2*)y.v, x.length/2);
  } else {
    // half precision: kernels operate on one parity at a time
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      axpbyCuda(a, x.Even(), b, y.Even());
      axpbyCuda(a, x.Odd(), b, y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
        cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
        cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
        cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
        cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
        axpbyHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short4*)y.v, 
                                              (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1) {//staggered
        cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
        cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
        cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
        cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
        axpbyHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short2*)y.v, 
                                              (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm-array traffic
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 3*x.real_length;

  if (!blasTuning) checkCudaError();
}
01016 
// y <- x + y, element-wise, via a grid-stride loop.
template <typename Float>
__global__ void xpyKernel(Float *x, Float *y, int len) {
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += stride) {
    y[idx] += x[idx];
  }
}
01026 
// Half-precision y <- x + y for nSpin=4 fields; operands come from the
// texHalf1/texNorm1 and texHalf2/texNorm2 textures (bound by the caller),
// result repacked into yH/yN.
__global__ void xpyHKernel(short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    XPY_FLOAT4(x0, y0);
    XPY_FLOAT4(x1, y1);
    XPY_FLOAT4(x2, y2);
    XPY_FLOAT4(x3, y3);
    XPY_FLOAT4(x4, y4);
    XPY_FLOAT4(x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01044 
// Half-precision y <- x + y for nSpin=1 (staggered) fields (_ST macro
// variants over texHalfSt1/texHalfSt2).
__global__ void xpyHKernel(short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    XPY_FLOAT2(x0, y0);
    XPY_FLOAT2(x1, y1);
    XPY_FLOAT2(x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01059 
// performs the operation y[i] = x[i] + y[i]
// Precision dispatch: double/single launch xpyKernel directly; half
// precision recurses per parity for full fields, then uses textures.
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(3, x.length, x.precision);   // kernel id 3 in the blas tuning table
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    xpyKernel<<<blasGrid, blasBlock>>>((double*)x.v, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    // float2 view: half as many elements
    xpyKernel<<<blasGrid, blasBlock>>>((float2*)x.v, (float2*)y.v, x.length/2);
  } else {
    // half precision: kernels operate on a single parity
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      xpyCuda(x.Even(), y.Even());
      xpyCuda(x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      xpyHKernel<<<blasGrid, blasBlock>>>((short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      xpyHKernel<<<blasGrid, blasBlock>>>((short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm-array traffic
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += x.real_length;

  if (!blasTuning) checkCudaError();
}
01097 
// y <- a*x + y, element-wise, via a grid-stride loop.
template <typename Float, typename Float2>
__global__ void axpyKernel(Float a, Float2 *x, Float2 *y, int len) {
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += stride) {
    y[idx] += a*x[idx];
  }
}
01107 
// Half-precision y <- a*x + y for nSpin=4 fields; operands read from
// texHalf1/texNorm1 and texHalf2/texNorm2 (bound by the caller).
__global__ void axpyHKernel(float a, short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    AXPY_FLOAT4(a, x0, y0);
    AXPY_FLOAT4(a, x1, y1);
    AXPY_FLOAT4(a, x2, y2);
    AXPY_FLOAT4(a, x3, y3);
    AXPY_FLOAT4(a, x4, y4);
    AXPY_FLOAT4(a, x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01125 
01126 
// Half-precision y <- a*x + y for nSpin=1 (staggered) fields (_ST macros
// over texHalfSt1/texHalfSt2).
__global__ void axpyHKernel(float a, short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    AXPY_FLOAT2(a, x0, y0);
    AXPY_FLOAT2(a, x1, y1);
    AXPY_FLOAT2(a, x2, y2);    
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01141 
// performs the operation y[i] = a*x[i] + y[i]
// Precision dispatch mirrors xpyCuda; half precision recurses per parity
// for full fields, then uses the texture-based kernel.
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(4, x.length, x.precision);   // kernel id 4 in the blas tuning table
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    axpyKernel<<<blasGrid, blasBlock>>>(a, (double*)x.v, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpyKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, (float2*)y.v, x.length/2);
  } else {
    // half precision: kernels operate on a single parity
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      axpyCuda(a, x.Even(), y.Even());
      axpyCuda(a, x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      axpyHKernel<<<blasGrid, blasBlock>>>((float)a, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      axpyHKernel<<<blasGrid, blasBlock>>>((float)a, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm-array traffic
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 2*x.real_length;

  if (!blasTuning) checkCudaError();
}
01179 
// y <- x + a*y, element-wise, via a grid-stride loop.
template <typename Float, typename Float2>
__global__ void xpayKernel(const Float2 *x, Float a, Float2 *y, int len) {
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += stride) {
    y[idx] = x[idx] + a*y[idx];
  }
}
01189 
// Half-precision y <- x + a*y for nSpin=4 fields; operands read from
// texHalf1/texNorm1 and texHalf2/texNorm2 (bound by the caller).
__global__ void xpayHKernel(float a, short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    XPAY_FLOAT4(x0, a, y0);
    XPAY_FLOAT4(x1, a, y1);
    XPAY_FLOAT4(x2, a, y2);
    XPAY_FLOAT4(x3, a, y3);
    XPAY_FLOAT4(x4, a, y4);
    XPAY_FLOAT4(x5, a, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }   
}
01206 
// Half-precision y <- x + a*y for nSpin=1 (staggered) fields (_ST macros
// over texHalfSt1/texHalfSt2).
__global__ void xpayHKernel(float a, short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    XPAY_FLOAT2(x0, a, y0);
    XPAY_FLOAT2(x1, a, y1);
    XPAY_FLOAT2(x2, a, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }   
}
01220 
01221 
// performs the operation y[i] = x[i] + a*y[i]
// Precision dispatch mirrors the other wrappers; half precision recurses
// per parity for full fields before launching the texture kernel.
void xpayCuda(const cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(5, x.length, x.precision);   // kernel id 5 in the blas tuning table
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    xpayKernel<<<blasGrid, blasBlock>>>((double*)x.v, a, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    xpayKernel<<<blasGrid, blasBlock>>>((float2*)x.v, (float)a, (float2*)y.v, x.length/2);
  } else {
    // half precision: kernels operate on a single parity
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      xpayCuda(x.Even(), a, y.Even());
      xpayCuda(x.Odd(), a, y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      xpayHKernel<<<blasGrid, blasBlock>>>((float)a, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin ==1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      xpayHKernel<<<blasGrid, blasBlock>>>((float)a, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);      
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm-array traffic
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 2*x.real_length;

  if (!blasTuning) checkCudaError();
}
01259 
// y <- y - x, element-wise ("minus x plus y"), via a grid-stride loop.
template <typename Float>
__global__ void mxpyKernel(Float *x, Float *y, int len) {
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += stride) {
    y[idx] -= x[idx];
  }
}
01269 
// Half-precision y <- y - x for nSpin=4 fields; operands read from
// texHalf1/texNorm1 and texHalf2/texNorm2 (bound by the caller).
__global__ void mxpyHKernel(short4 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    MXPY_FLOAT4(x0, y0);
    MXPY_FLOAT4(x1, y1);
    MXPY_FLOAT4(x2, y2);
    MXPY_FLOAT4(x3, y3);
    MXPY_FLOAT4(x4, y4);
    MXPY_FLOAT4(x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01287 
// Half-precision y <- y - x for nSpin=1 (staggered) fields (_ST macros
// over texHalfSt1/texHalfSt2).
__global__ void mxpyHKernel(short2 *yH, float *yN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    MXPY_FLOAT2(x0, y0);
    MXPY_FLOAT2(x1, y1);
    MXPY_FLOAT2(x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01302 
01303 
// performs the operation y[i] -= x[i] (minus x plus y)
// Precision dispatch mirrors the other wrappers; half precision recurses
// per parity for full fields before launching the texture kernel.
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  setBlock(6, x.length, x.precision);   // kernel id 6 in the blas tuning table
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    mxpyKernel<<<blasGrid, blasBlock>>>((double*)x.v, (double*)y.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    mxpyKernel<<<blasGrid, blasBlock>>>((float2*)x.v, (float2*)y.v, x.length/2);
  } else {
    // half precision: kernels operate on a single parity
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      mxpyCuda(x.Even(), y.Even());
      mxpyCuda(x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      mxpyHKernel<<<blasGrid, blasBlock>>>((short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1) { //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      mxpyHKernel<<<blasGrid, blasBlock>>>((short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);      
    }
    blas_quda_bytes += 3*x.volume*sizeof(float);  // norm-array traffic
  }
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += x.real_length;

  if (!blasTuning) checkCudaError();
}
01341 
// x <- a*x, element-wise, via a grid-stride loop.
template <typename Float, typename Float2>
__global__ void axKernel(Float a, Float2 *x, int len) {
  unsigned int stride = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += stride) {
    x[idx] *= a;
  }
}
01351 
// Half-precision x <- a*x for nSpin=4 fields; x is read from
// texHalf1/texNorm1 (bound by the caller) and written back in place.
__global__ void axHKernel(float a, short4 *xH, float *xN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    AX_FLOAT4(a, x0); AX_FLOAT4(a, x1); AX_FLOAT4(a, x2);
    AX_FLOAT4(a, x3); AX_FLOAT4(a, x4); AX_FLOAT4(a, x5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(xH, xN, x, stride);
    i += gridSize;
  } 
  
}
01364 
// Half-precision x <- a*x for nSpin=1 (staggered) fields (_ST macros over
// texHalfSt1), written back in place.
__global__ void axHKernel(float a, short2 *xH, float *xN, int stride, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over sites
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    AX_FLOAT2(a, x0); AX_FLOAT2(a, x1); AX_FLOAT2(a, x2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(xH, xN, x, stride);
    i += gridSize;
  } 
  
}
01376 
// performs the operation x[i] = a*x[i]
// Precision dispatch mirrors the other wrappers; half precision recurses
// per parity for full fields before launching the texture kernel.
void axCuda(const double &a, cudaColorSpinorField &x) {
  setBlock(7, x.length, x.precision);   // kernel id 7 in the blas tuning table
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    axKernel<<<blasGrid, blasBlock>>>(a, (double*)x.v, x.length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, x.length/2);
  } else {
    // half precision: kernels operate on a single parity
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      axCuda(a, x.Even());
      axCuda(a, x.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      axHKernel<<<blasGrid, blasBlock>>>((float)a, (short4*)x.v, (float*)x.norm, x.stride, x.volume);
    }else if (x.nSpin ==1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      axHKernel<<<blasGrid, blasBlock>>>((float)a, (short2*)x.v, (float*)x.norm, x.stride, x.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);           
    }
    blas_quda_bytes += 2*x.volume*sizeof(float);  // norm-array traffic
  }
  blas_quda_bytes += 2*x.real_length*x.precision;
  blas_quda_flops += x.real_length;

  if (!blasTuning) checkCudaError();
}
01409 
// Complex axpy, double-precision path: y[i] += a * x[i] with complex a and
// complex elements stored as (x = Re, y = Im) pairs.
// NOTE(review): READ_DOUBLE2_TEXTURE presumably reads x[i] through a bound
// double2 texture (macro defined elsewhere); the caller binds it.
template <typename Float2>
__global__ void caxpyDKernel(Float2 a, Float2 *x, Float2 *y, int len) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over complex elements
  while (i < len) {
    Float2 Z = READ_DOUBLE2_TEXTURE(x, i);
    // complex multiply-accumulate: y += a*Z
    y[i].x += a.x*Z.x - a.y*Z.y;
    y[i].y += a.y*Z.x + a.x*Z.y;
    i += gridSize;
  } 
  
}
01423 
// Complex axpy, single-precision path: y[i] += a * x[i] with complex a.
// read_Float2 is the non-texture (or float-texture) load helper defined
// elsewhere.
template <typename Float2>
__global__ void caxpySKernel(Float2 a, Float2 *x, Float2 *y, int len) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  // grid-stride loop over complex elements
  while (i < len) {
    Float2 Z = read_Float2(x, i);
    // complex multiply-accumulate: y += a*Z
    y[i].x += a.x*Z.x - a.y*Z.y;
    y[i].y += a.y*Z.x + a.x*Z.y;
    i += gridSize;
  } 
  
}
01437 
// Half-precision caxpy for the Wilson (nSpin=4) layout: y[i] += a*x[i].
// x and y are reconstructed to single precision from the half textures
// (texHalf1/texNorm1 and texHalf2/texNorm2); the updated y is converted
// back to half and stored through yH (data) and yN (norms).
__global__ void caxpyHKernel(float2 a, short4 *yH, float *yN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    // declares x0..x5 / y0..y5: six float4 components per site, as used below
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    CAXPY_FLOAT4(a, x0, y0);
    CAXPY_FLOAT4(a, x1, y1);
    CAXPY_FLOAT4(a, x2, y2);
    CAXPY_FLOAT4(a, x3, y3);
    CAXPY_FLOAT4(a, x4, y4);
    CAXPY_FLOAT4(a, x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01456 
// Half-precision caxpy for the staggered (nSpin=1) layout: y[i] += a*x[i].
// Same scheme as the Wilson variant but with three float2 components per
// site (x0..x2 / y0..y2) read from texHalfSt1/texHalfSt2.
__global__ void caxpyHKernel(float2 a, short2 *yH, float *yN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    CAXPY_FLOAT2(a, x0, y0);
    CAXPY_FLOAT2(a, x1, y1);
    CAXPY_FLOAT2(a, x2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  } 
  
}
01472 
// performs the operation y[i] += a*x[i]
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  int length = x.length/2;           // number of complex elements
  setBlock(8, length, x.precision);  // kernel id 8: tuned launch geometry (see blas_param.h)
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    double2 a2 = make_double2(real(a), imag(a));
    caxpyDKernel<<<blasGrid, blasBlock>>>(a2, (double2*)x.v, (double2*)y.v, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    caxpySKernel<<<blasGrid, blasBlock>>>(a2, (float2*)x.v, (float2*)y.v, length);
  } else {
    // half precision: a full field is handled parity by parity.  The
    // recursive calls do their own flop/byte accounting, so we must
    // return before the counter updates below (previously the counters
    // were incremented before this check, double-counting full fields;
    // axCuda counts after the recursion, which is the pattern used here).
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      caxpyCuda(a, x.Even(), y.Even());
      caxpyCuda(a, x.Odd(), y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); // one norm float per site
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      float2 a2 = make_float2(real(a), imag(a));
      caxpyHKernel<<<blasGrid, blasBlock>>>(a2, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      float2 a2 = make_float2(real(a), imag(a));
      caxpyHKernel<<<blasGrid, blasBlock>>>(a2, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);     
    }
    blas_quda_bytes += 3*x.volume*sizeof(float); // extra traffic for the norm arrays
  }
  // accounting: read x, read y, write y; done after the parity recursion
  // so full half-precision fields are counted exactly once
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 4*x.real_length;

  if (!blasTuning) checkCudaError();
}
01518 
// Complex axpby in double precision: y[i] = a*x[i] + b*y[i].
// Both operands are fetched through double2 textures; the summation
// order matches the original expression exactly.
template <typename Float2>
__global__ void caxpbyDKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, int len) {

  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = READ_DOUBLE2_TEXTURE(x, idx);
    Float2 yv = READ_DOUBLE2_TEXTURE(y, idx);
    y[idx].x = a.x*xv.x + b.x*yv.x - a.y*xv.y - b.y*yv.y;
    y[idx].y = a.y*xv.x + b.y*yv.x + a.x*xv.y + b.x*yv.y;
  }
}
01532 
// Complex axpby in single precision: y[i] = a*x[i] + b*y[i].
template <typename Float2>
__global__ void caxpbySKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, int len) {

  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = read_Float2(x, idx);
    Float2 yv = read_Float2(y, idx);
    y[idx].x = a.x*xv.x + b.x*yv.x - a.y*xv.y - b.y*yv.y;
    y[idx].y = a.y*xv.x + b.y*yv.x + a.x*xv.y + b.x*yv.y;
  }
}
01546 
// Half-precision caxpby for the Wilson (nSpin=4) layout:
// y[i] = a*x[i] + b*y[i].  Operands come from texHalf1/texNorm1 and
// texHalf2/texNorm2, are promoted to single precision, combined, and
// y is stored back in half precision through yH/yN.
__global__ void caxpbyHKernel(float2 a, float2 b, short4 *yH, float *yN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    // declares x0..x5 / y0..y5 float4 components for this site
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    CAXPBY_FLOAT4(a, x0, b, y0);
    CAXPBY_FLOAT4(a, x1, b, y1);
    CAXPBY_FLOAT4(a, x2, b, y2);
    CAXPBY_FLOAT4(a, x3, b, y3);
    CAXPBY_FLOAT4(a, x4, b, y4);
    CAXPBY_FLOAT4(a, x5, b, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }   
}
01564 
// Half-precision caxpby for the staggered (nSpin=1) layout:
// y[i] = a*x[i] + b*y[i]; three float2 components per site.
__global__ void caxpbyHKernel(float2 a, float2 b, short2 *yH, float *yN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    CAXPBY_FLOAT2(a, x0, b, y0);
    CAXPBY_FLOAT2(a, x1, b, y1);
    CAXPBY_FLOAT2(a, x2, b, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }   
}
01579 
01580 
// performs the operation y[i] = a*x[i] + b*y[i]
// (header comment previously said "c*x[i]" -- the coefficient is `a`)
void caxpbyCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y) {
  checkSpinor(x,y);
  int length = x.length/2;           // number of complex elements
  setBlock(9, length, x.precision);  // kernel id 9: tuned launch geometry (see blas_param.h)
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    caxpbyDKernel<<<blasGrid, blasBlock>>>(a2, (double2*)x.v, b2, (double2*)y.v, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    caxpbySKernel<<<blasGrid, blasBlock>>>(a2, (float2*)x.v, b2, (float2*)y.v, length);
  } else {
    // half precision: recurse into the two parities.  The recursive calls
    // do their own accounting, so return before the counter updates below
    // (they were previously incremented before this check, double-counting
    // full-subset fields; axCuda counts after the recursion instead).
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      caxpbyCuda(a, x.Even(), b, y.Even());
      caxpbyCuda(a, x.Odd(), b, y.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); // one norm float per site
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      caxpbyHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short4*)y.v, (float*)y.norm, y.stride, y.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      caxpbyHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short2*)y.v, (float*)y.norm, y.stride, y.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);     
    }
    blas_quda_bytes += 3*x.volume*sizeof(float); // extra traffic for the norm arrays
  }
  // accounting after the parity recursion so full fields count only once
  blas_quda_bytes += 3*x.real_length*x.precision;
  blas_quda_flops += 7*x.real_length;

  if (!blasTuning) checkCudaError();
}
01630 
// Complex z[i] = x[i] + a*y[i] + b*z[i] in double precision.
// x and y are fetched through textures, z directly; the accumulation
// order is identical to the original implementation.
template <typename Float2>
__global__ void cxpaypbzDKernel(Float2 *x, Float2 a, Float2 *y, Float2 b, Float2 *z, int len) {

  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 acc = READ_DOUBLE2_TEXTURE(x, idx);
    Float2 yv  = READ_DOUBLE2_TEXTURE(y, idx);
    Float2 zv  = read_Float2(z, idx);

    // acc += a*y
    acc.x += a.x*yv.x - a.y*yv.y;
    acc.y += a.y*yv.x + a.x*yv.y;
    // acc += b*z
    acc.x += b.x*zv.x - b.y*zv.y;
    acc.y += b.y*zv.x + b.x*zv.y;

    z[idx] = make_Float2(acc);
  }

}
01651 
// Complex z[i] = x[i] + a*y[i] + b*z[i] in single precision.
template <typename Float2>
__global__ void cxpaypbzSKernel(Float2 *x, Float2 a, Float2 *y, Float2 b, Float2 *z, int len) {

  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 acc = read_Float2(x, idx);
    Float2 yv  = read_Float2(y, idx);
    Float2 zv  = read_Float2(z, idx);

    // acc += a*y
    acc.x += a.x*yv.x - a.y*yv.y;
    acc.y += a.y*yv.x + a.x*yv.y;
    // acc += b*z
    acc.x += b.x*zv.x - b.y*zv.y;
    acc.y += b.y*zv.x + b.x*zv.y;

    z[idx] = make_Float2(acc);
  }

}
01672 
// Half-precision Wilson (nSpin=4) version of z[i] = x[i] + a*y[i] + b*z[i].
// x, y, z are reconstructed to single precision from texHalf1/2/3 with
// norms texNorm1/2/3; the result is written back in half through zH/zN.
__global__ void cxpaypbzHKernel(float2 a, float2 b, short4 *zH, float *zN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    CXPAYPBZ_FLOAT4(x0, a, y0, b, z0);
    CXPAYPBZ_FLOAT4(x1, a, y1, b, z1);
    CXPAYPBZ_FLOAT4(x2, a, y2, b, z2);
    CXPAYPBZ_FLOAT4(x3, a, y3, b, z3);
    CXPAYPBZ_FLOAT4(x4, a, y4, b, z4);
    CXPAYPBZ_FLOAT4(x5, a, y5, b, z5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(zH, zN, z, stride);
    i += gridSize;
  }   
}
01691 
01692 
// Half-precision staggered (nSpin=1) version of z[i] = x[i] + a*y[i] + b*z[i].
__global__ void cxpaypbzHKernel(float2 a, float2 b, short2 *zH, float *zN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    CXPAYPBZ_FLOAT2(x0, a, y0, b, z0);
    CXPAYPBZ_FLOAT2(x1, a, y1, b, z1);
    CXPAYPBZ_FLOAT2(x2, a, y2, b, z2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(zH, zN, z, stride);
    i += gridSize;
  }   
}
01708 
01709 
// performs the operation z[i] = x[i] + a*y[i] + b*z[i]
void cxpaypbzCuda(cudaColorSpinorField &x, const Complex &a, cudaColorSpinorField &y, 
                  const Complex &b, cudaColorSpinorField &z) {
  checkSpinor(x,y);
  checkSpinor(x,z);
  int length = x.length/2;            // number of complex elements
  setBlock(10, length, x.precision);  // kernel id 10: tuned launch geometry (see blas_param.h)
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    cxpaypbzDKernel<<<blasGrid, blasBlock>>>((double2*)x.v, a2, (double2*)y.v, b2, (double2*)z.v, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    cxpaypbzSKernel<<<blasGrid, blasBlock>>>((float2*)x.v, a2, (float2*)y.v, b2, (float2*)z.v, length);
  } else {
    // half precision: recurse into the two parities.  The recursive calls
    // do their own accounting, so return before the counter updates below
    // (previously the counters were incremented before this check,
    // double-counting full-subset fields).
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      cxpaypbzCuda(x.Even(), a, y.Even(), b, z.Even());
      cxpaypbzCuda(x.Odd(), a, y.Odd(), b, z.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 4*x.volume*sizeof(float); // extra traffic for the norm arrays
    if (x.nSpin ==4 ){//wilson 
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); // one norm float per site
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);    
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      cxpaypbzHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short4*)z.v, (float*)z.norm, z.stride, z.volume);
    } else if (x.nSpin ==1 ){//staggered 
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);    
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      cxpaypbzHKernel<<<blasGrid, blasBlock>>>(a2, b2, (short2*)z.v, (float*)z.norm, z.stride, z.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);           
    }
  }
  // accounting after the parity recursion so full fields count only once
  blas_quda_bytes += 4*x.real_length*x.precision;
  blas_quda_flops += 8*x.real_length;

  if (!blasTuning) checkCudaError();
}
01765 
01766 
// Double-precision fused update: {y[i] += a*x[i]; x[i] = b*z[i] + c*x[i]}.
// x and z are fetched through textures, so the overwrite of x uses the
// pre-update x value, matching the original ordering.
template <typename Float, typename Float2>
__global__ void axpyBzpcxDKernel(Float a, Float2 *x, Float2 *y, Float b, Float2 *z, Float c, int len)
{
  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = READ_DOUBLE2_TEXTURE(x, idx);
    Float2 zv = READ_DOUBLE2_TEXTURE(z, idx);

    y[idx].x += a*xv.x;
    y[idx].y += a*xv.y;

    x[idx].x = b*zv.x + c*xv.x;
    x[idx].y = b*zv.y + c*xv.y;
  }
}
01785 
01786 
// Single-precision fused update: {y[i] += a*x[i]; x[i] = b*z[i] + c*x[i]}.
// x is loaded into a register before being overwritten, so the y update
// sees the original x value.
template <typename Float, typename Float2>
__global__ void axpyBzpcxSKernel(Float a, Float2 *x, Float2 *y, Float b, Float2 *z, Float c, int len)
{
  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = read_Float2(x, idx);
    Float2 zv = read_Float2(z, idx);

    y[idx].x += a*xv.x;
    y[idx].y += a*xv.y;

    x[idx].x = b*zv.x + c*xv.x;
    x[idx].y = b*zv.y + c*xv.y;
  }
}
01805 
// Half-precision Wilson (nSpin=4) fused update:
// {y[i] += a*x[i]; x[i] = b*z[i] + c*x[i]}.  x, y, z are reconstructed
// from texHalf1/2/3 (+texNorm1/2/3); both x and y are written back in
// half precision.  For each component the AXPY runs before the AXPBY,
// so the y update uses the pre-update x value.
__global__ void axpyBzpcxHKernel(float a, float b, float c, short4 *xH, float *xN, short4 *yH, float *yN, int stride, int length)
{
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    AXPY_FLOAT4(a, x0, y0);
    AXPBY_FLOAT4(b, z0, c, x0);
    AXPY_FLOAT4(a, x1, y1);
    AXPBY_FLOAT4(b, z1, c, x1);
    AXPY_FLOAT4(a, x2, y2);
    AXPBY_FLOAT4(b, z2, c, x2);
    AXPY_FLOAT4(a, x3, y3);
    AXPBY_FLOAT4(b, z3, c, x3);
    AXPY_FLOAT4(a, x4, y4);
    AXPBY_FLOAT4(b, z4, c, x4);
    AXPY_FLOAT4(a, x5, y5);
    AXPBY_FLOAT4(b, z5, c, x5);    
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(xH, xN, x, stride);
    i += gridSize;
  }
}
01832 
01833 
// Half-precision staggered (nSpin=1) fused update:
// {y[i] += a*x[i]; x[i] = b*z[i] + c*x[i]}.  Per component the AXPY runs
// before the AXPBY so y sees the pre-update x.
__global__ void axpyBzpcxHKernel(float a, float b, float c, short2 *xH, float *xN, short2 *yH, float *yN, int stride, int length)
{
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    AXPY_FLOAT2(a, x0, y0);
    AXPBY_FLOAT2(b, z0, c, x0);
    AXPY_FLOAT2(a, x1, y1);
    AXPBY_FLOAT2(b, z1, c, x1);
    AXPY_FLOAT2(a, x2, y2);
    AXPBY_FLOAT2(b, z2, c, x2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(xH, xN, x, stride);
    i += gridSize;
  }
}
01854 
// performs the operations: {y[i] = a*x[i] + y[i]; x[i] = b*z[i] + c*x[i]}
void axpyBzpcxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y, const double &b, 
                   cudaColorSpinorField& z, const double &c) 
{
  checkSpinor(x,y);
  checkSpinor(x,z);
  // kernel id 11: tuned launch geometry.  NOTE(review): this passes
  // x.length where the caxpy-style wrappers pass x.length/2 -- presumably
  // deliberate (per-kernel tuning key), but worth confirming.
  setBlock(11, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    // x and z are read through double2 textures inside the kernel
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes); 
    axpyBzpcxDKernel<<<blasGrid, blasBlock>>>(a, (double2*)x.v, (double2*)y.v, b, (double2*)z.v, c, x.length/2);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpyBzpcxSKernel<<<blasGrid, blasBlock>>>((float)a, (float2*)x.v, (float2*)y.v, (float)b, (float2*)z.v, (float)c, x.length/2);
  } else {
    // half precision: full fields are processed parity by parity; the
    // recursive calls do their own flop/byte accounting
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET){
      axpyBzpcxCuda(a, x.Even(), y.Even(), b, z.Even(), c);
      axpyBzpcxCuda(a, x.Odd(), y.Odd(), b, z.Odd(), c);
      return ;
    }

    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); // one norm float per site
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);
      axpyBzpcxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (float)c, (short4*)x.v, (float*)x.norm,
                                              (short4*)y.v, (float*)y.norm, z.stride, z.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes);
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes);
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes);
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);
      axpyBzpcxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (float)c, (short2*)x.v, (float*)x.norm,
                                              (short2*)y.v, (float*)y.norm, z.stride, z.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);    
    }
    blas_quda_bytes += 5*x.volume*sizeof(float); // extra traffic for the norm arrays
  }
  // accounting: read x, z; read+write y; write x -- done after the parity
  // recursion so full half-precision fields are counted once
  blas_quda_bytes += 5*x.real_length*x.precision;
  blas_quda_flops += 10*x.real_length;

  if (!blasTuning) checkCudaError();
}
01905 
01906 
01907 
// Double-precision fused update: {y[i] += a*x[i]; x[i] = z[i] + b*x[i]}.
// x and z come through textures, so the overwrite of x uses the value
// read before the update, exactly as in the original.
template <typename Float, typename Float2>
__global__ void axpyZpbxDKernel(Float a, Float2 *x, Float2 *y, Float2 *z, Float b, int len) {
  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = READ_DOUBLE2_TEXTURE(x, idx);
    Float2 zv = READ_DOUBLE2_TEXTURE(z, idx);
    y[idx].x += a*xv.x;
    y[idx].y += a*xv.y;
    x[idx].x = zv.x + b*xv.x;
    x[idx].y = zv.y + b*xv.y;
  }
}
01922 
// Single-precision fused update: {y[i] += a*x[i]; x[i] = z[i] + b*x[i]}.
template <typename Float, typename Float2>
__global__ void axpyZpbxSKernel(Float a, Float2 *x, Float2 *y, Float2 *z, Float b, int len) {
  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = read_Float2(x, idx);
    Float2 zv = read_Float2(z, idx);
    y[idx].x += a*xv.x;
    y[idx].y += a*xv.y;
    x[idx].x = zv.x + b*xv.x;
    x[idx].y = zv.y + b*xv.y;
  }
}
01937 
// Half-precision Wilson (nSpin=4) fused update:
// {y[i] += a*x[i]; x[i] = z[i] + b*x[i]}.  All AXPYs complete and y is
// stored before z is reconstructed -- presumably to limit live registers
// (x,y,z together would be 18 float4s); TODO confirm.
__global__ void axpyZpbxHKernel(float a, float b, short4 *xH, float *xN, short4 *yH, float *yN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    AXPY_FLOAT4(a, x0, y0);
    AXPY_FLOAT4(a, x1, y1);
    AXPY_FLOAT4(a, x2, y2);
    AXPY_FLOAT4(a, x3, y3);
    AXPY_FLOAT4(a, x4, y4);
    AXPY_FLOAT4(a, x5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    XPAY_FLOAT4(z0, b, x0);
    XPAY_FLOAT4(z1, b, x1);
    XPAY_FLOAT4(z2, b, x2);
    XPAY_FLOAT4(z3, b, x3);
    XPAY_FLOAT4(z4, b, x4);
    XPAY_FLOAT4(z5, b, x5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(xH, xN, x, stride);
    i += gridSize;
  }   
}
01963 
// Half-precision staggered (nSpin=1) fused update:
// {y[i] += a*x[i]; x[i] = z[i] + b*x[i]}.  AXPY and XPAY are interleaved
// per component; each XPAY overwrites an x component only after the
// matching AXPY has consumed it.
__global__ void axpyZpbxHKernel(float a, float b, short2 *xH, float *xN, short2 *yH, float *yN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    AXPY_FLOAT2(a, x0, y0);
    XPAY_FLOAT2(z0, b, x0);
    AXPY_FLOAT2(a, x1, y1);
    XPAY_FLOAT2(z1, b, x1);
    AXPY_FLOAT2(a, x2, y2);
    XPAY_FLOAT2(z2, b, x2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(xH, xN, x, stride);
    i += gridSize;
  }   
}
01983 
01984 
// performs the operations: {y[i] = a*x[i] + y[i]; x[i] = z[i] + b*x[i]}
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, 
                  cudaColorSpinorField &z, const double &b) {
  checkSpinor(x,y);
  checkSpinor(x,z);
  // kernel id 12: tuned launch geometry.  NOTE(review): passes x.length,
  // not x.length/2 as the complex wrappers do -- presumably intentional.
  setBlock(12, x.length, x.precision);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    // x and z are read through double2 textures inside the kernel
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes); 
    axpyZpbxDKernel<<<blasGrid, blasBlock>>>
      (a, (double2*)x.v, (double2*)y.v, (double2*)z.v, b, x.length/2);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    axpyZpbxSKernel<<<blasGrid, blasBlock>>>
      ((float)a, (float2*)x.v, (float2*)y.v, (float2*)z.v, (float)b, x.length/2);
  } else {
    // half precision: full fields are processed parity by parity; the
    // recursive calls do their own flop/byte accounting
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      axpyZpbxCuda(a, x.Even(), y.Even(), z.Even(), b);
      axpyZpbxCuda(a, x.Odd(), y.Odd(), z.Odd(), b);
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    if (x.nSpin ==4){ //wilson 
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12); // one norm float per site
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);    
      axpyZpbxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short4*)x.v, (float*)x.norm,
                                               (short4*)y.v, (float*)y.norm, z.stride, z.volume);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);    
      axpyZpbxHKernel<<<blasGrid, blasBlock>>>((float)a, (float)b, (short2*)x.v, (float*)x.norm,
                                               (short2*)y.v, (float*)y.norm, z.stride, z.volume);
    }else{
      errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);           
    }
    blas_quda_bytes += 5*x.volume*sizeof(float); // extra traffic for the norm arrays
  }
  // accounting after the parity recursion so full fields count only once
  blas_quda_bytes += 5*x.real_length*x.precision;
  blas_quda_flops += 8*x.real_length;

  if (!blasTuning) checkCudaError();
}
02035 
// Double-precision complex fused update:
// {z[i] += a*x[i] + b*y[i]; y[i] -= b*w[i]}.
// x and y come through textures; z and w via read_Float2.  The statement
// order matches the original, so y's update uses its pre-store value.
template <typename Float2>
__global__ void caxpbypzYmbwDKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, int len) {

  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = READ_DOUBLE2_TEXTURE(x, idx);
    Float2 zv = read_Float2(z, idx);

    // z += a*x
    zv.x += a.x*xv.x - a.y*xv.y;
    zv.y += a.y*xv.x + a.x*xv.y;

    // z += b*y
    Float2 yv = READ_DOUBLE2_TEXTURE(y, idx);
    zv.x += b.x*yv.x - b.y*yv.y;
    zv.y += b.y*yv.x + b.x*yv.y;
    z[idx] = make_Float2(zv);

    // y -= b*w
    Float2 wv = read_Float2(w, idx);
    yv.x -= b.x*wv.x - b.y*wv.y;
    yv.y -= b.y*wv.x + b.x*wv.y;
    y[idx] = make_Float2(yv);
  }
}
02062 
// Single-precision complex fused update:
// {z[i] += a*x[i] + b*y[i]; y[i] -= b*w[i]}.
template <typename Float2>
__global__ void caxpbypzYmbwSKernel(Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, int len) {

  const unsigned int nthreads = gridDim.x*blockDim.x;
  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < len; idx += nthreads) {
    Float2 xv = read_Float2(x, idx);
    Float2 zv = read_Float2(z, idx);

    // z += a*x
    zv.x += a.x*xv.x - a.y*xv.y;
    zv.y += a.y*xv.x + a.x*xv.y;

    // z += b*y
    Float2 yv = read_Float2(y, idx);
    zv.x += b.x*yv.x - b.y*yv.y;
    zv.y += b.y*yv.x + b.x*yv.y;
    z[idx] = make_Float2(zv);

    // y -= b*w
    Float2 wv = read_Float2(w, idx);
    yv.x -= b.x*wv.x - b.y*wv.y;
    yv.y -= b.y*wv.x + b.x*wv.y;
    y[idx] = make_Float2(yv);
  }
}
02089 
// Half-precision Wilson (nSpin=4) fused update:
// {z[i] += a*x[i] + b*y[i]; y[i] -= b*w[i]}.
// x, y, z are reconstructed from texHalf1/2/3; w is read raw via
// READ_HALF_SPINOR from texHalf4, which also yields wc (presumably w's
// norm factor -- TODO confirm against the macro definition), folded into
// b2 = -wc*b so the CAXPYs implement y -= b*w.
// NOTE(review): parameters xN and wN are not referenced in this body;
// the norms appear to arrive via textures -- confirm before removing.
__global__ void caxpbypzYmbwHKernel(float2 a, float2 b, float *xN, short4 *yH, float *yN, 
                                    short4 *zH, float *zN, float *wN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);
    CAXPBYPZ_FLOAT4(a, x0, b, y0, z0);
    CAXPBYPZ_FLOAT4(a, x1, b, y1, z1);
    CAXPBYPZ_FLOAT4(a, x2, b, y2, z2);
    CAXPBYPZ_FLOAT4(a, x3, b, y3, z3);
    CAXPBYPZ_FLOAT4(a, x4, b, y4, z4);
    CAXPBYPZ_FLOAT4(a, x5, b, y5, z5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(zH, zN, z, stride);
    READ_HALF_SPINOR(w, texHalf4, stride);
    float2 b2 = -wc*b; 
    CAXPY_FLOAT4(b2, w0, y0);
    CAXPY_FLOAT4(b2, w1, y1);
    CAXPY_FLOAT4(b2, w2, y2);
    CAXPY_FLOAT4(b2, w3, y3);
    CAXPY_FLOAT4(b2, w4, y4);
    CAXPY_FLOAT4(b2, w5, y5);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
    i += gridSize;
  }   
}
02118 
// Half-precision staggered (nSpin=1) fused update:
// {z[i] += a*x[i] + b*y[i]; y[i] -= b*w[i]}.  w is read raw via
// READ_HALF_SPINOR_ST (texHalfSt4); wc (presumably its norm factor --
// TODO confirm) is folded into b2 = -wc*b so the CAXPYs give y -= b*w.
// NOTE(review): xN and wN are unused in this body; confirm before removing.
__global__ void caxpbypzYmbwHKernel(float2 a, float2 b, float *xN, short2 *yH, float *yN, 
                                    short2 *zH, float *zN, float *wN, int stride, int length) {
  
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) {
    RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);
    RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);
    RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);
    CAXPBYPZ_FLOAT2(a, x0, b, y0, z0);
    CAXPBYPZ_FLOAT2(a, x1, b, y1, z1);
    CAXPBYPZ_FLOAT2(a, x2, b, y2, z2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(zH, zN, z, stride);
    READ_HALF_SPINOR_ST(w, texHalfSt4, stride);
    float2 b2 = -wc*b; 
    CAXPY_FLOAT2(b2, w0, y0);
    CAXPY_FLOAT2(b2, w1, y1);
    CAXPY_FLOAT2(b2, w2, y2);
    CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
    i += gridSize;
  }   
}
02141 
// performs the operation z[i] = a*x[i] + b*y[i] + z[i] and y[i] -= b*w[i]
//
// Dispatches on x.precision (double / single / half).  In half precision
// the whole fused operation is done by one kernel reading x,y,z,w through
// textures; a full (even+odd) half-precision field is handled by recursing
// on the two parity subsets.  Updates the global blas flop/byte counters.
void caxpbypzYmbwCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y,
                      cudaColorSpinorField &z, cudaColorSpinorField &w) {
  // all four fields must match in size/precision/layout
  checkSpinor(x,y);
  checkSpinor(x,z);
  checkSpinor(x,w);
  int length = x.length/2; // number of complex elements (length counts reals)
  setBlock(13, length, x.precision); // kernel id 13: pick tuned blasGrid/blasBlock
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes); 
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    caxpbypzYmbwDKernel<<<blasGrid, blasBlock>>>(a2, (double2*)x.v, b2, (double2*)y.v, 
                                          (double2*)z.v, (double2*)w.v, length); 
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    caxpbypzYmbwSKernel<<<blasGrid, blasBlock>>>(a2, (float2*)x.v, b2, (float2*)y.v, 
                                                 (float2*)z.v, (float2*)w.v, length); 
  } else {
    // half precision: recurse on parities for a full field (the recursive
    // calls do their own flop/byte accounting, hence the early return)
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) {
      caxpbypzYmbwCuda(a, x.Even(), b, y.Even(), z.Even(), w.Even());
      caxpbypzYmbwCuda(a, x.Odd(), b, y.Odd(), z.Odd(), w.Odd());
      return;
    }
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 6*x.volume*sizeof(float); // traffic on the six norm arrays
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    if (x.nSpin == 4){ //wilson
      // norm arrays hold one float per site; spinor_bytes/12 matches
      // 24 shorts/site for a Wilson spinor — presumably kept in sync with
      // the field layout (see color_spinor_field) — TODO confirm
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf4, w.v, spinor_bytes); 
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/12); 
      // overload resolution on short4* selects the Wilson kernel
      caxpbypzYmbwHKernel<<<blasGrid, blasBlock>>>(a2, b2, (float*)x.norm, (short4*)y.v, (float*)y.norm,
                                                   (short4*)z.v, (float*)z.norm, (float*)w.norm,
                                                   z.stride, z.volume);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt4, w.v, spinor_bytes);
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/3); 
      // overload resolution on short2* selects the staggered kernel
      caxpbypzYmbwHKernel<<<blasGrid, blasBlock>>>(a2, b2, (float*)x.norm, (short2*)y.v, (float*)y.norm,
                                                   (short2*)z.v, (float*)z.norm, (float*)w.norm,
                                                   z.stride, z.volume);
    }else{
     errorQuda("ERROR: nSpin=%d is not supported\n", x.nSpin);                 
    }
  }
  blas_quda_bytes += 6*x.real_length*x.precision; // 4 reads + 2 writes of the fields
  blas_quda_flops += 12*x.real_length;

  // during blas tuning, launch failures are expected and checked elsewhere
  if (!blasTuning) checkCudaError();
}
02207 
02208 
// Computes c = a + b in "double single" precision.
// (a0,a1) is the head/tail pair of a; b is supplied as two floats (b0,b1).
// The renormalized head/tail result is written to (c0,c1).
__device__ void dsadd(volatile QudaSumFloat &c0, volatile QudaSumFloat &c1, const volatile QudaSumFloat &a0, 
                      const volatile QudaSumFloat &a1, const float b0, const float b1) {
  // Knuth two-sum: recover the rounding error of the head addition exactly,
  // then fold both tails into it.
  QudaSumFloat head = a0 + b0;
  QudaSumFloat err = head - a0;
  QudaSumFloat tail = ((b0 - err) + (a0 - (head - err))) + a1 + b1;
  // Renormalize: c0 carries the leading bits, c1 the residue.
  c0 = err = head + tail;
  c1 = tail - (err - head);
}
02220 
// Computes c = a + b in "double single" precision (complex version).
// Applies the Knuth two-sum independently to the real (.x) and imaginary
// (.y) components of the head/tail pairs (a0,a1) and (b0,b1), writing the
// renormalized result to (c0,c1).
__device__ void zcadd(volatile QudaSumComplex &c0, volatile QudaSumComplex &c1, const volatile QudaSumComplex &a0, 
                      const volatile QudaSumComplex &a1, const volatile QudaSumComplex &b0, const volatile QudaSumComplex &b1) {
  // real component: two-sum of the heads, tails folded into the error term
  QudaSumFloat head = a0.x + b0.x;
  QudaSumFloat err = head - a0.x;
  QudaSumFloat tail = ((b0.x - err) + (a0.x - (head - err))) + a1.x + b1.x;
  // renormalize into head/tail form
  c0.x = err = head + tail;
  c1.x = tail - (err - head);
  
  // imaginary component: identical treatment
  head = a0.y + b0.y;
  err = head - a0.y;
  tail = ((b0.y - err) + (a0.y - (head - err))) + a1.y + b1.y;
  c0.y = err = head + tail;
  c1.y = tail - (err - head);
}
02240 
// Computes c = a + b in "double single" precision (float3 version).
// Applies the Knuth two-sum independently to the .x, .y and .z components
// of the head/tail pairs (a0,a1) and (b0,b1), writing the renormalized
// result to (c0,c1).
__device__ void dsadd3(volatile QudaSumFloat3 &c0, volatile QudaSumFloat3 &c1, const volatile QudaSumFloat3 &a0, 
                       const volatile QudaSumFloat3 &a1, const volatile QudaSumFloat3 &b0, const volatile QudaSumFloat3 &b1) {
  // x component: two-sum of the heads, tails folded into the error term
  QudaSumFloat head = a0.x + b0.x;
  QudaSumFloat err = head - a0.x;
  QudaSumFloat tail = ((b0.x - err) + (a0.x - (head - err))) + a1.x + b1.x;
  // renormalize into head/tail form
  c0.x = err = head + tail;
  c1.x = tail - (err - head);
  
  // y component: identical treatment
  head = a0.y + b0.y;
  err = head - a0.y;
  tail = ((b0.y - err) + (a0.y - (head - err))) + a1.y + b1.y;
  c0.y = err = head + tail;
  c1.y = tail - (err - head);
  
  // z component: identical treatment
  head = a0.z + b0.z;
  err = head - a0.z;
  tail = ((b0.z - err) + (a0.z - (head - err))) + a1.z + b1.z;
  c0.z = err = head + tail;
  c1.z = tail - (err - head);
}
02268 
02269 //
02270 // double sumCuda(float *a, int n) {}
02271 //
// Instantiates sumD{...} reduction kernels from the reduce_core.h template:
// plain element-wise sum of the array a (used for double precision fields).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumD##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) a[i]
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02284 
// Instantiates sumS{...} reduction kernels: sum over a Float2 array,
// adding both components per element (single precision, 2 reals at a time).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumS##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) a[i].x + a[i].y
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02297 
// Instantiates sumH{...} reduction kernels: sum of a half-precision Wilson
// spinor read through texHalf1 (six float4's per site).  The final value is
// scaled by ac — presumably the per-site norm produced by READ_HALF_SPINOR;
// see the quda macro headers to confirm.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumH##suffix
#define REDUCE_TYPES Float *aN, int stride
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i)                                             \
  READ_HALF_SPINOR(a, texHalf1, stride);                                \
  SUM_FLOAT4(s0, a0);                                                   \
  SUM_FLOAT4(s1, a1);                                                   \
  SUM_FLOAT4(s2, a2);                                                   \
  SUM_FLOAT4(s3, a3);                                                   \
  SUM_FLOAT4(s4, a4);                                                   \
  SUM_FLOAT4(s5, a5);                                                   \
  s0 += s1; s2 += s3; s4 += s5; s0 += s2; s0 += s4;
#define REDUCE_OPERATION(i) (ac*s0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02318 
// Instantiates sumHSt{...} reduction kernels: staggered variant of sumH,
// reading three float2's per site through texHalfSt1 and scaling by the
// site norm ac (set by READ_HALF_SPINOR_ST — see quda macro headers).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) sumHSt##suffix
#define REDUCE_TYPES Float *aN, int stride
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i)                                             \
  READ_HALF_SPINOR_ST(a, texHalfSt1, stride);                           \
  SUM_FLOAT2(s0, a0);                                                   \
  SUM_FLOAT2(s1, a1);                                                   \
  SUM_FLOAT2(s2, a2);                                                   \
  s0 += s1; s0 += s2;
#define REDUCE_OPERATION(i) (ac*s0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02336 
// Returns the global sum of all elements of the field a.
// Dispatches on precision (reduction id 14); a full half-precision field
// is summed as the sum over its two parity subsets.
double sumCuda(cudaColorSpinorField &a) {
  const int id = 14;
  blas_quda_flops += a.real_length;
  blas_quda_bytes += a.real_length*a.precision;
  if (a.precision == QUDA_DOUBLE_PRECISION) {
    return sumDCuda((double*)a.v, a.length, id, a.precision);
  } else if (a.precision == QUDA_SINGLE_PRECISION) {
    // length/2 because the kernel consumes float2 elements
    return sumSCuda((float2*)a.v, a.length/2, id, a.precision);
  } else {
    if (a.siteSubset == QUDA_FULL_SITE_SUBSET) return sumCuda(a.Even()) + sumCuda(a.Odd());
    int spinor_bytes = a.length*sizeof(short);
    blas_quda_bytes += a.volume*sizeof(float); // extra traffic on the norm array
    if (a.nSpin ==4) { // wilson
      cudaBindTexture(0, texHalf1, a.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/12);    
      return sumHCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    } else if (a.nSpin ==1) { // staggered
      cudaBindTexture(0, texHalfSt1, a.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/3);    
      return sumHStCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    } else {
      errorQuda("ERROR: nSpin=%d is not supported\n", a.nSpin);           
      return 0;
    }
  }
}
02363 
02364 //
02365 // double normCuda(float *a, int n) {}
02366 //
// Instantiates normD{...} reduction kernels: sum of a[i]^2 over a
// double-precision array (squared L2 norm).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normD##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i]*a[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02379 
// Instantiates normS{...} reduction kernels: squared L2 norm over a Float2
// array, accumulating |a[i]|^2 = a.x^2 + a.y^2 per element.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normS##suffix
#define REDUCE_TYPES Float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i].x*a[i].x + a[i].y*a[i].y)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02392 
02393 //
02394 // double normHCuda(char *, int n) {}
02395 //
// Instantiates normH{...} reduction kernels: squared L2 norm of a
// half-precision Wilson spinor read through texHalf1; the raw dot products
// are rescaled by ac*ac (ac presumably the per-site norm set by
// READ_HALF_SPINOR — see quda macro headers).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normH##suffix
#define REDUCE_TYPES Float *aN, int stride // dummy type
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i)                                             \
  READ_HALF_SPINOR(a, texHalf1, stride);                                \
  REAL_DOT_FLOAT4(norm0, a0, a0);                                       \
  REAL_DOT_FLOAT4(norm1, a1, a1);                                       \
  REAL_DOT_FLOAT4(norm2, a2, a2);                                       \
  REAL_DOT_FLOAT4(norm3, a3, a3);                                       \
  REAL_DOT_FLOAT4(norm4, a4, a4);                                       \
  REAL_DOT_FLOAT4(norm5, a5, a5);                                       \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2, norm0 += norm4;
#define REDUCE_OPERATION(i) (ac*ac*norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02416 
// Instantiates normHSt{...} reduction kernels: staggered variant of normH,
// three float2's per site via texHalfSt1, result scaled by ac*ac.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) normHSt##suffix
#define REDUCE_TYPES Float *aN, int stride // dummy type
#define REDUCE_PARAMS aN, stride
#define REDUCE_AUXILIARY(i)                                             \
    READ_HALF_SPINOR_ST(a, texHalfSt1, stride);                         \
    REAL_DOT_FLOAT2(norm0, a0, a0);                                     \
    REAL_DOT_FLOAT2(norm1, a1, a1);                                     \
    REAL_DOT_FLOAT2(norm2, a2, a2);                                     \
    norm0 += norm1; norm0 += norm2; 
#define REDUCE_OPERATION(i) (ac*ac*norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02434 
// Returns the squared L2 norm of the field a (reduction id 15).
// Dispatches on precision; a full half-precision field is the sum of the
// norms of its two parity subsets.
double normCuda(const cudaColorSpinorField &a) {
  const int id = 15;
  blas_quda_flops += 2*a.real_length;
  blas_quda_bytes += a.real_length*a.precision;
  if (a.precision == QUDA_DOUBLE_PRECISION) {
    return normDCuda((double*)a.v, a.length, id, a.precision);
  } else if (a.precision == QUDA_SINGLE_PRECISION) {
    // length/2 because the kernel consumes float2 elements
    return normSCuda((float2*)a.v, a.length/2, id, a.precision);
  } else {
    if (a.siteSubset == QUDA_FULL_SITE_SUBSET) return normCuda(a.Even()) + normCuda(a.Odd());
    int spinor_bytes = a.length*sizeof(short);
    // ratio of spinor storage to norm storage: one float norm per site
    int half_norm_ratio = (a.nColor*a.nSpin*2*sizeof(short))/sizeof(float);
    blas_quda_bytes += (a.real_length*a.precision) / (a.nColor * a.nSpin);
    cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/half_norm_ratio);    
    if (a.nSpin == 4){ //wilson
        cudaBindTexture(0, texHalf1, a.v, spinor_bytes); 
        return normHCuda((float*)a.norm, a.stride, a.volume, id, a.precision);
    }else if (a.nSpin == 1) { //staggered
        cudaBindTexture(0, texHalfSt1, a.v, spinor_bytes);
        return normHStCuda((float*)a.norm, a.stride, a.volume, id, a.precision);        
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, a.nSpin);
      return 0;
    }
  }

}
02462 
02463 //
02464 // double reDotProductFCuda(float *a, float *b, int n) {}
02465 //
// Instantiates reDotProductD{...} reduction kernels: real dot product
// sum(a[i]*b[i]) over two double-precision arrays.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductD##suffix
#define REDUCE_TYPES Float *a, Float *b
#define REDUCE_PARAMS a, b
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i]*b[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02478 
// Instantiates reDotProductS{...} reduction kernels: real dot product over
// Float2 arrays, Re(conj(a)*b) accumulated component-wise.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductS##suffix
#define REDUCE_TYPES Float *a, Float *b
#define REDUCE_PARAMS a, b
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02491 
02492 //
02493 // double reDotProductHCuda(float *a, float *b, int n) {}
02494 //
// Instantiates reDotProductH{...} reduction kernels: real dot product of
// two half-precision Wilson spinors read through texHalf1/texHalf2,
// rescaled by the site norms ac*bc (set by READ_HALF_SPINOR — see quda
// macro headers).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductH##suffix
#define REDUCE_TYPES Float *aN, Float *bN, int stride
#define REDUCE_PARAMS aN, bN, stride
#define REDUCE_AUXILIARY(i)                                             \
  READ_HALF_SPINOR(a, texHalf1, stride);                                \
  READ_HALF_SPINOR(b, texHalf2, stride);                                \
  REAL_DOT_FLOAT4(rdot0, a0, b0);                                       \
  REAL_DOT_FLOAT4(rdot1, a1, b1);                                       \
  REAL_DOT_FLOAT4(rdot2, a2, b2);                                       \
  REAL_DOT_FLOAT4(rdot3, a3, b3);                                       \
  REAL_DOT_FLOAT4(rdot4, a4, b4);                                       \
  REAL_DOT_FLOAT4(rdot5, a5, b5);                                       \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_OPERATION(i) (ac*bc*rdot0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02516 
// Instantiates reDotProductHSt{...} reduction kernels: staggered variant
// of reDotProductH (three float2's per site), rescaled by ac*bc.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) reDotProductHSt##suffix
#define REDUCE_TYPES Float *aN, Float *bN, int stride
#define REDUCE_PARAMS aN, bN, stride
#define REDUCE_AUXILIARY(i)                                             \
  READ_HALF_SPINOR_ST(a, texHalfSt1, stride);                           \
  READ_HALF_SPINOR_ST(b, texHalfSt2, stride);                           \
  REAL_DOT_FLOAT2(rdot0, a0, b0);                                       \
  REAL_DOT_FLOAT2(rdot1, a1, b1);                                       \
  REAL_DOT_FLOAT2(rdot2, a2, b2);                                       \
  rdot0 += rdot1; rdot0 += rdot2; 
#define REDUCE_OPERATION(i) (ac*bc*rdot0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION  
02535 
// Returns the real part of the inner product (a,b) (reduction id 16).
// Dispatches on precision; a full half-precision field is reduced as the
// sum over its two parity subsets.
double reDotProductCuda(cudaColorSpinorField &a, cudaColorSpinorField &b) {
  const int id = 16;
  blas_quda_flops += 2*a.real_length;
  checkSpinor(a, b); // fields must match in size/precision/layout
  blas_quda_bytes += 2*a.real_length*a.precision;
  if (a.precision == QUDA_DOUBLE_PRECISION) {
    return reDotProductDCuda((double*)a.v, (double*)b.v, a.length, id, a.precision);
  } else if (a.precision == QUDA_SINGLE_PRECISION) {
    // length/2 because the kernel consumes float2 elements
    return reDotProductSCuda((float2*)a.v, (float2*)b.v, a.length/2, id, a.precision);
  } else {
    if (a.siteSubset == QUDA_FULL_SITE_SUBSET) {
      return reDotProductCuda(a.Even(), b.Even()) + reDotProductCuda(a.Odd(), b.Odd());
    }
    blas_quda_bytes += 2*a.volume*sizeof(float); // traffic on the two norm arrays
    int spinor_bytes = a.length*sizeof(short);
    if (a.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, a.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, b.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, b.norm, spinor_bytes/12);    
      return reDotProductHCuda((float*)a.norm, (float*)b.norm, a.stride, a.volume, id, a.precision);
    }else if (a.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, a.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, b.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, b.norm, spinor_bytes/3);    
      return reDotProductHStCuda((float*)a.norm, (float*)b.norm, a.stride, a.volume, id, a.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, a.nSpin);      
      return 0;
    }
  }

}
02570 
02571 //
02572 // double axpyNormCuda(float a, float *x, float *y, n){}
02573 //
02574 // First performs the operation y[i] = a*x[i] + y[i]
02575 // Second returns the norm of y
02576 //
02577 
// Instantiates axpyNormF{...} reduction kernels: fused y = a*x + y followed
// by the squared norm of the updated y (double or single precision).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) axpyNormF##suffix
#define REDUCE_TYPES Float a, Float *x, Float *y
#define REDUCE_PARAMS a, x, y
#define REDUCE_AUXILIARY(i) y[i] = a*x[i] + y[i]
#define REDUCE_OPERATION(i) (y[i]*y[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02590 
// Instantiates axpyNormH{...} reduction kernels (Wilson, short4 storage):
// reconstructs x and y from half precision, performs y = a*x + y, writes y
// back in half precision, and reduces the squared norm of the new y.
// Overloaded on short4* against the staggered (short2*) variant below.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) axpyNormH##suffix
#define REDUCE_TYPES Float a, short4 *yH, float *yN, int stride
#define REDUCE_PARAMS a, yH, yN, stride
#define REDUCE_AUXILIARY(i)                                             \
  RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);               \
  RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);               \
  AXPY_FLOAT4(a, x0, y0);                                               \
  REAL_DOT_FLOAT4(norm0, y0, y0);                                       \
  AXPY_FLOAT4(a, x1, y1);                                               \
  REAL_DOT_FLOAT4(norm1, y1, y1);                                       \
  AXPY_FLOAT4(a, x2, y2);                                               \
  REAL_DOT_FLOAT4(norm2, y2, y2);                                       \
  AXPY_FLOAT4(a, x3, y3);                                               \
  REAL_DOT_FLOAT4(norm3, y3, y3);                                       \
  AXPY_FLOAT4(a, x4, y4);                                               \
  REAL_DOT_FLOAT4(norm4, y4, y4);                                       \
  AXPY_FLOAT4(a, x5, y5);                                               \
  REAL_DOT_FLOAT4(norm5, y5, y5);                                       \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4; \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
#define REDUCE_OPERATION(i) (norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02619 
// Instantiates axpyNormH{...} reduction kernels (staggered, short2 storage):
// same fused y = a*x + y plus norm reduction as the Wilson variant above,
// but with three float2's per site.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) axpyNormH##suffix
#define REDUCE_TYPES Float a, short2 *yH, float *yN, int stride
#define REDUCE_PARAMS a, yH, yN, stride
#define REDUCE_AUXILIARY(i)                                             \
  RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);          \
  RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);          \
  AXPY_FLOAT2(a, x0, y0);                                               \
  REAL_DOT_FLOAT2(norm0, y0, y0);                                       \
  AXPY_FLOAT2(a, x1, y1);                                               \
  REAL_DOT_FLOAT2(norm1, y1, y1);                                       \
  AXPY_FLOAT2(a, x2, y2);                                               \
  REAL_DOT_FLOAT2(norm2, y2, y2);                                       \
  norm0 += norm1; norm0 += norm2;                                       \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
#define REDUCE_OPERATION(i) (norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02642 
// Performs y = a*x + y and returns the squared norm of the updated y
// (reduction id 17).  Dispatches on precision; a full half-precision field
// is handled by recursing on the two parity subsets.
double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int id = 17;
  blas_quda_flops += 4*x.real_length;
  checkSpinor(x,y); // fields must match in size/precision/layout
  blas_quda_bytes += 3*x.real_length*x.precision;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    return axpyNormFCuda(a, (double*)x.v, (double*)y.v, x.length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    return axpyNormFCuda((float)a, (float*)x.v, (float*)y.v, x.length, id, x.precision);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return axpyNormCuda(a, x.Even(), y.Even()) + axpyNormCuda(a, x.Odd(), y.Odd());

    // norm arrays: one float per site, i.e. bytes/(nColor*nSpin) of the field
    cudaBindTexture(0, texNorm1, x.norm, x.bytes/(x.nColor*x.nSpin));    
    cudaBindTexture(0, texNorm2, y.norm, x.bytes/(x.nColor*x.nSpin));    
    blas_quda_bytes += 3*x.volume*sizeof(float);
    
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, x.bytes); 
      cudaBindTexture(0, texHalf2, y.v, x.bytes); 
      // short4* selects the Wilson overload
      return axpyNormHCuda((float)a, (short4*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, x.bytes); 
      cudaBindTexture(0, texHalfSt2, y.v, x.bytes); 
      // short2* selects the staggered overload
      return axpyNormHCuda((float)a, (short2*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);            
      return 0;
    }
  }

}
02675 
02676 //
02677 // double xmyNormCuda(float a, float *x, float *y, n){}
02678 //
02679 // First performs the operation y[i] = x[i] - y[i]
02680 // Second returns the norm of y
02681 //
02682 
// Instantiates xmyNormF{...} reduction kernels: fused y = x - y followed by
// the squared norm of the updated y (double or single precision).
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) xmyNormF##suffix
#define REDUCE_TYPES Float *x, Float *y
#define REDUCE_PARAMS x, y
#define REDUCE_AUXILIARY(i) y[i] = x[i] - y[i]
#define REDUCE_OPERATION(i) (y[i]*y[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02695 
// Instantiates xmyNormH{...} reduction kernels (Wilson, short4 storage):
// reconstructs x and y from half precision, performs y = x - y, writes y
// back in half precision, and reduces the squared norm of the new y.
// d1/d2 are unused dummy parameters that fix the template type.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) xmyNormH##suffix
#define REDUCE_TYPES Float *d1, Float *d2, short4 *yH, float *yN, int stride
#define REDUCE_PARAMS d1, d2, yH, yN, stride
#define REDUCE_AUXILIARY(i)                                             \
  RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);               \
  RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);               \
  XMY_FLOAT4(x0, y0);                                                   \
  REAL_DOT_FLOAT4(norm0, y0, y0);                                       \
  XMY_FLOAT4(x1, y1);                                                   \
  REAL_DOT_FLOAT4(norm1, y1, y1);                                       \
  XMY_FLOAT4(x2, y2);                                                   \
  REAL_DOT_FLOAT4(norm2, y2, y2);                                       \
  XMY_FLOAT4(x3, y3);                                                   \
  REAL_DOT_FLOAT4(norm3, y3, y3);                                       \
  XMY_FLOAT4(x4, y4);                                                   \
  REAL_DOT_FLOAT4(norm4, y4, y4);                                       \
  XMY_FLOAT4(x5, y5);                                                   \
  REAL_DOT_FLOAT4(norm5, y5, y5);                                       \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4; \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
#define REDUCE_OPERATION(i) (norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02724 
// Instantiates xmyNormH{...} reduction kernels (staggered, short2 storage):
// same fused y = x - y plus norm reduction as the Wilson variant above,
// but with three float2's per site.  d1/d2 are unused dummy parameters.
template <unsigned int reduce_threads, typename Float>
#define REDUCE_FUNC_NAME(suffix) xmyNormH##suffix
#define REDUCE_TYPES Float *d1, Float *d2, short2 *yH, float *yN, int stride
#define REDUCE_PARAMS d1, d2, yH, yN, stride
#define REDUCE_AUXILIARY(i)                                             \
  RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);          \
  RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);          \
  XMY_FLOAT2(x0, y0);                                                   \
  REAL_DOT_FLOAT2(norm0, y0, y0);                                       \
  XMY_FLOAT2(x1, y1);                                                   \
  REAL_DOT_FLOAT2(norm1, y1, y1);                                       \
  XMY_FLOAT2(x2, y2);                                                   \
  REAL_DOT_FLOAT2(norm2, y2, y2);                                       \
  norm0 += norm1; norm0 += norm2;                                       \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
#define REDUCE_OPERATION(i) (norm0)
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
02747 
02748 
// Performs y = x - y and returns the squared norm of the updated y
// (reduction id 18).  Dispatches on precision; a full half-precision field
// is handled by recursing on the two parity subsets.
double xmyNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int id = 18;
  blas_quda_flops += 3*x.real_length;
  checkSpinor(x,y); // fields must match in size/precision/layout
  blas_quda_bytes += 3*x.real_length*x.precision;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    return xmyNormFCuda((double*)x.v, (double*)y.v, x.length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    return xmyNormFCuda((float*)x.v, (float*)y.v, x.length, id, x.precision);
  } else { 
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return xmyNormCuda(x.Even(), y.Even()) + xmyNormCuda(x.Odd(), y.Odd());

    // norm arrays: one float per site, i.e. bytes/(nColor*nSpin) of the field
    cudaBindTexture(0, texNorm1, x.norm, x.bytes/(x.nColor*x.nSpin));    
    cudaBindTexture(0, texNorm2, y.norm, x.bytes/(x.nColor*x.nSpin));    
    blas_quda_bytes += 3*x.volume*sizeof(float);
    
    if (x.nSpin ==4 ){ //wilson
      cudaBindTexture(0, texHalf1, x.v, x.bytes); 
      cudaBindTexture(0, texHalf2, y.v, x.bytes); 
      // short4* selects the Wilson overload; the leading null pointers are
      // the unused dummy parameters of the reduction template
      return xmyNormHCuda((char*)0, (char*)0, (short4*)y.v, (float*)y.norm, y.stride, y.volume, id, x.precision);
    }else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, x.bytes); 
      cudaBindTexture(0, texHalfSt2, y.v, x.bytes); 
      return xmyNormHCuda((char*)0, (char*)0, (short2*)y.v, (float*)y.norm, y.stride, y.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);                  
      // errorQuda is expected not to return, but make every control path
      // return a value (matching normCuda/reDotProductCuda/axpyNormCuda);
      // previously control could flow off the end of this non-void
      // function, which is undefined behaviour in C++.
      return 0;
    }
  }

}
02780 
02781 
02782 //
02783 // double2 cDotProductCuda(float2 *x, float2 *y, int n) {}
02784 //
// Instantiates cDotProductD{...} complex reduction kernels (double
// precision): accumulates the complex inner product conj(x).y, with the
// real and imaginary parts reduced separately by reduce_complex_core.h.
// Operands are fetched through the double2 textures bound by the caller.
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductD##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y, Float c
#define REDUCE_PARAMS x, y, c
#define REDUCE_REAL_AUXILIARY(i) Float2 a = READ_DOUBLE2_TEXTURE(x, i);
#define REDUCE_IMAG_AUXILIARY(i) Float2 b = READ_DOUBLE2_TEXTURE(y, i);
#define REDUCE_REAL_OPERATION(i) (a.x*b.x + a.y*b.y)
#define REDUCE_IMAG_OPERATION(i) (a.x*b.y - a.y*b.x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
02801 
// Instantiates cDotProductS{...} complex reduction kernels (single
// precision): same complex inner product conj(x).y as the double variant,
// reading operands directly via read_Float2 instead of textures.
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductS##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y, Float c
#define REDUCE_PARAMS x, y, c
#define REDUCE_REAL_AUXILIARY(i) Float2 a = read_Float2(x, i);
#define REDUCE_IMAG_AUXILIARY(i) Float2 b = read_Float2(y, i);
#define REDUCE_REAL_OPERATION(i) (a.x*b.x + a.y*b.y)
#define REDUCE_IMAG_OPERATION(i) (a.x*b.y - a.y*b.x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
02818 
//
// Instantiates cDotProductHCuda(): half-precision complex dot product for
// Wilson-like (nSpin=4) fields stored as six float4's per site.  The six
// per-component real/imag dot products are tree-summed into rdot0/idot0 and
// scaled by ac*bc, the per-site norm factors produced by READ_HALF_SPINOR
// from the bound norm textures.
//
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductH##suffix
#define REDUCE_TYPES Float *aN, Float2 *bN, int stride
#define REDUCE_PARAMS aN, bN, stride
#define REDUCE_REAL_AUXILIARY(i)                                        \
  READ_HALF_SPINOR(a, texHalf1, stride);                                \
  READ_HALF_SPINOR(b, texHalf2, stride);                                \
  REAL_DOT_FLOAT4(rdot0, a0, b0);                                       \
  REAL_DOT_FLOAT4(rdot1, a1, b1);                                       \
  REAL_DOT_FLOAT4(rdot2, a2, b2);                                       \
  REAL_DOT_FLOAT4(rdot3, a3, b3);                                       \
  REAL_DOT_FLOAT4(rdot4, a4, b4);                                       \
  REAL_DOT_FLOAT4(rdot5, a5, b5);                                       \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_IMAG_AUXILIARY(i)                                        \
  IMAG_DOT_FLOAT4(idot0, a0, b0);                                       \
  IMAG_DOT_FLOAT4(idot1, a1, b1);                                       \
  IMAG_DOT_FLOAT4(idot2, a2, b2);                                       \
  IMAG_DOT_FLOAT4(idot3, a3, b3);                                       \
  IMAG_DOT_FLOAT4(idot4, a4, b4);                                       \
  IMAG_DOT_FLOAT4(idot5, a5, b5);                                       \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;
#define REDUCE_REAL_OPERATION(i) (ac*bc*rdot0)
#define REDUCE_IMAG_OPERATION(i) (ac*bc*idot0)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
02851 
//
// Instantiates cDotProductHStCuda(): half-precision complex dot product for
// staggered (nSpin=1) fields stored as three float2's per site, again
// scaled by the per-site norms ac*bc.  A distinct name (…HSt) is required
// because the host-visible signature is identical to the Wilson variant, so
// overloading cannot disambiguate them.
//
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductHSt##suffix
#define REDUCE_TYPES Float *aN, Float2 *bN, int stride
#define REDUCE_PARAMS aN, bN, stride
#define REDUCE_REAL_AUXILIARY(i)                                        \
  READ_HALF_SPINOR_ST(a, texHalfSt1, stride);                           \
  READ_HALF_SPINOR_ST(b, texHalfSt2, stride);                           \
  REAL_DOT_FLOAT2(rdot0, a0, b0);                                       \
  REAL_DOT_FLOAT2(rdot1, a1, b1);                                       \
  REAL_DOT_FLOAT2(rdot2, a2, b2);                                       \
  rdot0 += rdot1; rdot0 += rdot2;
#define REDUCE_IMAG_AUXILIARY(i)                                        \
  IMAG_DOT_FLOAT2(idot0, a0, b0);                                       \
  IMAG_DOT_FLOAT2(idot1, a1, b1);                                       \
  IMAG_DOT_FLOAT2(idot2, a2, b2);                                       \
  idot0 += idot1; idot0 += idot2;
#define REDUCE_REAL_OPERATION(i) (ac*bc*rdot0)
#define REDUCE_IMAG_OPERATION(i) (ac*bc*idot0)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
02878 
// Host dispatcher for the complex inner product (x,y) = sum_i conj(x_i)*y_i.
// Double and single precision bind the spinors to 2-component textures and
// call the corresponding reduction; half precision recurses over parities
// for full-site-subset fields and then dispatches on nSpin (4 = Wilson,
// 1 = staggered).  Also accumulates flop/byte counters for profiling.
Complex cDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int id = 19;
  blas_quda_flops += 4*x.real_length;
  checkSpinor(x,y);
  int length = x.length/2; // number of complex elements
  blas_quda_bytes += 2*x.real_length*x.precision;
  // Zero-initialize: previously left uninitialized, so the unsupported-nSpin
  // error path could fall through and return garbage if errorQuda() returns.
  double2 dot = make_double2(0.0, 0.0);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    char c = 0; // dummy argument; the D/S reduction bodies do not use it
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    dot = cDotProductDCuda((double2*)x.v, (double2*)y.v, c, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    char c = 0;
    int spinor_bytes = x.length*sizeof(float);
    cudaBindTexture(0, xTexSingle2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexSingle2, y.v, spinor_bytes); 
    dot = cDotProductSCuda((float2*)x.v, (float2*)y.v, c, length, id, x.precision);
  } else {
    // Half precision: split a full field into its parities and sum.
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return cDotProductCuda(x.Even(), y.Even()) + cDotProductCuda(x.Odd(), y.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 2*x.volume*sizeof(float); // norm-array traffic
    if (x.nSpin == 4){ // Wilson: 12 complex components per site
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      dot = cDotProductHCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin == 1){ // staggered: 3 complex components per site
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      dot = cDotProductHStCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);      
    } 
  }

  return Complex(dot.x, dot.y);
}
02922 
02923 //
02924 // double2 xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int n) {}
02925 //
02926 // First performs the operation y = x + a*y
02927 // Second returns complex dot product (z,y)
02928 //
02929 
//
// Instantiates xpaycDotzyDCuda(): fused double-precision kernel that first
// updates y = x + a*y in place and then reduces the complex dot product
// (z,y) of the *updated* y.  x, y, z are read through double2 textures; the
// y update is issued in the IMAG_AUXILIARY phase so the OPERATION phases
// read the freshly written y[i].
//
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) xpaycDotzyD##suffix
#define REDUCE_TYPES Float2 *x, Float a, Float2 *y, Float2 *z
#define REDUCE_PARAMS x, a, y, z
#define REDUCE_REAL_AUXILIARY(i)                \
  Float2 X = READ_DOUBLE2_TEXTURE(x, i);        \
  Float2 Y = READ_DOUBLE2_TEXTURE(y, i);        \
  Float2 Z = READ_DOUBLE2_TEXTURE(z, i);
#define REDUCE_IMAG_AUXILIARY(i) y[i].x = X.x + a*Y.x; y[i].y = X.y + a*Y.y
#define REDUCE_REAL_OPERATION(i) (Z.x*y[i].x + Z.y*y[i].y)
#define REDUCE_IMAG_OPERATION(i) (Z.x*y[i].y - Z.y*y[i].x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
02949 
//
// Instantiates xpaycDotzySCuda(): single-precision y = x + a*y followed by
// the complex dot product (z,y), reading all operands directly from global
// memory (no textures).
//
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) xpaycDotzyS##suffix
#define REDUCE_TYPES Float2 *x, Float a, Float2 *y, Float2 *z
#define REDUCE_PARAMS x, a, y, z
#define REDUCE_REAL_AUXILIARY(i) y[i].x = x[i].x + a*y[i].x
#define REDUCE_IMAG_AUXILIARY(i) y[i].y = x[i].y + a*y[i].y
#define REDUCE_REAL_OPERATION(i) (z[i].x*y[i].x + z[i].y*y[i].y)
#define REDUCE_IMAG_OPERATION(i) (z[i].x*y[i].y - z[i].y*y[i].x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
02966 
//
// Instantiates the Wilson (nSpin=4) half-precision xpaycDotzyHCuda():
// reconstructs x, y, z from half textures, computes y = x + a*y in single
// precision, reduces the complex dot product (z,y), and writes the updated
// y back as a half spinor (short4 data + float norm).  Overloaded with the
// staggered version below on the yH pointer type (short4 vs short2).
//
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) xpaycDotzyH##suffix
#define REDUCE_TYPES Float a, short4 *yH, Float2 *yN, int stride
#define REDUCE_PARAMS a, yH, yN, stride
#define REDUCE_REAL_AUXILIARY(i)                                        \
  RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);               \
  RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);               \
  RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);               \
  XPAY_FLOAT4(x0, a, y0);                                               \
  XPAY_FLOAT4(x1, a, y1);                                               \
  XPAY_FLOAT4(x2, a, y2);                                               \
  XPAY_FLOAT4(x3, a, y3);                                               \
  XPAY_FLOAT4(x4, a, y4);                                               \
  XPAY_FLOAT4(x5, a, y5);                                               \
  REAL_DOT_FLOAT4(rdot0, z0, y0);                                       \
  REAL_DOT_FLOAT4(rdot1, z1, y1);                                       \
  REAL_DOT_FLOAT4(rdot2, z2, y2);                                       \
  REAL_DOT_FLOAT4(rdot3, z3, y3);                                       \
  REAL_DOT_FLOAT4(rdot4, z4, y4);                                       \
  REAL_DOT_FLOAT4(rdot5, z5, y5);                                       \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_IMAG_AUXILIARY(i)                                        \
  IMAG_DOT_FLOAT4(idot0, z0, y0);                                       \
  IMAG_DOT_FLOAT4(idot1, z1, y1);                                       \
  IMAG_DOT_FLOAT4(idot2, z2, y2);                                       \
  IMAG_DOT_FLOAT4(idot3, z3, y3);                                       \
  IMAG_DOT_FLOAT4(idot4, z4, y4);                                       \
  IMAG_DOT_FLOAT4(idot5, z5, y5);                                       \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4; \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);
#define REDUCE_REAL_OPERATION(i) (rdot0)
#define REDUCE_IMAG_OPERATION(i) (idot0)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
03007 
//
// Staggered (nSpin=1) half-precision overload of xpaycDotzyHCuda().  Same
// generated name as the Wilson version above — this is deliberate: the two
// overloads are distinguished by yH being short2* here vs short4* there,
// matching the casts at the call site.  Works on three float2 components
// per site.
//
template <unsigned int reduce_threads, typename Float, typename Float2>
#define REDUCE_FUNC_NAME(suffix) xpaycDotzyH##suffix
#define REDUCE_TYPES Float a, short2 *yH, Float2 *yN, int stride
#define REDUCE_PARAMS a, yH, yN, stride
#define REDUCE_REAL_AUXILIARY(i)                                        \
  RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);          \
  RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);          \
  RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);          \
  XPAY_FLOAT2(x0, a, y0);                                               \
  XPAY_FLOAT2(x1, a, y1);                                               \
  XPAY_FLOAT2(x2, a, y2);                                               \
  REAL_DOT_FLOAT2(rdot0, z0, y0);                                       \
  REAL_DOT_FLOAT2(rdot1, z1, y1);                                       \
  REAL_DOT_FLOAT2(rdot2, z2, y2);                                       \
  rdot0 += rdot1; rdot0 += rdot2;
#define REDUCE_IMAG_AUXILIARY(i)                                        \
  IMAG_DOT_FLOAT2(idot0, z0, y0);                                       \
  IMAG_DOT_FLOAT2(idot1, z1, y1);                                       \
  IMAG_DOT_FLOAT2(idot2, z2, y2);                                       \
  idot0 += idot1; idot0 += idot2;                                       \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);
#define REDUCE_REAL_OPERATION(i) (rdot0)
#define REDUCE_IMAG_OPERATION(i) (idot0)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
03039 
// Host dispatcher for the fused update-and-reduce: performs y = x + a*y and
// returns the complex dot product (z, y_updated).  Dispatches on precision;
// half precision recurses over parities for full fields, then on nSpin
// (4 = Wilson, 1 = staggered).
Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y, cudaColorSpinorField &z) {
  const int id = 20;
  blas_quda_flops += 6*x.real_length;
  checkSpinor(x,y);
  checkSpinor(x,z);
  int length = x.length/2; // number of complex elements
  blas_quda_bytes += 4*x.real_length*x.precision;
  // Zero-initialize: previously left uninitialized, so the unsupported-nSpin
  // error path could fall through and return garbage if errorQuda() returns.
  double2 dot = make_double2(0.0, 0.0);
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes); 
    dot = xpaycDotzyDCuda((double2*)x.v, a, (double2*)y.v, (double2*)z.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    dot = xpaycDotzySCuda((float2*)x.v, (float)a, (float2*)y.v, (float2*)z.v, length, id, x.precision);
  } else {
    // Half precision: split a full field into its parities and sum.
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return xpaycDotzyCuda(x.Even(), a, y.Even(), z.Even()) + xpaycDotzyCuda(x.Odd(), a, y.Odd(), z.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 4*x.volume*sizeof(float); // norm-array traffic
    if (x.nSpin == 4){ // Wilson: 12 complex components per site
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);    
      dot = xpaycDotzyHCuda((float)a, (short4*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin == 1){ // staggered: 3 complex components per site
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);    
      dot = xpaycDotzyHCuda((float)a, (short2*)y.v, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);            
    }
  }
  return Complex(dot.x, dot.y);
}
03083 
03084 
03085 //
03086 // double3 cDotProductNormACuda(float2 *a, float2 *b, int n) {}
03087 //
//
// Instantiates cDotProductNormADCuda(): double-precision triple reduction
// returning { Re(x,y), Im(x,y), |x|^2 } in one pass (the "A" variant norms
// the first argument).  Operands are read through double2 textures.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAD##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y
#define REDUCE_PARAMS x, y
#define REDUCE_X_AUXILIARY(i) Float2 a = READ_DOUBLE2_TEXTURE(x, i);
#define REDUCE_Y_AUXILIARY(i) Float2 b = READ_DOUBLE2_TEXTURE(y, i);
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a.x*b.x + a.y*b.y)
#define REDUCE_Y_OPERATION(i) (a.x*b.y - a.y*b.x)
#define REDUCE_Z_OPERATION(i) (a.x*a.x + a.y*a.y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03108 
//
// Instantiates cDotProductNormASCuda(): single-precision version of the
// triple reduction { Re(a,b), Im(a,b), |a|^2 }, with direct global loads.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAS##suffix
#define REDUCE_TYPES Float2 *a, Float2 *b
#define REDUCE_PARAMS a, b
#define REDUCE_X_AUXILIARY(i)
#define REDUCE_Y_AUXILIARY(i)
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
#define REDUCE_Z_OPERATION(i) (a[i].x*a[i].x + a[i].y*a[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03129 
//
// Instantiates cDotProductNormAHCuda(): Wilson (nSpin=4) half-precision
// triple reduction { Re(x,y), Im(x,y), |x|^2 }.  Per-site partials are
// scaled by the norm factors xc/yc from READ_HALF_SPINOR.  Note the comma
// operator in "norm0 += norm2, norm0 += norm4" — equivalent to a semicolon
// here, kept byte-identical.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAH##suffix
#define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride
#define REDUCE_PARAMS xN, yN, stride
#define REDUCE_X_AUXILIARY(i)                                           \
  READ_HALF_SPINOR(x, texHalf1, stride);                                \
  READ_HALF_SPINOR(y, texHalf2, stride);                                \
  REAL_DOT_FLOAT4(norm0, x0, x0);                                       \
  REAL_DOT_FLOAT4(norm1, x1, x1);                                       \
  REAL_DOT_FLOAT4(norm2, x2, x2);                                       \
  REAL_DOT_FLOAT4(norm3, x3, x3);                                       \
  REAL_DOT_FLOAT4(norm4, x4, x4);                                       \
  REAL_DOT_FLOAT4(norm5, x5, x5);                                       \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2, norm0 += norm4;
#define REDUCE_Y_AUXILIARY(i)                                           \
  REAL_DOT_FLOAT4(rdot0, x0, y0);                                       \
  REAL_DOT_FLOAT4(rdot1, x1, y1);                                       \
  REAL_DOT_FLOAT4(rdot2, x2, y2);                                       \
  REAL_DOT_FLOAT4(rdot3, x3, y3);                                       \
  REAL_DOT_FLOAT4(rdot4, x4, y4);                                       \
  REAL_DOT_FLOAT4(rdot5, x5, y5);                                       \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_Z_AUXILIARY(i)                                           \
  IMAG_DOT_FLOAT4(idot0, x0, y0);                                       \
  IMAG_DOT_FLOAT4(idot1, x1, y1);                                       \
  IMAG_DOT_FLOAT4(idot2, x2, y2);                                       \
  IMAG_DOT_FLOAT4(idot3, x3, y3);                                       \
  IMAG_DOT_FLOAT4(idot4, x4, y4);                                       \
  IMAG_DOT_FLOAT4(idot5, x5, y5);                                       \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;  
#define REDUCE_X_OPERATION(i) (xc*yc*rdot0)
#define REDUCE_Y_OPERATION(i) (xc*yc*idot0)
#define REDUCE_Z_OPERATION(i) (xc*xc*norm0)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03173 
//
// Instantiates cDotProductNormAHStCuda(): staggered (nSpin=1) half-precision
// triple reduction { Re(x,y), Im(x,y), |x|^2 } over three float2 components
// per site, scaled by the per-site norms xc/yc.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormAHSt##suffix
#define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride
#define REDUCE_PARAMS xN, yN, stride
#define REDUCE_X_AUXILIARY(i)                                           \
  READ_HALF_SPINOR_ST(x, texHalfSt1, stride);                           \
  READ_HALF_SPINOR_ST(y, texHalfSt2, stride);                           \
  REAL_DOT_FLOAT2(norm0, x0, x0);                                       \
  REAL_DOT_FLOAT2(norm1, x1, x1);                                       \
  REAL_DOT_FLOAT2(norm2, x2, x2);                                       \
  norm0 += norm1; norm0 += norm2;
#define REDUCE_Y_AUXILIARY(i)                                           \
  REAL_DOT_FLOAT2(rdot0, x0, y0);                                       \
  REAL_DOT_FLOAT2(rdot1, x1, y1);                                       \
  REAL_DOT_FLOAT2(rdot2, x2, y2);                                       \
  rdot0 += rdot1; rdot0 += rdot2;
#define REDUCE_Z_AUXILIARY(i)                                           \
  IMAG_DOT_FLOAT2(idot0, x0, y0);                                       \
  IMAG_DOT_FLOAT2(idot1, x1, y1);                                       \
  IMAG_DOT_FLOAT2(idot2, x2, y2);                                       \
  idot0 += idot1; idot0 += idot2;
#define REDUCE_X_OPERATION(i) (xc*yc*rdot0)
#define REDUCE_Y_OPERATION(i) (xc*yc*idot0)
#define REDUCE_Z_OPERATION(i) (xc*xc*norm0)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03208 
// Host dispatcher for the fused triple reduction returning
// { Re(x,y), Im(x,y), |x|^2 } ("A" = norm of the first argument).
// Dispatches on precision; half precision recurses over parities for full
// fields and then on nSpin (4 = Wilson, 1 = staggered).
double3 cDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int id = 21;
  blas_quda_flops += 6*x.real_length;
  checkSpinor(x,y);
  int length = x.length/2; // number of complex elements
  blas_quda_bytes += 2*x.real_length*x.precision;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    return cDotProductNormADCuda((double2*)x.v, (double2*)y.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    return cDotProductNormASCuda((float2*)x.v, (float2*)y.v, length, id, x.precision);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return cDotProductNormACuda(x.Even(), y.Even()) + cDotProductNormACuda(x.Odd(), y.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 2*x.volume*sizeof(float); // norm-array traffic
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      return cDotProductNormAHCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      return cDotProductNormAHStCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);            
    }
  }

  // Not reached when errorQuda() aborts, but prevents control from flowing
  // off the end of a value-returning function (undefined behavior).
  return make_double3(0.0, 0.0, 0.0);
}
03245 
03246 //
03247 // double3 cDotProductNormBCuda(float2 *a, float2 *b, int n) {}
03248 //
//
// Instantiates cDotProductNormBDCuda(): double-precision triple reduction
// { Re(x,y), Im(x,y), |y|^2 } — identical to the "A" variant except the
// norm is taken of the *second* argument.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormBD##suffix
#define REDUCE_TYPES Float2 *x, Float2 *y
#define REDUCE_PARAMS x, y
#define REDUCE_X_AUXILIARY(i) Float2 a = READ_DOUBLE2_TEXTURE(x, i);
#define REDUCE_Y_AUXILIARY(i) Float2 b = READ_DOUBLE2_TEXTURE(y, i);
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a.x*b.x + a.y*b.y)
#define REDUCE_Y_OPERATION(i) (a.x*b.y - a.y*b.x)
#define REDUCE_Z_OPERATION(i) (b.x*b.x + b.y*b.y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03269 
//
// Instantiates cDotProductNormBSCuda(): single-precision triple reduction
// { Re(a,b), Im(a,b), |b|^2 } with direct global loads.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormBS##suffix
#define REDUCE_TYPES Float2 *a, Float2 *b
#define REDUCE_PARAMS a, b
#define REDUCE_X_AUXILIARY(i)
#define REDUCE_Y_AUXILIARY(i)
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
#define REDUCE_Z_OPERATION(i) (b[i].x*b[i].x + b[i].y*b[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03290 
//
// Instantiates cDotProductNormBHCuda(): Wilson (nSpin=4) half-precision
// triple reduction { Re(x,y), Im(x,y), |y|^2 }; the norm here is of y
// (note norm partials use y components and the Z result is scaled yc*yc).
// Comma operator in the norm tree-sum is kept byte-identical.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormBH##suffix
#define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride
#define REDUCE_PARAMS xN, yN, stride
#define REDUCE_X_AUXILIARY(i)                                           \
  READ_HALF_SPINOR(x, texHalf1, stride);                                \
  READ_HALF_SPINOR(y, texHalf2, stride);                                \
  REAL_DOT_FLOAT4(norm0, y0, y0);                                       \
  REAL_DOT_FLOAT4(norm1, y1, y1);                                       \
  REAL_DOT_FLOAT4(norm2, y2, y2);                                       \
  REAL_DOT_FLOAT4(norm3, y3, y3);                                       \
  REAL_DOT_FLOAT4(norm4, y4, y4);                                       \
  REAL_DOT_FLOAT4(norm5, y5, y5);                                       \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2, norm0 += norm4;
#define REDUCE_Y_AUXILIARY(i)                                           \
  REAL_DOT_FLOAT4(rdot0, x0, y0);                                       \
  REAL_DOT_FLOAT4(rdot1, x1, y1);                                       \
  REAL_DOT_FLOAT4(rdot2, x2, y2);                                       \
  REAL_DOT_FLOAT4(rdot3, x3, y3);                                       \
  REAL_DOT_FLOAT4(rdot4, x4, y4);                                       \
  REAL_DOT_FLOAT4(rdot5, x5, y5);                                       \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4;
#define REDUCE_Z_AUXILIARY(i)                                           \
  IMAG_DOT_FLOAT4(idot0, x0, y0);                                       \
  IMAG_DOT_FLOAT4(idot1, x1, y1);                                       \
  IMAG_DOT_FLOAT4(idot2, x2, y2);                                       \
  IMAG_DOT_FLOAT4(idot3, x3, y3);                                       \
  IMAG_DOT_FLOAT4(idot4, x4, y4);                                       \
  IMAG_DOT_FLOAT4(idot5, x5, y5);                                       \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;
#define REDUCE_X_OPERATION(i) (xc*yc*rdot0)
#define REDUCE_Y_OPERATION(i) (xc*yc*idot0)
#define REDUCE_Z_OPERATION(i) (yc*yc*norm0)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03334 
//
// Instantiates cDotProductNormBHStCuda(): staggered (nSpin=1) half-precision
// triple reduction { Re(x,y), Im(x,y), |y|^2 } over three float2 components
// per site, scaled by the per-site norms xc/yc.
//
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) cDotProductNormBHSt##suffix
#define REDUCE_TYPES Float2 *xN, Float2 *yN, int stride
#define REDUCE_PARAMS xN, yN, stride
#define REDUCE_X_AUXILIARY(i)                                           \
  READ_HALF_SPINOR_ST(x, texHalfSt1, stride);                           \
  READ_HALF_SPINOR_ST(y, texHalfSt2, stride);                           \
  REAL_DOT_FLOAT2(norm0, y0, y0);                                       \
  REAL_DOT_FLOAT2(norm1, y1, y1);                                       \
  REAL_DOT_FLOAT2(norm2, y2, y2);                                       \
  norm0 += norm1; norm0 += norm2;
#define REDUCE_Y_AUXILIARY(i)                                           \
  REAL_DOT_FLOAT2(rdot0, x0, y0);                                       \
  REAL_DOT_FLOAT2(rdot1, x1, y1);                                       \
  REAL_DOT_FLOAT2(rdot2, x2, y2);                                       \
  rdot0 += rdot1; rdot0 += rdot2;
#define REDUCE_Z_AUXILIARY(i)                                           \
  IMAG_DOT_FLOAT2(idot0, x0, y0);                                       \
  IMAG_DOT_FLOAT2(idot1, x1, y1);                                       \
  IMAG_DOT_FLOAT2(idot2, x2, y2);                                       \
  idot0 += idot1; idot0 += idot2;
#define REDUCE_X_OPERATION(i) (xc*yc*rdot0)
#define REDUCE_Y_OPERATION(i) (xc*yc*idot0)
#define REDUCE_Z_OPERATION(i) (yc*yc*norm0)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03369 
// Host dispatcher for the fused triple reduction returning
// { Re(x,y), Im(x,y), |y|^2 } ("B" = norm of the second argument).
// Dispatches on precision; half precision recurses over parities for full
// fields and then on nSpin (4 = Wilson, 1 = staggered).
double3 cDotProductNormBCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int id = 22;
  blas_quda_flops += 6*x.real_length;
  checkSpinor(x,y);
  int length = x.length/2; // number of complex elements
  blas_quda_bytes += 2*x.real_length*x.precision;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    return cDotProductNormBDCuda((double2*)x.v, (double2*)y.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    return cDotProductNormBSCuda((float2*)x.v, (float2*)y.v, length, id, x.precision);
  } else {
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return cDotProductNormBCuda(x.Even(), y.Even()) + cDotProductNormBCuda(x.Odd(), y.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 2*x.volume*sizeof(float); // norm-array traffic
    if (x.nSpin == 4){ //wilson
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      return cDotProductNormBHCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    } else if (x.nSpin == 1){ //staggered
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      return cDotProductNormBHStCuda((float*)x.norm, (float*)y.norm, x.stride, x.volume, id, x.precision);
    }else{
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);            
    }
  }

  // Not reached when errorQuda() aborts, but prevents control from flowing
  // off the end of a value-returning function (undefined behavior).
  return make_double3(0.0, 0.0, 0.0);
}
03406 
03407 
//
// double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y, 
// float2 *z, float2 *w, float2 *u, int len)
//
// Double-precision instantiation of the fused operation
//   z += a*x + b*y;  y -= b*w;  then reduce {Re(u,y), Im(u,y), (y,y)}.
// The REDUCE_* macros below parameterize reduce_triple_core.h, which emits
// the kernel and its host launch wrapper; the template header applies to the
// kernel generated by that include.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYD##suffix
#define REDUCE_TYPES Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, Float2 *u
#define REDUCE_PARAMS a, x, b, y, z, w, u
// Load the three texture-bound operands (double2 reads go through textures).
#define REDUCE_X_AUXILIARY(i)                           \
  Float2 X = READ_DOUBLE2_TEXTURE(x, i);                \
  Float2 Y = READ_DOUBLE2_TEXTURE(y, i);                \
  Float2 W = READ_DOUBLE2_TEXTURE(w, i);                                
// Complex update: z += a*x + b*y, then y -= b*w.
#define REDUCE_Y_AUXILIARY(i)                   \
  Float2 Z = read_Float2(z, i);                 \
  Z.x += a.x*X.x - a.y*X.y;                     \
  Z.y += a.y*X.x + a.x*X.y;                     \
  Z.x += b.x*Y.x - b.y*Y.y;                     \
  Z.y += b.y*Y.x + b.x*Y.y;                     \
  Y.x -= b.x*W.x - b.y*W.y;                     \
  Y.y -= b.y*W.x + b.x*W.y;     
// Write back the updated z and y fields.
#define REDUCE_Z_AUXILIARY(i)         \
  z[i] = make_Float2(Z);              \
  y[i] = make_Float2(Y);              
// Per-element reduction terms: Re(u,y), Im(u,y), |y|^2.
#define REDUCE_X_OPERATION(i) (u[i].x*y[i].x + u[i].y*y[i].y)
#define REDUCE_Y_OPERATION(i) (u[i].x*y[i].y - u[i].y*y[i].x)
#define REDUCE_Z_OPERATION(i) (y[i].x*y[i].x + y[i].y*y[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03444 
// Single-precision instantiation of the same fused operation:
//   z += a*x + b*y;  y -= b*w;  reduce {Re(u,y), Im(u,y), (y,y)}.
// Identical to the double-precision variant above except that operands are
// read directly with read_Float2 rather than through double2 textures.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYS##suffix
#define REDUCE_TYPES Float2 a, Float2 *x, Float2 b, Float2 *y, Float2 *z, Float2 *w, Float2 *u
#define REDUCE_PARAMS a, x, b, y, z, w, u
// Load the three read-only operands.
#define REDUCE_X_AUXILIARY(i)                           \
  Float2 X = read_Float2(x, i);         \
  Float2 Y = read_Float2(y, i);         \
  Float2 W = read_Float2(w, i);         
// Complex update: z += a*x + b*y, then y -= b*w.
#define REDUCE_Y_AUXILIARY(i)                   \
  Float2 Z = read_Float2(z, i); \
  Z.x += a.x*X.x - a.y*X.y;                     \
  Z.y += a.y*X.x + a.x*X.y;                     \
  Z.x += b.x*Y.x - b.y*Y.y;                     \
  Z.y += b.y*Y.x + b.x*Y.y;                     \
  Y.x -= b.x*W.x - b.y*W.y;                     \
  Y.y -= b.y*W.x + b.x*W.y;     
// Write back the updated z and y fields.
#define REDUCE_Z_AUXILIARY(i)         \
  z[i] = make_Float2(Z);              \
  y[i] = make_Float2(Y);              
// Per-element reduction terms: Re(u,y), Im(u,y), |y|^2.
#define REDUCE_X_OPERATION(i) (u[i].x*y[i].x + u[i].y*y[i].y)
#define REDUCE_Y_OPERATION(i) (u[i].x*y[i].y - u[i].y*y[i].x)
#define REDUCE_Z_OPERATION(i) (y[i].x*y[i].x + y[i].y*y[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03477 
//
// double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y, 
// float2 *z, float2 *w, float2 *u, int len)
//
// Half-precision Wilson (nSpin=4, short4 storage) instantiation of the fused
// operation: z += a*x + b*y;  y -= b*w;  reduce {Re(u,y), Im(u,y), (y,y)}.
// Spinors are reconstructed to single precision from the texHalf*/texNorm*
// textures, updated, and stored back as half via CONSTRUCT_HALF_SPINOR_*.
// wc and uc below are presumably the norm scale factors produced by
// READ_HALF_SPINOR for the w and u fields — confirm in the macro definitions.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYH##suffix
#define REDUCE_TYPES Float2 a, Float2 b, short4 *yH, float *yN, short4 *zH, float *zN, float *wN, float *uN, int stride
#define REDUCE_PARAMS a, b, yH, yN, zH, zN, wN, uN, stride
// Reconstruct x, y, z; do z += a*x + b*y and store z; then y -= b*w
// (folding w's norm into the coefficient) and accumulate |y|^2 partials
// before storing the updated y.
#define REDUCE_X_AUXILIARY(i)                                           \
  RECONSTRUCT_HALF_SPINOR(x, texHalf1, texNorm1, stride);               \
  RECONSTRUCT_HALF_SPINOR(y, texHalf2, texNorm2, stride);               \
  RECONSTRUCT_HALF_SPINOR(z, texHalf3, texNorm3, stride);               \
  CAXPBYPZ_FLOAT4(a, x0, b, y0, z0);                                    \
  CAXPBYPZ_FLOAT4(a, x1, b, y1, z1);                                    \
  CAXPBYPZ_FLOAT4(a, x2, b, y2, z2);                                    \
  CAXPBYPZ_FLOAT4(a, x3, b, y3, z3);                                    \
  CAXPBYPZ_FLOAT4(a, x4, b, y4, z4);                                    \
  CAXPBYPZ_FLOAT4(a, x5, b, y5, z5);                                    \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(zH, zN, z, stride);                 \
  READ_HALF_SPINOR(w, texHalf4, stride);                                \
  float2 bwc = -wc*b;                                                   \
  CAXPY_FLOAT4(bwc, w0, y0);                                            \
  CAXPY_FLOAT4(bwc, w1, y1);                                            \
  CAXPY_FLOAT4(bwc, w2, y2);                                            \
  CAXPY_FLOAT4(bwc, w3, y3);                                            \
  CAXPY_FLOAT4(bwc, w4, y4);                                            \
  CAXPY_FLOAT4(bwc, w5, y5);                                            \
  REAL_DOT_FLOAT4(norm0, y0, y0);                                       \
  REAL_DOT_FLOAT4(norm1, y1, y1);                                       \
  REAL_DOT_FLOAT4(norm2, y2, y2);                                       \
  REAL_DOT_FLOAT4(norm3, y3, y3);                                       \
  REAL_DOT_FLOAT4(norm4, y4, y4);                                       \
  REAL_DOT_FLOAT4(norm5, y5, y5);                                       \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE(yH, yN, y, stride);                 
// Accumulate the real and imaginary parts of (u, y).
#define REDUCE_Y_AUXILIARY(i)                                           \
  READ_HALF_SPINOR(u, texHalf5, stride);                                \
  REAL_DOT_FLOAT4(rdot0, u0, y0);                                       \
  REAL_DOT_FLOAT4(rdot1, u1, y1);                                       \
  REAL_DOT_FLOAT4(rdot2, u2, y2);                                       \
  REAL_DOT_FLOAT4(rdot3, u3, y3);                                       \
  REAL_DOT_FLOAT4(rdot4, u4, y4);                                       \
  REAL_DOT_FLOAT4(rdot5, u5, y5);                                       \
  IMAG_DOT_FLOAT4(idot0, u0, y0);                                       \
  IMAG_DOT_FLOAT4(idot1, u1, y1);                                       \
  IMAG_DOT_FLOAT4(idot2, u2, y2);                                       \
  IMAG_DOT_FLOAT4(idot3, u3, y3);                                       \
  IMAG_DOT_FLOAT4(idot4, u4, y4);                                       \
  IMAG_DOT_FLOAT4(idot5, u5, y5);                                       
// Pairwise-sum the six per-component partials into the *0 accumulators.
// (Fixed: the norm0 line used a stray comma operator between the last two
// additions; replaced with ';' for consistency — behavior is unchanged.)
#define REDUCE_Z_AUXILIARY(i)                                           \
  norm0 += norm1; norm2 += norm3; norm4 += norm5; norm0 += norm2; norm0 += norm4; \
  rdot0 += rdot1; rdot2 += rdot3; rdot4 += rdot5; rdot0 += rdot2; rdot0 += rdot4; \
  idot0 += idot1; idot2 += idot3; idot4 += idot5; idot0 += idot2; idot0 += idot4;

// Reduction terms, scaled by u's norm factor uc.
#define REDUCE_X_OPERATION(i) (uc*rdot0)
#define REDUCE_Y_OPERATION(i) (uc*idot0)
#define REDUCE_Z_OPERATION(i) (norm0)

#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03545 
03546 
// Half-precision staggered (nSpin=1, short2 storage) instantiation of the
// fused operation: z += a*x + b*y;  y -= b*w;  reduce {Re(u,y), Im(u,y), (y,y)}.
// Same structure as the Wilson variant above but with three float2
// components per site instead of six float4 components.
template <unsigned int reduce_threads, typename Float2>
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormYH##suffix
#define REDUCE_TYPES Float2 a, Float2 b, short2 *yH, float *yN, short2 *zH, float *zN, float *wN, float *uN, int stride
#define REDUCE_PARAMS a, b, yH, yN, zH, zN, wN, uN, stride
// Reconstruct x, y, z; z += a*x + b*y and store z; then y -= b*w (w's norm
// folded into the coefficient), accumulate |y|^2 partials, store y.
#define REDUCE_X_AUXILIARY(i)                                           \
  RECONSTRUCT_HALF_SPINOR_ST(x, texHalfSt1, texNorm1, stride);          \
  RECONSTRUCT_HALF_SPINOR_ST(y, texHalfSt2, texNorm2, stride);          \
  RECONSTRUCT_HALF_SPINOR_ST(z, texHalfSt3, texNorm3, stride);          \
  CAXPBYPZ_FLOAT2(a, x0, b, y0, z0);                                    \
  CAXPBYPZ_FLOAT2(a, x1, b, y1, z1);                                    \
  CAXPBYPZ_FLOAT2(a, x2, b, y2, z2);                                    \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(zH, zN, z, stride);                      \
  READ_HALF_SPINOR_ST(w, texHalfSt4, stride);                           \
  float2 bwc = -wc*b;                                                   \
  CAXPY_FLOAT2(bwc, w0, y0);                                            \
  CAXPY_FLOAT2(bwc, w1, y1);                                            \
  CAXPY_FLOAT2(bwc, w2, y2);                                            \
  REAL_DOT_FLOAT2(norm0, y0, y0);                                       \
  REAL_DOT_FLOAT2(norm1, y1, y1);                                       \
  REAL_DOT_FLOAT2(norm2, y2, y2);                                       \
  CONSTRUCT_HALF_SPINOR_FROM_SINGLE_ST(yH, yN, y, stride);                      
// Accumulate the real and imaginary parts of (u, y).
#define REDUCE_Y_AUXILIARY(i)                                           \
  READ_HALF_SPINOR_ST(u, texHalfSt5, stride);                           \
  REAL_DOT_FLOAT2(rdot0, u0, y0);                                       \
  REAL_DOT_FLOAT2(rdot1, u1, y1);                                       \
  REAL_DOT_FLOAT2(rdot2, u2, y2);                                       \
  IMAG_DOT_FLOAT2(idot0, u0, y0);                                       \
  IMAG_DOT_FLOAT2(idot1, u1, y1);                                       \
  IMAG_DOT_FLOAT2(idot2, u2, y2);                                       
// Fold the three per-component partials into the *0 accumulators.
#define REDUCE_Z_AUXILIARY(i)                                           \
  norm0 += norm1; norm0 += norm2; \
  rdot0 += rdot1; rdot0 += rdot2; \
  idot0 += idot1; idot0 += idot2; 

// Reduction terms, scaled by u's norm factor uc.
#define REDUCE_X_OPERATION(i) (uc*rdot0)
#define REDUCE_Y_OPERATION(i) (uc*idot0)
#define REDUCE_Z_OPERATION(i) (norm0)

#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
03595 
// This convoluted kernel does the following: z += a*x + b*y, y -= b*w, norm = (y,y), dot = (u, y)
//
// Host-side dispatcher for the fused kernel instantiations generated above.
// Selects the double/single/half variant by field precision and binds each
// field to the texture the kernel reads; the binding order must match the
// texture names used in the REDUCE_*_AUXILIARY macros, so statement order
// here is significant.  Returns {Re(u,y), Im(u,y), (y,y)}.
double3 caxpbypzYmbwcDotProductWYNormYCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y,
                                           cudaColorSpinorField &z, cudaColorSpinorField &w, cudaColorSpinorField &u) {
  // kernel id indexing the tuning tables in blas_param.h
  const int id = 23;
  blas_quda_flops += 18*x.real_length;
  // all five fields must agree in size and precision
  checkSpinor(x,y);
  checkSpinor(x,z);
  checkSpinor(x,w);
  checkSpinor(x,u);
  int length = x.length/2;  // number of complex (Float2) elements
  blas_quda_bytes += 7*x.real_length*x.precision;  // 5 reads + 2 writes
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    int spinor_bytes = x.length*sizeof(double);
    // double2 loads go through textures; all five operands are bound
    cudaBindTexture(0, xTexDouble2, x.v, spinor_bytes); 
    cudaBindTexture(0, yTexDouble2, y.v, spinor_bytes); 
    cudaBindTexture(0, zTexDouble2, z.v, spinor_bytes); 
    cudaBindTexture(0, wTexDouble2, w.v, spinor_bytes); 
    cudaBindTexture(0, uTexDouble2, u.v, spinor_bytes); 
    double2 a2 = make_double2(real(a), imag(a));
    double2 b2 = make_double2(real(b), imag(b));
    return caxpbypzYmbwcDotProductWYNormYDCuda(a2, (double2*)x.v, b2, (double2*)y.v, (double2*)z.v, 
                                               (double2*)w.v, (double2*)u.v, length, id, x.precision);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
    float2 a2 = make_float2(real(a), imag(a));
    float2 b2 = make_float2(real(b), imag(b));
    return caxpbypzYmbwcDotProductWYNormYSCuda(a2, (float2*)x.v, b2, (float2*)y.v, (float2*)z.v,
                                               (float2*)w.v, (float2*)u.v, length, id, x.precision);
  } else {
    // fused nSpin=4 kernel is slow on Fermi
    // N.B. this introduces an extra half truncation so will affect convergence
    // NOTE(review): __CUDA_ARCH__ is only defined during device compilation
    // passes; in this host function it is normally undefined, so this runtime
    // test should not even compile unless the build defines it.  It
    // presumably should be a host-visible capability macro (e.g.
    // __COMPUTE_CAPABILITY__) -- confirm against the build setup.
    if (!blasTuning && (__CUDA_ARCH__ >= 200) && x.nSpin == 4) {
      caxpbypzYmbwCuda(a, x, b, y, z, w);
      return cDotProductNormBCuda(u, y);
    }
      
    // Half precision: a full field is processed as its two parity halves.
    // NOTE(review): flops/bytes accumulated above are accumulated again by
    // the recursive calls, double counting this path -- consistent with the
    // other half-precision routines in this file.
    if (x.siteSubset == QUDA_FULL_SITE_SUBSET) 
      return caxpbypzYmbwcDotProductWYNormYCuda(a, x.Even(), b, y.Even(), z.Even(), w.Even(), u.Even()) + 
        caxpbypzYmbwcDotProductWYNormYCuda(a, x.Odd(), b, y.Odd(), z.Odd(), w.Odd(), u.Odd());
    int spinor_bytes = x.length*sizeof(short);
    blas_quda_bytes += 7*x.volume*sizeof(float);  // the norm arrays
    if (x.nSpin == 4) { // wilson: 24 reals/site, norm array is 1/12 the spinor size
      cudaBindTexture(0, texHalf1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf4, w.v, spinor_bytes); 
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/12);    
      cudaBindTexture(0, texHalf5, u.v, spinor_bytes); 
      cudaBindTexture(0, texNorm5, u.norm, spinor_bytes/12);    
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      return caxpbypzYmbwcDotProductWYNormYHCuda(a2, b2, (short4*)y.v, (float*)y.norm, 
                                                 (short4*)z.v, (float*)z.norm, (float*)w.norm, (float*)u.norm, 
                                                 y.stride, y.volume, id, x.precision);
    } else if (x.nSpin == 1){ // staggered: 6 reals/site, norm array is 1/3 the spinor size
      cudaBindTexture(0, texHalfSt1, x.v, spinor_bytes); 
      cudaBindTexture(0, texNorm1, x.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt2, y.v, spinor_bytes); 
      cudaBindTexture(0, texNorm2, y.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt3, z.v, spinor_bytes); 
      cudaBindTexture(0, texNorm3, z.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt4, w.v, spinor_bytes); 
      cudaBindTexture(0, texNorm4, w.norm, spinor_bytes/3);    
      cudaBindTexture(0, texHalfSt5, u.v, spinor_bytes); 
      cudaBindTexture(0, texNorm5, u.norm, spinor_bytes/3);    
      float2 a2 = make_float2(real(a), imag(a));
      float2 b2 = make_float2(real(b), imag(b));
      return caxpbypzYmbwcDotProductWYNormYHCuda(a2, b2, (short2*)y.v, (float*)y.norm, 
                                                 (short2*)z.v, (float*)z.norm, (float*)w.norm, (float*)u.norm, 
                                                 y.stride, y.volume, id, x.precision);
    } else {
      errorQuda("%s: nSpin(%d) is not supported\n", __FUNCTION__, x.nSpin);            
    }
  }

}
03674 
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines