QUDA v0.4.0
A library for QCD on GPUs
quda/lib/blas_quda.cu
00001 #include <stdlib.h>
00002 #include <stdio.h>
00003 
00004 #include <quda_internal.h>
00005 #include <blas_quda.h>
00006 #include <color_spinor_field.h>
00007 #include <face_quda.h> // this is where the MPI / QMP dependent code is
00008 
00009 #define REDUCE_MAX_BLOCKS 65536
00010 
00011 #if (__COMPUTE_CAPABILITY__ >= 130)
00012 #define QudaSumFloat double
00013 #define QudaSumFloat2 double2
00014 #define QudaSumFloat3 double3
00015 #else
00016 #define QudaSumFloat doublesingle
00017 #define QudaSumFloat2 doublesingle2
00018 #define QudaSumFloat3 doublesingle3
00019 #include <double_single.h>
00020 #endif
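On devices below compute capability 1.3 there is no native double precision, so the reduction accumulator falls back to the doublesingle types defined in double_single.h, which carry one value as an unevaluated sum of two floats. The sketch below (hypothetical names, not the code in double_single.h) shows the error-free addition such a type is usually built on:

// Illustrative sketch only: a "double-single" stores hi + lo, two floats whose
// sum approximates one double.  Addition in the Dekker/Knuth two-sum style:
struct ds { float hi, lo; };

__device__ ds ds_add(ds a, ds b) {
  float s = a.hi + b.hi;                       // leading-order sum
  float v = s - a.hi;                          // two-sum trick: recover the rounding error of s
  float e = (a.hi - (s - v)) + (b.hi - v);
  e += a.lo + b.lo;                            // fold in the low-order parts
  ds r;
  r.hi = s + e;                                // renormalize so |lo| stays at the rounding level of hi
  r.lo = e - (r.hi - s);
  return r;
}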
00021 
00022 // These are used for reduction kernels
00023 static QudaSumFloat *d_reduce=0;
00024 static QudaSumFloat *h_reduce=0;
00025 static QudaSumFloat *hd_reduce=0;
00026 static cudaEvent_t reduceEnd;
00027 
00028 namespace quda {
00029   unsigned long long blas_flops;
00030   unsigned long long blas_bytes;
00031 }
00032 
00033 #include <float_vector.h>
00034 #include <texture.h>
00035 
00036 #include <tune_quda.h>
00037 #include <typeinfo>
00038 
00039 void zeroCuda(cudaColorSpinorField &a) { a.zero(); }
00040 
00041 #define checkSpinor(a, b)                                               \
00042   {                                                                     \
00043     if (a.Precision() != b.Precision())                                 \
00044       errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
00045     if (a.Length() != b.Length())                                       \
00046       errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
00047     if (a.Stride() != b.Stride())                                       \
00048       errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
00049   }
00050 
00051 // For kernels with precision conversion built in
00052 #define checkSpinorLength(a, b)                                         \
00053   {                                                                     \
00054     if (a.Length() != b.Length())                                       \
00055       errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
00056     if (a.Stride() != b.Stride())                                       \
00057       errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
00058   }
00059 
00060 // blasTuning = 1 turns off error checking
00061 static QudaTune blasTuning = QUDA_TUNE_NO;
00062 static QudaVerbosity verbosity = QUDA_SILENT;
00063 
00064 static cudaStream_t *blasStream;
00065 
00066 static struct {
00067   int x[QUDA_MAX_DIM];
00068   int stride;
00069 } blasConstants;
00070 
00071 namespace quda {
00072 
00073 void initBlas()
00074 { 
00075   // reduction buffer size
00076   size_t bytes = 3*REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat);
00077 
00078   if (!d_reduce) {
00079     if (cudaMalloc((void**) &d_reduce, bytes) == cudaErrorMemoryAllocation) {
00080       errorQuda("Error allocating device reduction array");
00081     }
00082   }
00083 
00084   // these arrays are actually oversized currently (they only need to be QudaSumFloat3)
00085 
00086   // if the device supports host-mapped memory then use a host-mapped array for the reduction
00087   if(deviceProp.canMapHostMemory) {
00088     if (!h_reduce) {
00089       if (cudaHostAlloc((void**) &h_reduce, bytes, cudaHostAllocMapped) == cudaErrorMemoryAllocation) {
00090         errorQuda("Error allocating host reduction array");
00091       }
00092       
00093       // set the matching device pointer
00094       cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0);
00095     }
00096 
00097   } else {
00098 
00099     if (!h_reduce) {
00100       if (cudaMallocHost((void**) &h_reduce, bytes) == cudaErrorMemoryAllocation) {
00101         errorQuda("Error allocating host reduction array");
00102       }
00103     }
00104 
00105     hd_reduce = d_reduce;
00106   }
00107 
00108   blasStream = &streams[Nstream-1];
00109   cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
00110 
00111   checkCudaError();
00112 }
00113 
00114 
00115 void endBlas(void)
00116 {
00117   if (d_reduce) {
00118     cudaFree(d_reduce);
00119     d_reduce = 0;
00120   }
00121 
00122   if (h_reduce) {
00123     cudaFreeHost(h_reduce);
00124     h_reduce = 0;
00125   }
00126 
00127   hd_reduce = 0;
00128 
00129   cudaEventDestroy(reduceEnd);
00130 }
00131 
00132 void setBlasTuning(QudaTune tune, QudaVerbosity verbose)
00133 {
00134   blasTuning = tune;
00135   verbosity = verbose;
00136 }
00137 
00138 } // namespace quda
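A minimal usage sketch for the lifetime functions above, assuming the CUDA device and the streams array have already been initialized elsewhere in QUDA (the wrapper function name is hypothetical):

// Illustrative only: typical ordering of the calls defined above.
void exampleBlasLifetime() {
  quda::initBlas();                                  // allocate reduction buffers, create the reduceEnd event
  quda::setBlasTuning(QUDA_TUNE_YES, QUDA_VERBOSE);  // enable autotuning of blas launch parameters
  // ... run blas kernels: copyCuda(), axpyCuda(), normCuda(), ...
  quda::endBlas();                                   // free reduction buffers, destroy the event
}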
00139 
00140 // FIXME: this should be queried from the device
00141 #if (__COMPUTE_CAPABILITY__ < 200)
00142 #define MAX_BLOCK 512
00143 #else
00144 #define MAX_BLOCK 1024
00145 #endif
00146 
00147 template <typename FloatN, int N, typename Output, typename Input>
00148 __global__ void copyKernel(Output Y, Input X, int length) {
00149   unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
00150   unsigned int gridSize = gridDim.x*blockDim.x;
00151 
00152   while (i < length) {  // grid-stride loop: each thread handles every (gridDim.x*blockDim.x)-th element
00153     FloatN x[N];
00154     X.load(x, i);
00155     Y.save(x, i);
00156     i += gridSize;
00157   }
00158 }
00159 
00160 template <typename FloatN, int N, typename Output, typename Input>
00161 class CopyCuda : public Tunable {
00162 
00163 private:
00164   Input &X;
00165   Output &Y;
00166   const int length;
00167 
00168   int sharedBytesPerThread() const { return 0; }
00169   int sharedBytesPerBlock() const { return 0; }
00170 
00171   virtual bool advanceSharedBytes(TuneParam &param) const
00172   {
00173     TuneParam next(param);
00174     advanceBlockDim(next); // to get next blockDim
00175     int nthreads = next.block.x * next.block.y * next.block.z;
00176     param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock() ?
00177       sharedBytesPerThread()*nthreads : sharedBytesPerBlock();
00178     return false;
00179   }
00180 
00181 public:
00182   CopyCuda(Output &Y, Input &X, int length) : X(X), Y(Y), length(length) { ; }
00183   virtual ~CopyCuda() { ; }
00184 
00185   TuneKey tuneKey() const {
00186     std::stringstream vol, aux;
00187     vol << blasConstants.x[0] << "x";
00188     vol << blasConstants.x[1] << "x";
00189     vol << blasConstants.x[2] << "x";
00190     vol << blasConstants.x[3];
00191     aux << "stride=" << blasConstants.stride << ",out_prec=" << Y.Precision() << ",in_prec=" << X.Precision();
00192     return TuneKey(vol.str(), "copyKernel", aux.str());
00193   }  
00194 
00195   void apply(const cudaStream_t &stream) {
00196     TuneParam tp = tuneLaunch(*this, blasTuning, verbosity);
00197     copyKernel<FloatN, N><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(Y, X, length);
00198   }
00199 
00200   void preTune() { ; } // no need to save state for copy kernels
00201   void postTune() { ; } // no need to restore state for copy kernels
00202 
00203   long long flops() const { return 0; }
00204   long long bytes() const { 
00205     const int Ninternal = (sizeof(FloatN)/sizeof(((FloatN*)0)->x))*N;
00206     size_t bytes = (X.Precision() + Y.Precision())*Ninternal;
00207     if (X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
00208     if (Y.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
00209     return bytes*length; 
00210   }
00211 };
00212 
00213 
00214 void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src) {
00215   if (&src == &dst) return; // aliasing fields
00216   if (src.Nspin() != 1 && src.Nspin() != 4) errorQuda("nSpin(%d) not supported\n", src.Nspin());
00217 
00218   if (dst.SiteSubset() == QUDA_FULL_SITE_SUBSET || src.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
00219     copyCuda(dst.Even(), src.Even());
00220     copyCuda(dst.Odd(), src.Odd());
00221     return;
00222   }
00223 
00224   checkSpinorLength(dst, src);
00225 
00226   for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = src.X()[d];
00227   blasConstants.stride = src.Stride();
00228 
00229   // For a given dst precision, there are two non-trivial possibilities for the
00230   // src precision.
00231 
00232   quda::blas_bytes += src.RealLength()*((int)src.Precision() + (int)dst.Precision());
00233 
00234   Tunable *copy = 0;
00235 
00236   if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
00237     if (src.Nspin() == 4){
00238       SpinorTexture<float4, float4, float4, 6, 0> src_tex(src);
00239       Spinor<float4, float2, double2, 6> dst_spinor(dst);
00240       copy = new CopyCuda<float4, 6, Spinor<float4, float2, double2, 6>, 
00241                           SpinorTexture<float4, float4, float4, 6, 0> >
00242         (dst_spinor, src_tex, src.Stride());
00243     } else { //src.Nspin() == 1
00244       SpinorTexture<float2, float2, float2, 3, 0> src_tex(src);
00245       Spinor<float2, float2, double2, 3> dst_spinor(dst);
00246       copy = new CopyCuda<float2, 3, Spinor<float2, float2, double2, 3>,
00247                           SpinorTexture<float2, float2, float2, 3, 0> >
00248         (dst_spinor, src_tex, src.Stride());
00249     }
00250 
00251   } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
00252     if (src.Nspin() == 4){
00253       SpinorTexture<float4, float2, double2, 6, 0> src_tex(src);
00254       Spinor<float4, float4, float4, 6> dst_spinor(dst);
00255       copy = new CopyCuda<float4, 6, Spinor<float4, float4, float4, 6>,
00256                           SpinorTexture<float4, float2, double2, 6, 0> >
00257         (dst_spinor, src_tex, src.Stride());
00258     } else { //src.Nspin() ==1
00259       SpinorTexture<float2, float2, double2, 3, 0> src_tex(src);
00260       Spinor<float2, float2, float2, 3> dst_spinor(dst);
00261       copy = new CopyCuda<float2, 3, Spinor<float2, float2, float2, 3>,
00262                           SpinorTexture<float2, float2, double2, 3, 0> >
00263         (dst_spinor, src_tex, src.Stride());
00264     }
00265   } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
00266     quda::blas_bytes += src.Volume()*sizeof(float);
00267     if (src.Nspin() == 4){      
00268       SpinorTexture<float4, float4, short4, 6, 0> src_tex(src);
00269       Spinor<float4, float4, float4, 6> dst_spinor(dst);
00270       copy = new CopyCuda<float4, 6, Spinor<float4, float4, float4, 6>,
00271                           SpinorTexture<float4, float4, short4, 6, 0> >
00272         (dst_spinor, src_tex, src.Stride());
00273     } else { //nSpin== 1;
00274       SpinorTexture<float2, float2, short2, 3, 0> src_tex(src);
00275       Spinor<float2, float2, float2, 3> dst_spinor(dst);
00276       copy = new CopyCuda<float2, 3, Spinor<float2, float2, float2, 3>,
00277                           SpinorTexture<float2, float2, short2, 3, 0> >
00278         (dst_spinor, src_tex, src.Stride());
00279     }
00280   } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
00281     quda::blas_bytes += dst.Volume()*sizeof(float);
00282     if (src.Nspin() == 4){
00283       SpinorTexture<float4, float4, float4, 6, 0> src_tex(src);
00284       Spinor<float4, float4, short4, 6> dst_spinor(dst);
00285       copy = new CopyCuda<float4, 6, Spinor<float4, float4, short4, 6>,
00286                           SpinorTexture<float4, float4, float4, 6, 0> >
00287         (dst_spinor, src_tex, src.Stride());
00288     } else { //nSpin == 1
00289       SpinorTexture<float2, float2, float2, 3, 0> src_tex(src);
00290       Spinor<float2, float2, short2, 3> dst_spinor(dst);
00291       copy = new CopyCuda<float2, 3, Spinor<float2, float2, short2, 3>,
00292                           SpinorTexture<float2, float2, float2, 3, 0> >
00293         (dst_spinor, src_tex, src.Stride());
00294     }
00295   } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
00296     quda::blas_bytes += src.Volume()*sizeof(float);
00297     if (src.Nspin() == 4){
00298       SpinorTexture<double2, float4, short4, 12, 0> src_tex(src);
00299       Spinor<double2, double2, double2, 12> dst_spinor(dst);
00300       copy = new CopyCuda<double2, 12, Spinor<double2, double2, double2, 12>,
00301                           SpinorTexture<double2, float4, short4, 12, 0> >
00302         (dst_spinor, src_tex, src.Stride());
00303     } else { //nSpin == 1
00304       SpinorTexture<double2, float2, short2, 3, 0> src_tex(src);
00305       Spinor<double2, double2, double2, 3> dst_spinor(dst);
00306       copy = new CopyCuda<double2, 3, Spinor<double2, double2, double2, 3>,
00307                           SpinorTexture<double2, float2, short2, 3, 0> >
00308         (dst_spinor, src_tex, src.Stride());
00309     }
00310   } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
00311     quda::blas_bytes += dst.Volume()*sizeof(float);
00312     if (src.Nspin() == 4){
00313       SpinorTexture<double2, double2, double2, 12, 0> src_tex(src);
00314       Spinor<double2, double4, short4, 12> dst_spinor(dst);
00315       copy = new CopyCuda<double2, 12, Spinor<double2, double4, short4, 12>,
00316                           SpinorTexture<double2, double2, double2, 12, 0> >
00317         (dst_spinor, src_tex, src.Stride());
00318     } else { //nSpin == 1
00319       SpinorTexture<double2, double2, double2, 3, 0> src_tex(src);
00320       Spinor<double2, double2, short2, 3> dst_spinor(dst);
00321       copy = new CopyCuda<double2, 3, Spinor<double2, double2, short2, 3>,
00322                           SpinorTexture<double2, double2, double2, 3, 0> >
00323         (dst_spinor, src_tex, src.Stride());
00324     }
00325   }
00326   
00327   if (dst.Precision() != src.Precision()) {
00328     copy->apply(*blasStream);
00329     delete copy;
00330   } else {
00331     cudaMemcpy(dst.V(), src.V(), dst.Bytes(), cudaMemcpyDeviceToDevice);
00332     if (dst.Precision() == QUDA_HALF_PRECISION) {
00333       cudaMemcpy(dst.Norm(), src.Norm(), dst.NormBytes(), cudaMemcpyDeviceToDevice);
00334       quda::blas_bytes += 2*dst.RealLength()*sizeof(float);
00335     }
00336   }
00337 
00338   checkCudaError();
00339 }
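As a usage sketch (the fields are assumed to be already-constructed cudaColorSpinorField objects of matching geometry; the wrapper function is hypothetical): a copy between different precisions dispatches to the tuned CopyCuda kernel above, while a same-precision copy reduces to a device-to-device cudaMemcpy, plus the norm array for half precision.

// Illustrative only: precision conversion via copyCuda.
void exampleCopy(cudaColorSpinorField &dst_half, cudaColorSpinorField &src_single) {
  copyCuda(dst_half, src_single);  // single -> half: converts the spinor and writes dst's float norm array
}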
00340 
00341 #include <blas_core.h>
00342 
00346 template <typename Float2, typename FloatN>
00347 struct axpby {
00348   const Float2 a;
00349   const Float2 b;
00350   axpby(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00351   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = a.x*x + b.x*y; }
00352   static int streams() { return 3; } 
00353   static int flops() { return 3; } 
00354 };
00355 
00356 void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y) {
00357   const int kernel = 2;
00358   blasCuda<axpby,0,1,0,0>(kernel, make_double2(a, 0.0), make_double2(b, 0.0), make_double2(0.0, 0.0),
00359                   x, y, x, x);
00360 }
00361 
00365 template <typename Float2, typename FloatN>
00366 struct xpy {
00367   xpy(const Float2 &a, const Float2 &b, const Float2 &c) { ; }
00368   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y += x ; }
00369   static int streams() { return 3; } 
00370   static int flops() { return 1; } 
00371 };
00372 
00373 void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00374   const int kernel = 3;
00375   blasCuda<xpy,0,1,0,0>(kernel, make_double2(1.0, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0), 
00376                 x, y, x, x);
00377 }
00378 
00382 template <typename Float2, typename FloatN>
00383 struct axpy {
00384   const Float2 a;
00385   axpy(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
00386   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = a.x*x + y; }
00387   static int streams() { return 3; } 
00388   static int flops() { return 2; } 
00389 };
00390 
00391 void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
00392   const int kernel = 4;
00393   blasCuda<axpy,0,1,0,0>(kernel, make_double2(a, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0), 
00394                  x, y, x, x);
00395 }
00396 
00400 template <typename Float2, typename FloatN>
00401 struct xpay {
00402   const Float2 a;
00403   xpay(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
00404   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = x + a.x*y; }
00405   static int streams() { return 3; } 
00406   static int flops() { return 2; } 
00407 };
00408 
00409 void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y) {
00410   const int kernel = 5;
00411   blasCuda<xpay,0,1,0,0>(kernel, make_double2(a,0.0), make_double2(0.0, 0.0), make_double2(0.0, 0.0),
00412                          x, y, x, x);
00413 }
00414 
00418 template <typename Float2, typename FloatN>
00419 struct mxpy {
00420   mxpy(const Float2 &a, const Float2 &b, const Float2 &c) { ; }
00421   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y -= x; }
00422   static int streams() { return 3; } 
00423   static int flops() { return 1; } 
00424 };
00425 
00426 void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00427   const int kernel = 6;
00428   blasCuda<mxpy,0,1,0,0>(kernel, make_double2(1.0, 0.0), make_double2(1.0, 0.0), 
00429                          make_double2(0.0, 0.0), x, y, x, x);
00430 }
00431 
00435 template <typename Float2, typename FloatN>
00436 struct ax {
00437   const Float2 a;
00438   ax(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
00439   __device__ void operator()(FloatN &x, const FloatN &y, const FloatN &z, const FloatN &w) { x *= a.x; }
00440   static int streams() { return 2; } 
00441   static int flops() { return 1; } 
00442 };
00443 
00444 void axCuda(const double &a, cudaColorSpinorField &x) {
00445   const int kernel = 7;
00446   blasCuda<ax,1,0,0,0>(kernel, make_double2(a, 0.0), make_double2(0.0, 0.0), 
00447                        make_double2(0.0, 0.0), x, x, x, x);
00448 }
00449 
00454 __device__ void caxpy_(const float2 &a, const float4 &x, float4 &y) {
00455   y.x += a.x*x.x; y.x -= a.y*x.y;
00456   y.y += a.y*x.x; y.y += a.x*x.y;
00457   y.z += a.x*x.z; y.z -= a.y*x.w;
00458   y.w += a.y*x.z; y.w += a.x*x.w;
00459 }
00460 
00461 __device__ void caxpy_(const float2 &a, const float2 &x, float2 &y) {
00462   y.x += a.x*x.x; y.x -= a.y*x.y;
00463   y.y += a.y*x.x; y.y += a.x*x.y;
00464 }
00465 
00466 __device__ void caxpy_(const double2 &a, const double2 &x, double2 &y) {
00467   y.x += a.x*x.x; y.x -= a.y*x.y;
00468   y.y += a.y*x.x; y.y += a.x*x.y;
00469 }
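The caxpy_ overloads above compute y += a*x for complex numbers packed into the CUDA vector types: a float2/double2 holds one complex value (.x = real part, .y = imaginary part), and a float4 holds two. A small host-side check of the same arithmetic, purely illustrative:

// Illustrative only: the float2 form of caxpy_ is ordinary complex y += a*x.
#include <complex>
#include <cassert>

void checkCaxpy() {
  std::complex<float> a(0.5f, -1.0f), x(2.0f, 3.0f), y(1.0f, 1.0f);
  // what caxpy_ does on a float2:  y.x += a.x*x.x - a.y*x.y;  y.y += a.y*x.x + a.x*x.y;
  float yr = y.real() + a.real()*x.real() - a.imag()*x.imag();
  float yi = y.imag() + a.imag()*x.real() + a.real()*x.imag();
  y += a * x;
  assert(y.real() == yr && y.imag() == yi);  // exact here: all products are representable
}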
00470 
00471 template <typename Float2, typename FloatN>
00472 struct caxpy {
00473   const Float2 a;
00474   caxpy(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
00475   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { caxpy_(a, x, y); }
00476   static int streams() { return 3; } 
00477   static int flops() { return 4; } 
00478 };
00479 
00480 void caxpyCuda(const quda::Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
00481   const int kernel = 8;
00482   blasCuda<caxpy,0,1,0,0>(kernel, make_double2(real(a),imag(a)), make_double2(0.0, 0.0), 
00483                           make_double2(0.0, 0.0), x, y, x, x);
00484 }
00485 
00490 __device__ void caxpby_(const float2 &a, const float4 &x, const float2 &b, float4 &y)                                   
00491   { float4 yy;                                                          
00492   yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;    
00493   yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;    
00494   yy.z = a.x*x.z; yy.z -= a.y*x.w; yy.z += b.x*y.z; yy.z -= b.y*y.w;    
00495   yy.w = a.y*x.z; yy.w += a.x*x.w; yy.w += b.y*y.z; yy.w += b.x*y.w;    
00496   y = yy; }
00497 
00498 __device__ void caxpby_(const float2 &a, const float2 &x, const float2 &b, float2 &y)
00499   { float2 yy;                                                          
00500   yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;    
00501   yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;    
00502   y = yy; }
00503 
00504 __device__ void caxpby_(const double2 &a, const double2 &x, const double2 &b, double2 &y)                                
00505   { double2 yy;                                                         
00506   yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;    
00507   yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;    
00508   y = yy; }
00509 
00510 template <typename Float2, typename FloatN>
00511 struct caxpby {
00512   const Float2 a;
00513   const Float2 b;
00514   caxpby(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00515   __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { caxpby_(a, x, b, y); }
00516   static int streams() { return 3; } 
00517   static int flops() { return 7; } 
00518 };
00519 
00520 void caxpbyCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b, cudaColorSpinorField &y) {
00521   const int kernel = 9;
00522   blasCuda<caxpby,0,1,0,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(), b.imag()), 
00523                            make_double2(0.0, 0.0), x, y, x, x);
00524 }
00525 
00530 __device__ void cxpaypbz_(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z) {
00531   float4 zz;
00532   zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
00533   zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
00534   zz.z = x.z + a.x*y.z; zz.z -= a.y*y.w; zz.z += b.x*z.z; zz.z -= b.y*z.w;
00535   zz.w = x.w + a.y*y.z; zz.w += a.x*y.w; zz.w += b.y*z.z; zz.w += b.x*z.w;
00536   z = zz;
00537 }
00538 
00539 __device__ void cxpaypbz_(const float2 &x, const float2 &a, const float2 &y, const float2 &b, float2 &z) {
00540   float2 zz;
00541   zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
00542   zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
00543   z = zz;
00544 }
00545 
00546 __device__ void cxpaypbz_(const double2 &x, const double2 &a, const double2 &y, const double2 &b, double2 &z) {
00547   double2 zz;
00548   zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
00549   zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
00550   z = zz;
00551 }
00552 
00553 template <typename Float2, typename FloatN>
00554 struct cxpaypbz {
00555   const Float2 a;
00556   const Float2 b;
00557   cxpaypbz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00558   __device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, FloatN &w) 
00559   { cxpaypbz_(x, a, y, b, z); }
00560   static int streams() { return 4; } 
00561   static int flops() { return 8; } 
00562 };
00563 
00564 void cxpaypbzCuda(cudaColorSpinorField &x, const quda::Complex &a, cudaColorSpinorField &y, 
00565                   const quda::Complex &b, cudaColorSpinorField &z) {
00566   const int kernel = 10;
00567   blasCuda<cxpaypbz,0,0,1,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(), b.imag()), 
00568                              make_double2(0.0, 0.0), x, y, z, z);
00569 }
00570 
00574 template <typename Float2, typename FloatN>
00575 struct axpyBzpcx {
00576   const Float2 a;
00577   const Float2 b;
00578   const Float2 c;
00579   axpyBzpcx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
00580   __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
00581   { y += a.x*x; x = b.x*z + c.x*x; }
00582   static int streams() { return 5; } 
00583   static int flops() { return 10; } 
00584 };
00585 
00586 void axpyBzpcxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y, const double &b, 
00587                    cudaColorSpinorField& z, const double &c) {
00588   const int kernel = 11;
00589   blasCuda<axpyBzpcx,1,1,0,0>(kernel, make_double2(a,0.0), make_double2(b,0.0), make_double2(c,0.0), 
00590                               x, y, z, x);
00591 }
00592 
00596 template <typename Float2, typename FloatN>
00597 struct axpyZpbx {
00598   const Float2 a;
00599   const Float2 b;
00600   axpyZpbx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00601   __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
00602   { y += a.x*x; x = z + b.x*x; }
00603   static int streams() { return 5; } 
00604   static int flops() { return 8; } 
00605 };
00606 
00607 void axpyZpbxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y,
00608                   cudaColorSpinorField& z, const double &b) {
00609   const int kernel = 12;
00610   // swap arguments around 
00611   blasCuda<axpyZpbx,1,1,0,0>(kernel, make_double2(a,0.0), make_double2(b,0.0), make_double2(0.0,0.0),
00612                              x, y, z, x);
00613 }
00614 
00618 template <typename Float2, typename FloatN>
00619 struct caxpbypzYmbw {
00620   const Float2 a;
00621   const Float2 b;
00622   caxpbypzYmbw(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00623   __device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, const FloatN &w)
00624   { caxpy_(a, x, z); caxpy_(b, y, z); caxpy_(-b, w, y); }
00625 
00626   static int streams() { return 6; } 
00627   static int flops() { return 12; } 
00628 };
00629 
00630 void caxpbypzYmbwCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b, 
00631                       cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w) {
00632   const int kernel = 13;
00633   blasCuda<caxpbypzYmbw,0,1,1,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(), b.imag()), 
00634                                  make_double2(0.0,0.0), x, y, z, w);
00635 }
00636 
00640 template <typename Float2, typename FloatN>
00641 struct cabxpyAx {
00642   const Float2 a;
00643   const Float2 b;
00644   cabxpyAx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00645   __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) 
00646   { x *= a.x; caxpy_(b, x, y); }
00647   static int streams() { return 4; } 
00648   static int flops() { return 5; } 
00649 };
00650 
00651 void cabxpyAxCuda(const double &a, const quda::Complex &b, 
00652                   cudaColorSpinorField &x, cudaColorSpinorField &y) {
00653   const int kernel = 14;
00654   // swap arguments around 
00655   blasCuda<cabxpyAx,1,1,0,0>(kernel, make_double2(a,0.0), make_double2(b.real(),b.imag()), 
00656                              make_double2(0.0,0.0), x, y, x, x);
00657 }
00658 
00662 template <typename Float2, typename FloatN>
00663 struct caxpbypz {
00664   const Float2 a;
00665   const Float2 b;
00666   caxpbypz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
00667   __device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, const FloatN &w) 
00668   { caxpy_(a, x, z); caxpy_(b, y, z); }
00669   static int streams() { return 4; } 
00670   static int flops() { return 8; }
00671 };
00672 
00673 void caxpbypzCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b, 
00674                   cudaColorSpinorField &y, cudaColorSpinorField &z) {
00675   const int kernel = 15;
00676   blasCuda<caxpbypz,0,0,1,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(),b.imag()), 
00677                              make_double2(0.0,0.0), x, y, z, z);
00678 }
00679 
00683 template <typename Float2, typename FloatN>
00684 struct caxpbypczpw {
00685   const Float2 a;
00686   const Float2 b;
00687   const Float2 c;
00688   caxpbypczpw(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
00689   __device__ void operator()(const FloatN &x, const FloatN &y, const FloatN &z, FloatN &w) 
00690   { caxpy_(a, x, w); caxpy_(b, y, w); caxpy_(c, z, w); }
00691 
00692   static int streams() { return 4; } 
00693   static int flops() { return 12; }
00694 };
00695 
00696 void caxpbypczpwCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b, 
00697                      cudaColorSpinorField &y, const quda::Complex &c, cudaColorSpinorField &z, 
00698                      cudaColorSpinorField &w) {
00699   const int kernel = 16;
00700   blasCuda<caxpbypczpw,0,0,0,1>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(),b.imag()), 
00701                                 make_double2(c.real(), c.imag()), x, y, z, w);
00702 }
00703 
00710 template <typename Float2, typename FloatN>
00711 struct caxpyxmaz {
00712   Float2 a;
00713   caxpyxmaz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
00714   __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) 
00715   { caxpy_(a, x, y); x-= a.x*z; }
00716   static int streams() { return 5; } 
00717   static int flops() { return 8; } 
00718 };
00719 
00720 void caxpyXmazCuda(const quda::Complex &a, cudaColorSpinorField &x, 
00721                           cudaColorSpinorField &y, cudaColorSpinorField &z) {
00722   const int kernel = 17;
00723   blasCuda<caxpyxmaz,1,1,0,0>(kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), 
00724                               make_double2(0.0, 0.0), x, y, z, x);
00725 }
00726 
00727 
00731 __device__ double norm2_(const double2 &a) { return a.x*a.x + a.y*a.y; }
00732 __device__ float norm2_(const float2 &a) { return a.x*a.x + a.y*a.y; }
00733 __device__ float norm2_(const float4 &a) { return a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w; }
00734 
00735 template <typename ReduceType, typename Float2, typename FloatN>
00736 struct Norm2 {
00737   Norm2(const Float2 &a, const Float2 &b) { ; }
00738   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += norm2_(x); }
00739   static int streams() { return 1; } 
00740   static int flops() { return 2; } 
00741 };
00742 
00743 #include <reduce_core.h>
00744 
00745 double normCuda(const cudaColorSpinorField &x) {
00746   const int kernel = 18;
00747   cudaColorSpinorField &y = (cudaColorSpinorField&)x; // FIXME
00748   return reduceCuda<double,QudaSumFloat,QudaSumFloat,Norm2,0,0,0>
00749     (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
00750 }
00751 
00755 __device__ double dot_(const double2 &a, const double2 &b) { return a.x*b.x + a.y*b.y; }
00756 __device__ float dot_(const float2 &a, const float2 &b) { return a.x*b.x + a.y*b.y; }
00757 __device__ float dot_(const float4 &a, const float4 &b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }
00758 
00759 template <typename ReduceType, typename Float2, typename FloatN>
00760 struct Dot {
00761   Dot(const Float2 &a, const Float2 &b) { ; }
00762   __device__ void operator()(ReduceType &sum, const FloatN &x, const FloatN &y, const FloatN &z, 
00763                              const FloatN &w, const FloatN &v) { sum += dot_(x,y); }
00764   static int streams() { return 2; } 
00765   static int flops() { return 2; } 
00766 };
00767 
00768 double reDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00769   const int kernel = 19;
00770   return reduceCuda<double,QudaSumFloat,QudaSumFloat,Dot,0,0,0>
00771     (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
00772 }
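A usage sketch combining the blas and reduction entry points in this file; the fields are assumed to exist already and the update is a schematic solver step, not code taken from QUDA's solvers:

// Illustrative only: typical pattern in an iterative solver built on these kernels.
void exampleIteration(cudaColorSpinorField &x, cudaColorSpinorField &r,
                      cudaColorSpinorField &p, double alpha) {
  double r2 = normCuda(r);             // ||r||^2, reduced on the device
  axpyCuda(alpha, p, x);               // x += alpha * p
  axpyCuda(-alpha, p, r);              // schematic residual update (stands in for r -= alpha * A p)
  double rp = reDotProductCuda(r, p);  // Re<r, p> over the full field
  (void)r2; (void)rp;                  // would feed the solver's stopping test / beta computation
}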
00773 
00778 template <typename ReduceType, typename Float2, typename FloatN>
00779 struct axpyNorm2 {
00780   Float2 a;
00781   axpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
00782   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { 
00783     y += a.x*x; sum += norm2_(y); }
00784   static int streams() { return 3; } 
00785   static int flops() { return 4; } 
00786 };
00787 
00788 double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
00789   const int kernel = 20;
00790   return reduceCuda<double,QudaSumFloat,QudaSumFloat,axpyNorm2,0,1,0>
00791     (kernel, make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
00792 }
00793 
00798 template <typename ReduceType, typename Float2, typename FloatN>
00799 struct xmyNorm2 {
00800   xmyNorm2(const Float2 &a, const Float2 &b) { ; }
00801   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { 
00802     y = x - y; sum += norm2_(y); }
00803   static int streams() { return 3; } 
00804   static int flops() { return 3; } 
00805 };
00806 
00807 double xmyNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00808   const int kernel = 21;
00809   return reduceCuda<double,QudaSumFloat,QudaSumFloat,xmyNorm2,0,1,0>
00810     (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
00811 }
00812 
00817 template <typename ReduceType, typename Float2, typename FloatN>
00818 struct caxpyNorm2 {
00819   Float2 a;
00820   caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
00821   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { 
00822     caxpy_(a, x, y); sum += norm2_(y); }
00823   static int streams() { return 3; } 
00824   static int flops() { return 6; } 
00825 };
00826 
00827 double caxpyNormCuda(const quda::Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
00828   const int kernel = 22;
00829   return reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyNorm2,0,1,0>
00830     (kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, x, x, x);
00831 }
00832 
00840 template <typename ReduceType, typename Float2, typename FloatN>
00841 struct caxpyxmaznormx {
00842   Float2 a;
00843   caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
00844   __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { caxpy_(a, x, y); x-= a.x*z; sum += norm2_(x); }
00845   static int streams() { return 5; } 
00846   static int flops() { return 10; } 
00847 };
00848 
00849 double caxpyXmazNormXCuda(const quda::Complex &a, cudaColorSpinorField &x, 
00850                           cudaColorSpinorField &y, cudaColorSpinorField &z) {
00851   const int kernel = 23;
00852   return reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyxmaznormx,1,1,0>
00853     (kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, z, x, x);
00854 }
00855 
00863 template <typename ReduceType, typename Float2, typename FloatN>
00864 struct cabxpyaxnorm {
00865   Float2 a;
00866   Float2 b;
00867   cabxpyaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
00868   __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { x *= a.x; caxpy_(b, x, y); sum += norm2_(y); }
00869   static int streams() { return 4; } 
00870   static int flops() { return 10; } 
00871 };
00872 
00873 double cabxpyAxNormCuda(const double &a, const quda::Complex &b, 
00874                         cudaColorSpinorField &x, cudaColorSpinorField &y) {
00875   const int kernel = 24;
00876   return reduceCuda<double,QudaSumFloat,QudaSumFloat,cabxpyaxnorm,1,1,0>
00877     (kernel, make_double2(a, 0.0), make_double2(b.real(), b.imag()), x, y, x, x, x);
00878 }
00879 
00883 __device__ double2 cdot_(const double2 &a, const double2 &b) 
00884 { return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
00885 __device__ double2 cdot_(const float2 &a, const float2 &b) 
00886 { return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
00887 __device__ double2 cdot_(const float4 &a, const float4 &b) 
00888 { return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z); }
00889 
00890 template <typename ReduceType, typename Float2, typename FloatN>
00891 struct Cdot {
00892   Cdot(const Float2 &a, const Float2 &b) { ; }
00893   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += cdot_(x,y); }
00894   static int streams() { return 2; } 
00895   static int flops() { return 4; } 
00896 };
00897 
00898 quda::Complex cDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00899   const int kernel = 25;
00900   double2 cdot = reduceCuda<double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0>
00901     (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
00902   return quda::Complex(cdot.x, cdot.y);
00903 }
00904 
00911 template <typename ReduceType, typename Float2, typename FloatN>
00912 struct xpaycdotzy {
00913   Float2 a;
00914   xpaycdotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
00915   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { y = x + a.x*y; sum += cdot_(z,y); }
00916   static int streams() { return 4; } 
00917   static int flops() { return 6; } 
00918 };
00919 
00920 quda::Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y, cudaColorSpinorField &z) {
00921   const int kernel = 26;
00922   double2 cdot = reduceCuda<double2,QudaSumFloat2,QudaSumFloat,xpaycdotzy,0,1,0>
00923     (kernel, make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
00924   return quda::Complex(cdot.x, cdot.y);
00925 }
00926 
00933 template <typename ReduceType, typename Float2, typename FloatN>
00934 struct caxpydotzy {
00935   Float2 a;
00936   caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
00937   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { caxpy_(a, x, y); sum += cdot_(z,y); }
00938   static int streams() { return 4; } 
00939   static int flops() { return 8; } 
00940 };
00941 
00942 quda::Complex caxpyDotzyCuda(const quda::Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y,
00943                        cudaColorSpinorField &z) {
00944   const int kernel = 27;
00945   double2 cdot = reduceCuda<double2,QudaSumFloat2,QudaSumFloat,caxpydotzy,0,1,0>
00946     (kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, z, x, x);
00947   return quda::Complex(cdot.x, cdot.y);
00948 }
00949 
00954 __device__ double3 cdotNormA_(const double2 &a, const double2 &b) 
00955 { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
00956 __device__ double3 cdotNormA_(const float2 &a, const float2 &b) 
00957 { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
00958 __device__ double3 cdotNormA_(const float4 &a, const float4 &b) 
00959 { return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, 
00960                       a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
00961                       a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }
00962 
00963 template <typename ReduceType, typename Float2, typename FloatN>
00964 struct CdotNormA {
00965   CdotNormA(const Float2 &a, const Float2 &b) { ; }
00966   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += cdotNormA_(x,y); }
00967   static int streams() { return 2; } 
00968   static int flops() { return 6; } 
00969 };
00970 
00971 double3 cDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00972   const int kernel = 28;
00973   return reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormA,0,0,0>
00974     (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
00975 }
00976 
00981 __device__ double3 cdotNormB_(const double2 &a, const double2 &b) 
00982 { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
00983 __device__ double3 cdotNormB_(const float2 &a, const float2 &b) 
00984 { return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
00985 __device__ double3 cdotNormB_(const float4 &a, const float4 &b) 
00986 { return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
00987                       b.x*b.x + b.y*b.y + b.z*b.z + b.w*b.w); }
00988 
00989 template <typename ReduceType, typename Float2, typename FloatN>
00990 struct CdotNormB {
00991   CdotNormB(const Float2 &a, const Float2 &b) { ; }
00992   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += cdotNormB_(x,y); }
00993   static int streams() { return 2; } 
00994   static int flops() { return 6; } 
00995 };
00996 
00997 double3 cDotProductNormBCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
00998   const int kernel = 29;
00999   return reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormB,0,0,0>
01000     (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
01001 }
01002 
01007 template <typename ReduceType, typename Float2, typename FloatN>
01008 struct caxpbypzYmbwcDotProductUYNormY {
01009   Float2 a;
01010   Float2 b;
01011   caxpbypzYmbwcDotProductUYNormY(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
01012   __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, FloatN &z, const FloatN &w, const FloatN &v) { caxpy_(a, x, z); caxpy_(b, y, z); caxpy_(-b, w, y); sum += cdotNormB_(v,y); }
01013   static int streams() { return 7; } 
01014   static int flops() { return 18; } 
01015 };
01016 
01017 double3 caxpbypzYmbwcDotProductUYNormYCuda(const quda::Complex &a, cudaColorSpinorField &x, 
01018                                            const quda::Complex &b, cudaColorSpinorField &y,
01019                                            cudaColorSpinorField &z, cudaColorSpinorField &w,
01020                                            cudaColorSpinorField &u) {
01021   const int kernel = 30;
01022   return reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1>
01023     (kernel, make_double2(a.real(), a.imag()), make_double2(b.real(), b.imag()), x, y, z, w, u);
01024 }