QUDA v0.4.0
A library for QCD on GPUs
#include <stdlib.h>
#include <stdio.h>

#include <quda_internal.h>
#include <blas_quda.h>
#include <color_spinor_field.h>
#include <face_quda.h> // this is where the MPI / QMP dependent code is

#define REDUCE_MAX_BLOCKS 65536

#if (__COMPUTE_CAPABILITY__ >= 130)
#define QudaSumFloat double
#define QudaSumFloat2 double2
#define QudaSumFloat3 double3
#else
#define QudaSumFloat doublesingle
#define QudaSumFloat2 doublesingle2
#define QudaSumFloat3 doublesingle3
#include <double_single.h>
#endif

// These are used for reduction kernels
static QudaSumFloat *d_reduce=0;
static QudaSumFloat *h_reduce=0;
static QudaSumFloat *hd_reduce=0;
static cudaEvent_t reduceEnd;

namespace quda {
  unsigned long long blas_flops;
  unsigned long long blas_bytes;
}

#include <float_vector.h>
#include <texture.h>

#include <tune_quda.h>
#include <typeinfo>

void zeroCuda(cudaColorSpinorField &a) { a.zero(); }

#define checkSpinor(a, b)                                               \
  {                                                                     \
    if (a.Precision() != b.Precision())                                 \
      errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
    if (a.Length() != b.Length())                                       \
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
    if (a.Stride() != b.Stride())                                       \
      errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
  }

// For kernels with precision conversion built in
#define checkSpinorLength(a, b)                                         \
  {                                                                     \
    if (a.Length() != b.Length())                                       \
      errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
    if (a.Stride() != b.Stride())                                       \
      errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
  }

// blasTuning = 1 turns off error checking
static QudaTune blasTuning = QUDA_TUNE_NO;
static QudaVerbosity verbosity = QUDA_SILENT;

static cudaStream_t *blasStream;

static struct {
  int x[QUDA_MAX_DIM];
  int stride;
} blasConstants;

namespace quda {

  void initBlas()
  {
    // reduction buffer size
    size_t bytes = 3*REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat);

    if (!d_reduce) {
      if (cudaMalloc((void**) &d_reduce, bytes) == cudaErrorMemoryAllocation) {
        errorQuda("Error allocating device reduction array");
      }
    }

    // these arrays are actually oversized currently (only needs to be QudaSumFloat3)

    // if the device supports host-mapped memory then use a host-mapped array for the reduction
    if (deviceProp.canMapHostMemory) {
      if (!h_reduce) {
        if (cudaHostAlloc((void**) &h_reduce, bytes, cudaHostAllocMapped) == cudaErrorMemoryAllocation) {
          errorQuda("Error allocating host reduction array");
        }

        // set the matching device pointer
        cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0);
      }

    } else {

      if (!h_reduce) {
        if (cudaMallocHost((void**) &h_reduce, bytes) == cudaErrorMemoryAllocation) {
          errorQuda("Error allocating host reduction array");
        }
      }

      hd_reduce = d_reduce;
    }

    blasStream = &streams[Nstream-1];
    cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);

    checkCudaError();
  }


  void endBlas(void)
  {
    if (d_reduce) {
      cudaFree(d_reduce);
      d_reduce = 0;
    }

    if (h_reduce) {
      cudaFreeHost(h_reduce);
      h_reduce = 0;
    }

    hd_reduce = 0;

    cudaEventDestroy(reduceEnd);
  }

  void setBlasTuning(QudaTune tune, QudaVerbosity verbose)
  {
    blasTuning = tune;
    verbosity = verbose;
  }

} // namespace quda

// FIXME: this should be queried from the device
#if (__COMPUTE_CAPABILITY__ < 200)
#define MAX_BLOCK 512
#else
#define MAX_BLOCK 1024
#endif

template <typename FloatN, int N, typename Output, typename Input>
__global__ void copyKernel(Output Y, Input X, int length) {
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  while (i < length) {
    FloatN x[N];
    X.load(x, i);
    Y.save(x, i);
    i += gridSize;
  }
}

template <typename FloatN, int N, typename Output, typename Input>
class CopyCuda : public Tunable {

private:
  Input &X;
  Output &Y;
  const int length;

  int sharedBytesPerThread() const { return 0; }
  int sharedBytesPerBlock() const { return 0; }

  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock() ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock();
    return false;
  }

public:
  CopyCuda(Output &Y, Input &X, int length) : X(X), Y(Y), length(length) { ; }
  virtual ~CopyCuda() { ; }

  TuneKey tuneKey() const {
    std::stringstream vol, aux;
    vol << blasConstants.x[0] << "x";
    vol << blasConstants.x[1] << "x";
    vol << blasConstants.x[2] << "x";
    vol << blasConstants.x[3];
    aux << "stride=" << blasConstants.stride << ",out_prec=" << Y.Precision() << ",in_prec=" << X.Precision();
    return TuneKey(vol.str(), "copyKernel", aux.str());
  }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, blasTuning, verbosity);
    copyKernel<FloatN, N><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(Y, X, length);
  }

  void preTune() { ; } // no need to save state for copy kernels
  void postTune() { ; } // no need to restore state for copy kernels

  long long flops() const { return 0; }
  long long bytes() const {
    const int Ninternal = (sizeof(FloatN)/sizeof(((FloatN*)0)->x))*N;
    size_t bytes = (X.Precision() + Y.Precision())*Ninternal;
    if (X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
    if (Y.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
    return bytes*length;
  }
};


void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src) {
  if (&src == &dst) return; // aliasing fields
  if (src.Nspin() != 1 && src.Nspin() != 4) errorQuda("nSpin(%d) not supported\n", src.Nspin());

  if (dst.SiteSubset() == QUDA_FULL_SITE_SUBSET || src.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
    copyCuda(dst.Even(), src.Even());
    copyCuda(dst.Odd(), src.Odd());
    return;
  }

  checkSpinorLength(dst, src);

  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = src.X()[d];
  blasConstants.stride = src.Stride();

  // For a given dst precision, there are two non-trivial possibilities for the
  // src precision.

  quda::blas_bytes += src.RealLength()*((int)src.Precision() + (int)dst.Precision());

  Tunable *copy = 0;

  if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
    if (src.Nspin() == 4) {
      SpinorTexture<float4, float4, float4, 6, 0> src_tex(src);
      Spinor<float4, float2, double2, 6> dst_spinor(dst);
      copy = new CopyCuda<float4, 6, Spinor<float4, float2, double2, 6>,
        SpinorTexture<float4, float4, float4, 6, 0> >
        (dst_spinor, src_tex, src.Stride());
    } else { // src.Nspin() == 1
      SpinorTexture<float2, float2, float2, 3, 0> src_tex(src);
      Spinor<float2, float2, double2, 3> dst_spinor(dst);
      copy = new CopyCuda<float2, 3, Spinor<float2, float2, double2, 3>,
        SpinorTexture<float2, float2, float2, 3, 0> >
        (dst_spinor, src_tex, src.Stride());
    }

  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
    if (src.Nspin() == 4) {
      SpinorTexture<float4, float2, double2, 6, 0> src_tex(src);
      Spinor<float4, float4, float4, 6> dst_spinor(dst);
      copy = new CopyCuda<float4, 6, Spinor<float4, float4, float4, 6>,
        SpinorTexture<float4, float2, double2, 6, 0> >
        (dst_spinor, src_tex, src.Stride());
    } else { // src.Nspin() == 1
      SpinorTexture<float2, float2, double2, 3, 0> src_tex(src);
      Spinor<float2, float2, float2, 3> dst_spinor(dst);
      copy = new CopyCuda<float2, 3, Spinor<float2, float2, float2, 3>,
        SpinorTexture<float2, float2, double2, 3, 0> >
        (dst_spinor, src_tex, src.Stride());
    }
  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
    quda::blas_bytes += src.Volume()*sizeof(float);
    if (src.Nspin() == 4) {
      SpinorTexture<float4, float4, short4, 6, 0> src_tex(src);
      Spinor<float4, float4, float4, 6> dst_spinor(dst);
      copy = new CopyCuda<float4, 6, Spinor<float4, float4, float4, 6>,
        SpinorTexture<float4, float4, short4, 6, 0> >
        (dst_spinor, src_tex, src.Stride());
    } else { // nSpin == 1
      SpinorTexture<float2, float2, short2, 3, 0> src_tex(src);
      Spinor<float2, float2, float2, 3> dst_spinor(dst);
      copy = new CopyCuda<float2, 3, Spinor<float2, float2, float2, 3>,
        SpinorTexture<float2, float2, short2, 3, 0> >
        (dst_spinor, src_tex, src.Stride());
    }
  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
    quda::blas_bytes += dst.Volume()*sizeof(float);
    if (src.Nspin() == 4) {
      SpinorTexture<float4, float4, float4, 6, 0> src_tex(src);
      Spinor<float4, float4, short4, 6> dst_spinor(dst);
      copy = new CopyCuda<float4, 6, Spinor<float4, float4, short4, 6>,
        SpinorTexture<float4, float4, float4, 6, 0> >
        (dst_spinor, src_tex, src.Stride());
    } else { // nSpin == 1
      SpinorTexture<float2, float2, float2, 3, 0> src_tex(src);
      Spinor<float2, float2, short2, 3> dst_spinor(dst);
      copy = new CopyCuda<float2, 3, Spinor<float2, float2, short2, 3>,
        SpinorTexture<float2, float2, float2, 3, 0> >
        (dst_spinor, src_tex, src.Stride());
    }
  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
    quda::blas_bytes += src.Volume()*sizeof(float);
    if (src.Nspin() == 4) {
      SpinorTexture<double2, float4, short4, 12, 0> src_tex(src);
      Spinor<double2, double2, double2, 12> dst_spinor(dst);
      copy = new CopyCuda<double2, 12, Spinor<double2, double2, double2, 12>,
        SpinorTexture<double2, float4, short4, 12, 0> >
        (dst_spinor, src_tex, src.Stride());
    } else { // nSpin == 1
      SpinorTexture<double2, float2, short2, 3, 0> src_tex(src);
      Spinor<double2, double2, double2, 3> dst_spinor(dst);
      copy = new CopyCuda<double2, 3, Spinor<double2, double2, double2, 3>,
        SpinorTexture<double2, float2, short2, 3, 0> >
        (dst_spinor, src_tex, src.Stride());
    }
  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
    quda::blas_bytes += dst.Volume()*sizeof(float);
    if (src.Nspin() == 4) {
      SpinorTexture<double2, double2, double2, 12, 0> src_tex(src);
      Spinor<double2, double4, short4, 12> dst_spinor(dst);
      copy = new CopyCuda<double2, 12, Spinor<double2, double4, short4, 12>,
        SpinorTexture<double2, double2, double2, 12, 0> >
        (dst_spinor, src_tex, src.Stride());
    } else { // nSpin == 1
      SpinorTexture<double2, double2, double2, 3, 0> src_tex(src);
      Spinor<double2, double2, short2, 3> dst_spinor(dst);
      copy = new CopyCuda<double2, 3, Spinor<double2, double2, short2, 3>,
        SpinorTexture<double2, double2, double2, 3, 0> >
        (dst_spinor, src_tex, src.Stride());
    }
  }

  if (dst.Precision() != src.Precision()) {
    copy->apply(*blasStream);
    delete copy;
  } else {
    cudaMemcpy(dst.V(), src.V(), dst.Bytes(), cudaMemcpyDeviceToDevice);
    if (dst.Precision() == QUDA_HALF_PRECISION) {
      cudaMemcpy(dst.Norm(), src.Norm(), dst.NormBytes(), cudaMemcpyDeviceToDevice);
      quda::blas_bytes += 2*dst.RealLength()*sizeof(float);
    }
  }

  checkCudaError();
}

#include <blas_core.h>

/**
   Functor to perform the operation y = a*x + b*y
*/
template <typename Float2, typename FloatN>
struct axpby {
  const Float2 a;
  const Float2 b;
  axpby(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = a.x*x + b.x*y; }
  static int streams() { return 3; }
  static int flops() { return 3; }
};

void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y) {
  const int kernel = 2;
  blasCuda<axpby,0,1,0,0>(kernel, make_double2(a, 0.0), make_double2(b, 0.0), make_double2(0.0, 0.0),
                          x, y, x, x);
}

/**
   Functor to perform the operation y += x
*/
template <typename Float2, typename FloatN>
struct xpy {
  xpy(const Float2 &a, const Float2 &b, const Float2 &c) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y += x; }
  static int streams() { return 3; }
  static int flops() { return 1; }
};

void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 3;
  blasCuda<xpy,0,1,0,0>(kernel, make_double2(1.0, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0),
                        x, y, x, x);
}

/**
   Functor to perform the operation y += a*x
*/
template <typename Float2, typename FloatN>
struct axpy {
  const Float2 a;
  axpy(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = a.x*x + y; }
  static int streams() { return 3; }
  static int flops() { return 2; }
};

void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 4;
  blasCuda<axpy,0,1,0,0>(kernel, make_double2(a, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0),
                         x, y, x, x);
}

/**
   Functor to perform the operation y = x + a*y
*/
template <typename Float2, typename FloatN>
struct xpay {
  const Float2 a;
  xpay(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = x + a.x*y; }
  static int streams() { return 3; }
  static int flops() { return 2; }
};

void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y) {
  const int kernel = 5;
  blasCuda<xpay,0,1,0,0>(kernel, make_double2(a,0.0), make_double2(0.0, 0.0), make_double2(0.0, 0.0),
                         x, y, x, x);
}

/**
   Functor to perform the operation y -= x
*/
template <typename Float2, typename FloatN>
struct mxpy {
  mxpy(const Float2 &a, const Float2 &b, const Float2 &c) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y -= x; }
  static int streams() { return 3; }
  static int flops() { return 1; }
};

void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 6;
  blasCuda<mxpy,0,1,0,0>(kernel, make_double2(1.0, 0.0), make_double2(1.0, 0.0),
                         make_double2(0.0, 0.0), x, y, x, x);
}

/**
   Functor to perform the operation x *= a
*/
template <typename Float2, typename FloatN>
struct ax {
  const Float2 a;
  ax(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
  __device__ void operator()(FloatN &x, const FloatN &y, const FloatN &z, const FloatN &w) { x *= a.x; }
  static int streams() { return 2; }
  static int flops() { return 1; }
};

void axCuda(const double &a, cudaColorSpinorField &x) {
  const int kernel = 7;
  blasCuda<ax,1,0,0,0>(kernel, make_double2(a, 0.0), make_double2(0.0, 0.0),
                       make_double2(0.0, 0.0), x, x, x, x);
}

/**
   Functor to perform the operation y += a*x (complex-valued)
*/
__device__ void caxpy_(const float2 &a, const float4 &x, float4 &y) {
  y.x += a.x*x.x; y.x -= a.y*x.y;
  y.y += a.y*x.x; y.y += a.x*x.y;
  y.z += a.x*x.z; y.z -= a.y*x.w;
  y.w += a.y*x.z; y.w += a.x*x.w;
}

__device__ void caxpy_(const float2 &a, const float2 &x, float2 &y) {
  y.x += a.x*x.x; y.x -= a.y*x.y;
  y.y += a.y*x.x; y.y += a.x*x.y;
}

__device__ void caxpy_(const double2 &a, const double2 &x, double2 &y) {
  y.x += a.x*x.x; y.x -= a.y*x.y;
  y.y += a.y*x.x; y.y += a.x*x.y;
}

template <typename Float2, typename FloatN>
struct caxpy {
  const Float2 a;
  caxpy(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { caxpy_(a, x, y); }
  static int streams() { return 3; }
  static int flops() { return 4; }
};

void caxpyCuda(const quda::Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 8;
  blasCuda<caxpy,0,1,0,0>(kernel, make_double2(real(a),imag(a)), make_double2(0.0, 0.0),
                          make_double2(0.0, 0.0), x, y, x, x);
}

/**
   Functor to perform the operation y = a*x + b*y (complex-valued)
*/
__device__ void caxpby_(const float2 &a, const float4 &x, const float2 &b, float4 &y)
{ float4 yy;
  yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
  yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
  yy.z = a.x*x.z; yy.z -= a.y*x.w; yy.z += b.x*y.z; yy.z -= b.y*y.w;
  yy.w = a.y*x.z; yy.w += a.x*x.w; yy.w += b.y*y.z; yy.w += b.x*y.w;
  y = yy; }

__device__ void caxpby_(const float2 &a, const float2 &x, const float2 &b, float2 &y)
{ float2 yy;
  yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
  yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
  y = yy; }

__device__ void caxpby_(const double2 &a, const double2 &x, const double2 &b, double2 &y)
{ double2 yy;
  yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
  yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
  y = yy; }

template <typename Float2, typename FloatN>
struct caxpby {
  const Float2 a;
  const Float2 b;
  caxpby(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { caxpby_(a, x, b, y); }
  static int streams() { return 3; }
  static int flops() { return 7; }
};

void caxpbyCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b, cudaColorSpinorField &y) {
  const int kernel = 9;
  blasCuda<caxpby,0,1,0,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(), b.imag()),
                           make_double2(0.0, 0.0), x, y, x, x);
}

/**
   Functor to perform the operation z = x + a*y + b*z (complex-valued)
*/
__device__ void cxpaypbz_(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z) {
  float4 zz;
  zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
  zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
  zz.z = x.z + a.x*y.z; zz.z -= a.y*y.w; zz.z += b.x*z.z; zz.z -= b.y*z.w;
  zz.w = x.w + a.y*y.z; zz.w += a.x*y.w; zz.w += b.y*z.z; zz.w += b.x*z.w;
  z = zz;
}

__device__ void cxpaypbz_(const float2 &x, const float2 &a, const float2 &y, const float2 &b, float2 &z) {
  float2 zz;
  zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
  zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
  z = zz;
}

__device__ void cxpaypbz_(const double2 &x, const double2 &a, const double2 &y, const double2 &b, double2 &z) {
  double2 zz;
  zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
  zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
  z = zz;
}

template <typename Float2, typename FloatN>
struct cxpaypbz {
  const Float2 a;
  const Float2 b;
  cxpaypbz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, FloatN &w)
  { cxpaypbz_(x, a, y, b, z); }
  static int streams() { return 4; }
  static int flops() { return 8; }
};

void cxpaypbzCuda(cudaColorSpinorField &x, const quda::Complex &a, cudaColorSpinorField &y,
                  const quda::Complex &b, cudaColorSpinorField &z) {
  const int kernel = 10;
  blasCuda<cxpaypbz,0,0,1,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(), b.imag()),
                             make_double2(0.0, 0.0), x, y, z, z);
}

/**
   Functor to perform the pair of operations y += a*x and x = b*z + c*x
*/
template <typename Float2, typename FloatN>
struct axpyBzpcx {
  const Float2 a;
  const Float2 b;
  const Float2 c;
  axpyBzpcx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
  { y += a.x*x; x = b.x*z + c.x*x; }
  static int streams() { return 5; }
  static int flops() { return 10; }
};

void axpyBzpcxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y, const double &b,
                   cudaColorSpinorField& z, const double &c) {
  const int kernel = 11;
  blasCuda<axpyBzpcx,1,1,0,0>(kernel, make_double2(a,0.0), make_double2(b,0.0), make_double2(c,0.0),
                              x, y, z, x);
}

/**
   Functor to perform the pair of operations y += a*x and x = z + b*x
*/
template <typename Float2, typename FloatN>
struct axpyZpbx {
  const Float2 a;
  const Float2 b;
  axpyZpbx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
  { y += a.x*x; x = z + b.x*x; }
  static int streams() { return 5; }
  static int flops() { return 8; }
};

void axpyZpbxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y,
                  cudaColorSpinorField& z, const double &b) {
  const int kernel = 12;
  // swap arguments around
  blasCuda<axpyZpbx,1,1,0,0>(kernel, make_double2(a,0.0), make_double2(b,0.0), make_double2(0.0,0.0),
                             x, y, z, x);
}

/**
   Functor to perform the operations z += a*x + b*y and y -= b*w (complex-valued)
*/
template <typename Float2, typename FloatN>
struct caxpbypzYmbw {
  const Float2 a;
  const Float2 b;
  caxpbypzYmbw(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, const FloatN &w)
  { caxpy_(a, x, z); caxpy_(b, y, z); caxpy_(-b, w, y); }

  static int streams() { return 6; }
  static int flops() { return 12; }
};

void caxpbypzYmbwCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b,
                      cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w) {
  const int kernel = 12;
  blasCuda<caxpbypzYmbw,0,1,1,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(), b.imag()),
                                 make_double2(0.0,0.0), x, y, z, w);
}

/**
   Functor to perform the operations x *= a and y += b*x (complex b)
*/
template <typename Float2, typename FloatN>
struct cabxpyAx {
  const Float2 a;
  const Float2 b;
  cabxpyAx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
  { x *= a.x; caxpy_(b, x, y); }
  static int streams() { return 4; }
  static int flops() { return 5; }
};

void cabxpyAxCuda(const double &a, const quda::Complex &b,
                  cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 14;
  // swap arguments around
  blasCuda<cabxpyAx,1,1,0,0>(kernel, make_double2(a,0.0), make_double2(b.real(),b.imag()),
                             make_double2(0.0,0.0), x, y, x, x);
}

/**
   Functor to perform the operation z += a*x + b*y (complex-valued)
*/
template <typename Float2, typename FloatN>
struct caxpbypz {
  const Float2 a;
  const Float2 b;
  caxpbypz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
  __device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, const FloatN &w)
  { caxpy_(a, x, z); caxpy_(b, y, z); }
  static int streams() { return 4; }
  static int flops() { return 5; }
};

void caxpbypzCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b,
                  cudaColorSpinorField &y, cudaColorSpinorField &z) {
  const int kernel = 15;
  blasCuda<caxpbypz,0,0,1,0>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(),b.imag()),
                             make_double2(0.0,0.0), x, y, z, z);
}

/**
   Functor to perform the operation w += a*x + b*y + c*z (complex-valued)
*/
template <typename Float2, typename FloatN>
struct caxpbypczpw {
  const Float2 a;
  const Float2 b;
  const Float2 c;
  caxpbypczpw(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
  __device__ void operator()(const FloatN &x, const FloatN &y, const FloatN &z, FloatN &w)
  { caxpy_(a, x, w); caxpy_(b, y, w); caxpy_(c, z, w); }

  static int streams() { return 4; }
  static int flops() { return 5; }
};

void caxpbypczpwCuda(const quda::Complex &a, cudaColorSpinorField &x, const quda::Complex &b,
                     cudaColorSpinorField &y, const quda::Complex &c, cudaColorSpinorField &z,
                     cudaColorSpinorField &w) {
  const int kernel = 16;
  blasCuda<caxpbypczpw,0,0,0,1>(kernel, make_double2(a.real(),a.imag()), make_double2(b.real(),b.imag()),
                                make_double2(c.real(), c.imag()), x, y, z, w);
}

/**
   Functor to perform the operations y += a*x and x -= a*z (complex a)
*/
template <typename Float2, typename FloatN>
struct caxpyxmaz {
  Float2 a;
  caxpyxmaz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
  { caxpy_(a, x, y); x -= a.x*z; }
  static int streams() { return 5; }
  static int flops() { return 8; }
};

void caxpyXmazCuda(const quda::Complex &a, cudaColorSpinorField &x,
                   cudaColorSpinorField &y, cudaColorSpinorField &z) {
  const int kernel = 17;
  blasCuda<caxpyxmaz,1,1,0,0>(kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0),
                              make_double2(0.0, 0.0), x, y, z, x);
}


/**
   Return the norm squared of a vector element
*/
__device__ double norm2_(const double2 &a) { return a.x*a.x + a.y*a.y; }
__device__ float norm2_(const float2 &a) { return a.x*a.x + a.y*a.y; }
__device__ float norm2_(const float4 &a) { return a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w; }

template <typename ReduceType, typename Float2, typename FloatN>
struct Norm2 {
  Norm2(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += norm2_(x); }
  static int streams() { return 1; }
  static int flops() { return 2; }
};

#include <reduce_core.h>

double normCuda(const cudaColorSpinorField &x) {
  const int kernel = 18;
  cudaColorSpinorField &y = (cudaColorSpinorField&)x; // FIXME
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,Norm2,0,0,0>
    (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
}

/**
   Return the real dot product of two vector elements
*/
__device__ double dot_(const double2 &a, const double2 &b) { return a.x*b.x + a.y*b.y; }
__device__ float dot_(const float2 &a, const float2 &b) { return a.x*b.x + a.y*b.y; }
__device__ float dot_(const float4 &a, const float4 &b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }

template <typename ReduceType, typename Float2, typename FloatN>
struct Dot {
  Dot(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, const FloatN &y, const FloatN &z,
                             const FloatN &w, const FloatN &v) { sum += dot_(x,y); }
  static int streams() { return 2; }
  static int flops() { return 2; }
};

double reDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 19;
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,Dot,0,0,0>
    (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}

/**
   First performs the operation y += a*x, then returns the norm of y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct axpyNorm2 {
  Float2 a;
  axpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) {
    y += a.x*x; sum += norm2_(y); }
  static int streams() { return 3; }
  static int flops() { return 4; }
};

double axpyNormCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 20;
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,axpyNorm2,0,1,0>
    (kernel, make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}

/**
   First performs the operation y = x - y, then returns the norm of y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct xmyNorm2 {
  xmyNorm2(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) {
    y = x - y; sum += norm2_(y); }
  static int streams() { return 3; }
  static int flops() { return 3; }
};

double xmyNormCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 21;
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,xmyNorm2,0,1,0>
    (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}

/**
   First performs the operation y += a*x (complex a), then returns the norm of y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct caxpyNorm2 {
  Float2 a;
  caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) {
    caxpy_(a, x, y); sum += norm2_(y); }
  static int streams() { return 3; }
  static int flops() { return 6; }
};

double caxpyNormCuda(const quda::Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 22;
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyNorm2,0,1,0>
    (kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, x, x, x);
}

/**
   First performs the operations y += a*x and x -= a*z, then returns the norm of x
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct caxpyxmaznormx {
  Float2 a;
  caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { caxpy_(a, x, y); x -= a.x*z; sum += norm2_(x); }
  static int streams() { return 5; }
  static int flops() { return 10; }
};

double caxpyXmazNormXCuda(const quda::Complex &a, cudaColorSpinorField &x,
                          cudaColorSpinorField &y, cudaColorSpinorField &z) {
  const int kernel = 23;
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,caxpyxmaznormx,1,1,0>
    (kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, z, x, x);
}

/**
   First performs the operations x *= a and y += b*x, then returns the norm of y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct cabxpyaxnorm {
  Float2 a;
  Float2 b;
  cabxpyaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { x *= a.x; caxpy_(b, x, y); sum += norm2_(y); }
  static int streams() { return 4; }
  static int flops() { return 10; }
};

double cabxpyAxNormCuda(const double &a, const quda::Complex &b,
                        cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 24;
  return reduceCuda<double,QudaSumFloat,QudaSumFloat,cabxpyaxnorm,1,1,0>
    (kernel, make_double2(a, 0.0), make_double2(b.real(), b.imag()), x, y, x, x, x);
}

/**
   Returns the complex dot product of x and y
*/
__device__ double2 cdot_(const double2 &a, const double2 &b)
{ return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
__device__ double2 cdot_(const float2 &a, const float2 &b)
{ return make_double2(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x); }
__device__ double2 cdot_(const float4 &a, const float4 &b)
{ return make_double2(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z); }

template <typename ReduceType, typename Float2, typename FloatN>
struct Cdot {
  Cdot(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += cdot_(x,y); }
  static int streams() { return 2; }
  static int flops() { return 4; }
};

quda::Complex cDotProductCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 25;
  double2 cdot = reduceCuda<double2,QudaSumFloat2,QudaSumFloat,Cdot,0,0,0>
    (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
  return quda::Complex(cdot.x, cdot.y);
}

/**
   First performs the operation y = x + a*y, then returns the complex dot product of z and y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct xpaycdotzy {
  Float2 a;
  xpaycdotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { y = x + a.x*y; sum += cdot_(z,y); }
  static int streams() { return 4; }
  static int flops() { return 6; }
};

quda::Complex xpaycDotzyCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y, cudaColorSpinorField &z) {
  const int kernel = 26;
  double2 cdot = reduceCuda<double2,QudaSumFloat2,QudaSumFloat,xpaycdotzy,0,1,0>
    (kernel, make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
  return quda::Complex(cdot.x, cdot.y);
}

/**
   First performs the operation y += a*x, then returns the complex dot product of z and y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct caxpydotzy {
  Float2 a;
  caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { caxpy_(a, x, y); sum += cdot_(z,y); }
  static int streams() { return 4; }
  static int flops() { return 8; }
};

quda::Complex caxpyDotzyCuda(const quda::Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y,
                             cudaColorSpinorField &z) {
  const int kernel = 27;
  double2 cdot = reduceCuda<double2,QudaSumFloat2,QudaSumFloat,caxpydotzy,0,1,0>
    (kernel, make_double2(a.real(), a.imag()), make_double2(0.0, 0.0), x, y, z, x, x);
  return quda::Complex(cdot.x, cdot.y);
}

/**
   Returns the complex dot product of x and y, together with the norm of x
*/
__device__ double3 cdotNormA_(const double2 &a, const double2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
__device__ double3 cdotNormA_(const float2 &a, const float2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, a.x*a.x + a.y*a.y); }
__device__ double3 cdotNormA_(const float4 &a, const float4 &b)
{ return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w,
                      a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
                      a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w); }

template <typename ReduceType, typename Float2, typename FloatN>
struct CdotNormA {
  CdotNormA(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += cdotNormA_(x,y); }
  static int streams() { return 2; }
  static int flops() { return 6; }
};

double3 cDotProductNormACuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 28;
  return reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormA,0,0,0>
    (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}

/**
   Returns the complex dot product of x and y, together with the norm of y
*/
__device__ double3 cdotNormB_(const double2 &a, const double2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
__device__ double3 cdotNormB_(const float2 &a, const float2 &b)
{ return make_double3(a.x*b.x + a.y*b.y, a.x*b.y - a.y*b.x, b.x*b.x + b.y*b.y); }
__device__ double3 cdotNormB_(const float4 &a, const float4 &b)
{ return make_double3(a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, a.x*b.y - a.y*b.x + a.z*b.w - a.w*b.z,
                      b.x*b.x + b.y*b.y + b.z*b.z + b.w*b.w); }

template <typename ReduceType, typename Float2, typename FloatN>
struct CdotNormB {
  CdotNormB(const Float2 &a, const Float2 &b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w, const FloatN &v) { sum += cdotNormB_(x,y); }
  static int streams() { return 2; }
  static int flops() { return 6; }
};

double3 cDotProductNormBCuda(cudaColorSpinorField &x, cudaColorSpinorField &y) {
  const int kernel = 29;
  return reduceCuda<double3,QudaSumFloat3,QudaSumFloat,CdotNormB,0,0,0>
    (kernel, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
}

/**
   Performs the operations z += a*x + b*y and y -= b*w, then returns the complex
   dot product of u and y together with the norm of y
*/
template <typename ReduceType, typename Float2, typename FloatN>
struct caxpbypzYmbwcDotProductUYNormY {
  Float2 a;
  Float2 b;
  caxpbypzYmbwcDotProductUYNormY(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
  __device__ void operator()(ReduceType &sum, const FloatN &x, FloatN &y, FloatN &z, const FloatN &w, const FloatN &v) { caxpy_(a, x, z); caxpy_(b, y, z); caxpy_(-b, w, y); sum += cdotNormB_(v,y); }
  static int streams() { return 7; }
  static int flops() { return 18; }
};

double3 caxpbypzYmbwcDotProductUYNormYCuda(const quda::Complex &a, cudaColorSpinorField &x,
                                           const quda::Complex &b, cudaColorSpinorField &y,
                                           cudaColorSpinorField &z, cudaColorSpinorField &w,
                                           cudaColorSpinorField &u) {
  const int kernel = 30;
  return reduceCuda<double3,QudaSumFloat3,QudaSumFloat,caxpbypzYmbwcDotProductUYNormY,0,1,1>
    (kernel, make_double2(a.real(), a.imag()), make_double2(b.real(), b.imag()), x, y, z, w, u);
}
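
The functor/blasCuda/reduceCuda pattern above is consumed through the plain C++ wrappers (axpyCuda, xpayCuda, axpyNormCuda, ...). The sketch below is illustrative only: it assumes the QUDA headers are on the include path and that the caller has already constructed matching cudaColorSpinorField objects; the helper name cgUpdateSketch is hypothetical and not part of the library. It shows how a CG-style update step would be written against this interface, and why the fused axpyNormCuda call is preferable to a separate axpyCuda followed by normCuda (one kernel and one global-memory pass instead of two).

// Sketch only -- assumes r, p, Ap are already-allocated device fields of
// matching precision and layout (constructed elsewhere, not shown here).
#include <blas_quda.h>
#include <color_spinor_field.h>

// One CG-style update expressed with the routines defined in this file.
// Returns the updated residual norm ||r||^2.
double cgUpdateSketch(double alpha, double beta,
                      cudaColorSpinorField &r,
                      cudaColorSpinorField &p,
                      cudaColorSpinorField &Ap)
{
  // r = r - alpha*Ap, fused with the norm reduction: axpyNormCuda performs
  // y += a*x and returns norm2(y) in a single kernel launch.
  double r2 = axpyNormCuda(-alpha, Ap, r);

  // p = r + beta*p (xpayCuda computes y = x + a*y in place).
  xpayCuda(r, beta, p);

  return r2;
}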