1 __host__ __device__
void zero(
double &
x) { x = 0.0; }
2 __host__ __device__
void zero(double2 &
x) { x.x = 0.0; x.y = 0.0; }
3 __host__ __device__
void zero(double3 &
x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
4 __device__
void copytoshared(
double *
s,
const int i,
const double x,
const int block) { s[i] =
x; }
5 __device__
void copytoshared(
double *
s,
const int i,
const double2
x,
const int block)
6 { s[i] = x.x; s[i+block] = x.y; }
7 __device__
void copytoshared(
double *
s,
const int i,
const double3
x,
const int block)
8 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
9 __device__
void copytoshared(
volatile double *
s,
const int i,
const double x,
const int block) { s[i] =
x; }
10 __device__
void copytoshared(
volatile double *
s,
const int i,
const double2
x,
const int block)
11 { s[i] = x.x; s[i+block] = x.y; }
12 __device__
void copytoshared(
volatile double *
s,
const int i,
const double3
x,
const int block)
13 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
14 __device__
void copyfromshared(
double &
x,
const double *
s,
const int i,
const int block) { x = s[i]; }
15 __device__
void copyfromshared(double2 &
x,
const double *
s,
const int i,
const int block)
16 { x.x = s[i]; x.y = s[i+block]; }
17 __device__
void copyfromshared(double3 &
x,
const double *
s,
const int i,
const int block)
18 { x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }
20 template<
typename ReduceType,
typename ReduceSimpleType>
21 __device__
void add(ReduceType &sum, ReduceSimpleType *
s,
const int i,
const int block) { }
25 { sum.x +=
s[i]; sum.y +=
s[i+block]; }
27 { sum.x +=
s[i]; sum.y +=
s[i+block]; sum.z +=
s[i+2*block]; }
29 template<
typename ReduceType,
typename ReduceSimpleType>
30 __device__
void add(ReduceSimpleType *
s,
const int i,
const int j,
const int block) { }
31 template<
typename ReduceType,
typename ReduceSimpleType>
32 __device__
void add(
volatile ReduceSimpleType *
s,
const int i,
const int j,
const int block) { }
36 template<> __device__
void add<double,double>(
volatile double *
s,
const int i,
const int j,
const int block)
40 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];}
41 template<> __device__
void add<double2,double>(
volatile double *
s,
const int i,
const int j,
const int block)
42 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];}
45 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];
s[i+2*block] +=
s[j+2*block];}
46 template<> __device__
void add<double3,double>(
volatile double *
s,
const int i,
const int j,
const int block)
47 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];
s[i+2*block] +=
s[j+2*block];}
49 #if (__COMPUTE_CAPABILITY__ < 130)
55 { s[i] = x.
x; s[i+block] = x.
y; }
57 { s[i] = x.
x; s[i+block] = x.
y; s[i+2*block] = x.
z; }
60 { s[i].
a.x = x.
x.
a.x; s[i].
a.y = x.
x.
a.y; s[i+block].
a.x = x.
y.
a.x; s[i+block].
a.y = x.
y.
a.y; }
62 { s[i].
a.x = x.
x.
a.x; s[i].
a.y = x.
x.
a.y; s[i+block].
a.x = x.
y.
a.x; s[i+block].
a.y = x.
y.
a.y;
63 s[i+2*block].
a.x = x.
z.
a.x; s[i+2*block].
a.y = x.
z.
a.y; }
66 { x.
x = s[i]; x.
y = s[i+block]; }
68 { x.
x = s[i]; x.
y = s[i+block]; x.
z = s[i+2*block]; }
73 { sum.x +=
s[i]; sum.y +=
s[i+block]; }
75 { sum.x +=
s[i]; sum.y +=
s[i+block]; sum.z +=
s[i+2*block]; }
83 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];}
85 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];}
88 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];
s[i+2*block] +=
s[j+2*block];}
90 {
s[i] +=
s[j];
s[i+block] +=
s[j+block];
s[i+2*block] +=
s[j+2*block];}
93 __device__
unsigned int count = 0;
99 template <
int block_size,
typename ReduceType,
typename ReduceSimpleType,
100 typename FloatN,
int M,
typename SpinorX,
typename SpinorY,
101 typename SpinorZ,
typename SpinorW,
typename SpinorV,
typename Reducer>
102 __global__
void reduceKernel(SpinorX
X, SpinorY Y, SpinorZ
Z, SpinorW W, SpinorV
V, Reducer r,
103 ReduceType *partial, ReduceType *complete,
int length) {
104 unsigned int tid = threadIdx.x;
105 unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
106 unsigned int gridSize = gridDim.x*blockDim.x;
111 FloatN x[M], y[M], z[M], w[M], v[M];
118 #if (__COMPUTE_CAPABILITY__ >= 200)
123 for (
int j=0; j<M; j++) r(sum, x[j], y[j], z[j], w[j], v[j]);
125 #if (__COMPUTE_CAPABILITY__ >= 200)
138 extern __shared__ ReduceSimpleType sdata[];
139 ReduceSimpleType *
s = sdata + tid;
140 if (tid >= warpSize)
copytoshared(s, 0, sum, block_size);
147 for (
int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum,
s, i, block_size); }
150 volatile ReduceSimpleType *sv =
s;
153 if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
154 if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
155 if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
156 if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
157 if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
168 partial[blockIdx.x] =
tmp;
173 unsigned int value = atomicInc(&
count, gridDim.x);
183 unsigned int i = threadIdx.x;
187 while (i < gridDim.x) {
192 extern __shared__ ReduceSimpleType sdata[];
193 ReduceSimpleType *s = sdata + tid;
194 if (tid >= warpSize)
copytoshared(s, 0, sum, block_size);
201 for (
int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum,
s, i, block_size); }
204 volatile ReduceSimpleType *sv =
s;
207 if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
208 if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
209 if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
210 if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
211 if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
218 if (threadIdx.x == 0) {
230 template <
typename doubleN,
typename ReduceType,
typename ReduceSimpleType,
typename FloatN,
231 int M,
typename SpinorX,
typename SpinorY,
typename SpinorZ,
232 typename SpinorW,
typename SpinorV,
typename Reducer>
233 doubleN
reduceLaunch(SpinorX
X, SpinorY Y, SpinorZ
Z, SpinorW W, SpinorV
V, Reducer r,
234 int len,
const TuneParam &tp,
const cudaStream_t &
stream) {
235 ReduceType *part = (ReduceType*)d_reduce;
236 ReduceType *full = (ReduceType*)hd_reduce;
241 if (tp.block.x == 32) {
242 reduceKernel<32,ReduceType,ReduceSimpleType,FloatN,M>
243 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
244 }
else if (tp.block.x == 64) {
245 reduceKernel<64,ReduceType,ReduceSimpleType,FloatN,M>
246 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
247 }
else if (tp.block.x == 96) {
248 reduceKernel<96,ReduceType,ReduceSimpleType,FloatN,M>
249 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
250 }
else if (tp.block.x == 128) {
251 reduceKernel<128,ReduceType,ReduceSimpleType,FloatN,M>
252 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
253 }
else if (tp.block.x == 160) {
254 reduceKernel<160,ReduceType,ReduceSimpleType,FloatN,M>
255 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
256 }
else if (tp.block.x == 192) {
257 reduceKernel<192,ReduceType,ReduceSimpleType,FloatN,M>
258 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
259 }
else if (tp.block.x == 224) {
260 reduceKernel<224,ReduceType,ReduceSimpleType,FloatN,M>
261 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
262 }
else if (tp.block.x == 256) {
263 reduceKernel<256,ReduceType,ReduceSimpleType,FloatN,M>
264 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
265 }
else if (tp.block.x == 288) {
266 reduceKernel<288,ReduceType,ReduceSimpleType,FloatN,M>
267 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
268 }
else if (tp.block.x == 320) {
269 reduceKernel<320,ReduceType,ReduceSimpleType,FloatN,M>
270 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
271 }
else if (tp.block.x == 352) {
272 reduceKernel<352,ReduceType,ReduceSimpleType,FloatN,M>
273 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
274 }
else if (tp.block.x == 384) {
275 reduceKernel<384,ReduceType,ReduceSimpleType,FloatN,M>
276 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
277 }
else if (tp.block.x == 416) {
278 reduceKernel<416,ReduceType,ReduceSimpleType,FloatN,M>
279 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
280 }
else if (tp.block.x == 448) {
281 reduceKernel<448,ReduceType,ReduceSimpleType,FloatN,M>
282 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
283 }
else if (tp.block.x == 480) {
284 reduceKernel<480,ReduceType,ReduceSimpleType,FloatN,M>
285 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
286 }
else if (tp.block.x == 512) {
287 reduceKernel<512,ReduceType,ReduceSimpleType,FloatN,M>
288 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
289 }
else if (tp.block.x == 544) {
290 reduceKernel<544,ReduceType,ReduceSimpleType,FloatN,M>
291 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
292 }
else if (tp.block.x == 576) {
293 reduceKernel<576,ReduceType,ReduceSimpleType,FloatN,M>
294 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
295 }
else if (tp.block.x == 608) {
296 reduceKernel<608,ReduceType,ReduceSimpleType,FloatN,M>
297 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
298 }
else if (tp.block.x == 640) {
299 reduceKernel<640,ReduceType,ReduceSimpleType,FloatN,M>
300 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
301 }
else if (tp.block.x == 672) {
302 reduceKernel<672,ReduceType,ReduceSimpleType,FloatN,M>
303 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
304 }
else if (tp.block.x == 704) {
305 reduceKernel<704,ReduceType,ReduceSimpleType,FloatN,M>
306 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
307 }
else if (tp.block.x == 736) {
308 reduceKernel<736,ReduceType,ReduceSimpleType,FloatN,M>
309 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
310 }
else if (tp.block.x == 768) {
311 reduceKernel<768,ReduceType,ReduceSimpleType,FloatN,M>
312 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
313 }
else if (tp.block.x == 800) {
314 reduceKernel<800,ReduceType,ReduceSimpleType,FloatN,M>
315 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
316 }
else if (tp.block.x == 832) {
317 reduceKernel<832,ReduceType,ReduceSimpleType,FloatN,M>
318 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
319 }
else if (tp.block.x == 864) {
320 reduceKernel<864,ReduceType,ReduceSimpleType,FloatN,M>
321 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
322 }
else if (tp.block.x == 896) {
323 reduceKernel<896,ReduceType,ReduceSimpleType,FloatN,M>
324 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
325 }
else if (tp.block.x == 928) {
326 reduceKernel<928,ReduceType,ReduceSimpleType,FloatN,M>
327 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
328 }
else if (tp.block.x == 960) {
329 reduceKernel<960,ReduceType,ReduceSimpleType,FloatN,M>
330 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
331 }
else if (tp.block.x == 992) {
332 reduceKernel<992,ReduceType,ReduceSimpleType,FloatN,M>
333 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
334 }
else if (tp.block.x == 1024) {
335 reduceKernel<1024,ReduceType,ReduceSimpleType,FloatN,M>
336 <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(
X, Y,
Z, W,
V, r, part, full, len);
338 errorQuda(
"Reduction not implemented for %d threads", tp.block.x);
341 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
343 cudaEventRecord(reduceEnd, stream);
344 while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
347 { cudaMemcpy(h_reduce, hd_reduce,
sizeof(ReduceType), cudaMemcpyDeviceToHost); }
351 cpu_sum += ((ReduceType*)h_reduce)[0];
353 const int Nreduce =
sizeof(doubleN) /
sizeof(
double);
360 template <
typename doubleN,
typename ReduceType,
typename ReduceSimpleType,
typename FloatN,
361 int M,
typename SpinorX,
typename SpinorY,
typename SpinorZ,
362 typename SpinorW,
typename SpinorV,
typename Reducer>
377 char *X_h, *Y_h, *Z_h, *W_h, *V_h;
378 char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
380 int sharedBytesPerThread()
const {
return sizeof(ReduceType); }
384 int sharedBytesPerBlock(
const TuneParam &
param)
const {
386 return 2*warpSize*
sizeof(ReduceType);
389 virtual bool advanceSharedBytes(TuneParam ¶m)
const
391 TuneParam next(param);
392 advanceBlockDim(next);
393 int nthreads = next.block.x * next.block.y * next.block.z;
394 param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
395 sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
400 ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z,
401 SpinorW &W, SpinorV &V, Reducer &r,
int length) :
402 result(result), X(X), Y(Y), Z(Z), W(W), V(V), r(r),
403 X_h(0), Y_h(0), Z_h(0), W_h(0), V_h(0),
404 Xnorm_h(0), Ynorm_h(0), Znorm_h(0), Wnorm_h(0), Vnorm_h(0), length(length)
409 std::stringstream vol, aux;
410 vol << blasConstants.x[0] <<
"x";
411 vol << blasConstants.x[1] <<
"x";
412 vol << blasConstants.x[2] <<
"x";
413 vol << blasConstants.x[3];
414 aux <<
"stride=" << blasConstants.stride <<
",prec=" << X.Precision();
415 return TuneKey(vol.str(),
typeid(r).name(), aux.str());
420 result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M>
421 (X, Y, Z, W, V, r, length, tp,
stream);
425 size_t bytes = X.Precision()*(
sizeof(
FloatN)/
sizeof(((
FloatN*)0)->x))*M*X.Stride();
427 X.save(&X_h, &Xnorm_h, bytes, norm_bytes);
428 Y.save(&Y_h, &Ynorm_h, bytes, norm_bytes);
429 Z.save(&Z_h, &Znorm_h, bytes, norm_bytes);
430 W.save(&W_h, &Wnorm_h, bytes, norm_bytes);
431 V.save(&V_h, &Vnorm_h, bytes, norm_bytes);
435 size_t bytes = X.Precision()*(
sizeof(
FloatN)/
sizeof(((
FloatN*)0)->x))*M*X.Stride();
437 X.load(&X_h, &Xnorm_h, bytes, norm_bytes);
438 Y.load(&Y_h, &Ynorm_h, bytes, norm_bytes);
439 Z.load(&Z_h, &Znorm_h, bytes, norm_bytes);
440 W.load(&W_h, &Wnorm_h, bytes, norm_bytes);
441 V.load(&V_h, &Vnorm_h, bytes, norm_bytes);
448 return r.streams()*bytes*length; }
469 template <
typename doubleN,
typename ReduceType,
typename ReduceSimpleType,
470 template <
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
471 int writeX,
int writeY,
int writeZ,
int writeW,
int writeV,
bool siteUnroll>
472 doubleN
reduceCuda(
const double2 &a,
const double2 &b, cudaColorSpinorField &
x,
473 cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
474 cudaColorSpinorField &v) {
477 reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
478 writeY,writeZ,writeW,writeV,siteUnroll>
479 (a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
481 reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
482 writeY,writeZ,writeW,writeV,siteUnroll>
483 (a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
492 for (
int d=0; d<
QUDA_MAX_DIM; d++) blasConstants.x[d] = x.X()[d];
493 blasConstants.stride = x.Stride();
495 int reduce_length = siteUnroll ? x.RealLength() : x.Length();
500 const int M = siteUnroll ? 12 : 1;
506 Reducer<ReduceType, double2, double2> r(a,b);
507 ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
508 Spinor<double2,double2,double2,M,writeX>,
Spinor<double2,double2,double2,M,writeY>,
509 Spinor<double2,double2,double2,M,writeZ>,
Spinor<double2,double2,double2,M,writeW>,
511 reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
513 }
else if (x.Nspin() == 1){
514 const int M = siteUnroll ? 3 : 1;
520 Reducer<ReduceType, double2, double2> r(a,b);
521 ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
522 Spinor<double2,double2,double2,M,writeX>,
Spinor<double2,double2,double2,M,writeY>,
523 Spinor<double2,double2,double2,M,writeZ>,
Spinor<double2,double2,double2,M,writeW>,
525 reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
527 }
else {
errorQuda(
"ERROR: nSpin=%d is not supported\n", x.Nspin()); }
530 const int M = siteUnroll ? 6 : 1;
536 Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
537 ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,M,
538 Spinor<float4,float4,float4,M,writeX,0>,
Spinor<float4,float4,float4,M,writeY,1>,
539 Spinor<float4,float4,float4,M,writeZ,2>,
Spinor<float4,float4,float4,M,writeW,3>,
541 reduce(value, X, Y, Z, W, V, r, reduce_length/(4*M));
543 }
else if (x.Nspin() == 1) {
544 const int M = siteUnroll ? 3 : 1;
550 Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
551 ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,M,
552 Spinor<float2,float2,float2,M,writeX,0>,
Spinor<float2,float2,float2,M,writeY,1>,
553 Spinor<float2,float2,float2,M,writeZ,2>,
Spinor<float2,float2,float2,M,writeW,3>,
555 reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
557 }
else {
errorQuda(
"ERROR: nSpin=%d is not supported\n", x.Nspin()); }
565 Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
566 ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,
567 Spinor<float4,float4,short4,6,writeX,0>,
Spinor<float4,float4,short4,6,writeY,1>,
568 Spinor<float4,float4,short4,6,writeZ,2>,
Spinor<float4,float4,short4,6,writeW,3>,
570 reduce(value, X, Y, Z, W, V, r, y.Volume());
572 }
else if (x.Nspin() == 1) {
578 Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
579 ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,
580 Spinor<float2,float2,short2,3,writeX,0>,
Spinor<float2,float2,short2,3,writeY,1>,
581 Spinor<float2,float2,short2,3,writeZ,2>,
Spinor<float2,float2,short2,3,writeW,3>,
583 reduce(value, X, Y, Z, W, V, r, y.Volume());
585 }
else {
errorQuda(
"ERROR: nSpin=%d is not supported\n", x.Nspin()); }
589 blas_flops += Reducer<ReduceType,double2,double2>::flops()*(
unsigned long long)x.RealLength();