template <int N, typename ReduceType, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
struct MultiReduceArg {

  // ...

  MultiReduceArg(SpinorX X[N], SpinorY Y[N], SpinorZ Z[N], SpinorW W[N], SpinorV V[N],
                 Reducer r, ReduceType *partial, ReduceType *complete, int length)
    : r(r), length(length) {
    for (int i=0; i<N; ++i) {
      // ...
    }
    this->partial = partial;
    this->complete = complete;
  }
};
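// MultiReduceArg bundles everything the kernel needs for N simultaneous reductions:
// the per-reduction spinor accessors (elided members above), the reducer functor r,
// the vector length, and two global buffers -- partial, which receives one ReduceType
// value per (reduction, thread block) pair, and complete, which receives the N fully
// reduced results that the host reads back (these point into d_reduce / hd_reduce,
// as seen in the MultiReduceCuda constructor further below).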
template <int block_size, int N, typename ReduceType, typename ReduceSimpleType,
          typename FloatN, int M, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
__global__ void multiReduceKernel(MultiReduceArg<N,ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg) {

  unsigned int tid = threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  ReduceType sum[N]; // per-thread accumulators, one per reduction

  // First pass: every thread grid-strides through the vectors and accumulates into sum[i].
  for (int i=0; i<N; ++i) {
    // ...
    unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
    FloatN x[M], y[M], z[M], w[M], v[M];

    while (id < arg.length) {
      // ... (load x, y, z, w, v from the i-th set of spinors at site id)
#if (__COMPUTE_CAPABILITY__ >= 200)
      // ...
#endif
      for (int j=0; j<M; j++) arg.r(sum[i], x[j], y[j], z[j], w[j], v[j]);
#if (__COMPUTE_CAPABILITY__ >= 200)
      // ...
#endif
      // ...
      id += gridSize;
    }
  }

  // Reduce each accumulator across the thread block using shared memory.
  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;

  for (int i=0; i<N; ++i) {
    if (tid >= warpSize) copytoshared(s, 0, sum[i], block_size);
    // ...
    for (int j=warpSize; j<block_size; j+=warpSize) add<ReduceType>(sum[i], s, j, block_size);
    warpReduce<block_size>(s, sum[i]);
    // ... (thread 0 copies the block's reduced value into tmp and stores it)
    arg.partial[i*gridDim.x + blockIdx.x] = tmp;
  }
  // Each block bumps a global counter when it finishes; the returned value tells the
  // last block to arrive (the elided lines set a shared isLastBlockDone flag from it)
  // that it should complete the reduction.
  unsigned int value = atomicInc(&count, gridDim.x);
  // ...

  // Second pass, executed by the last block only: accumulate the per-block partials.
  for (int i=0; i<N; ++i) {
    unsigned int id = threadIdx.x;
    // ...
    while (id < gridDim.x) {
      sum[i] += arg.partial[i*gridDim.x + id];
      // ...
    }
  }

  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;

  for (int i=0; i<N; ++i) {
    if (tid >= warpSize) copytoshared(s, 0, sum[i], block_size);
    // ...
    for (int j=warpSize; j<block_size; j+=warpSize) { add<ReduceType>(sum[i], s, j, block_size); }
    warpReduce<block_size>(s, sum[i]);
    // ...
    arg.complete[i] = tmp;
  }

  // Reset the counter so the next reduction kernel starts from zero.
  if (threadIdx.x == 0) count = 0;
}
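// The kernel above is an instance of the classic two-pass "threadfence reduction"
// pattern: each block reduces its share in shared memory and writes one partial
// value to global memory, an atomic counter records how many blocks have retired,
// and the last block to retire sums the partials into the final result.  Below is
// a minimal self-contained sketch of that pattern for a plain float sum -- the
// names are hypothetical and it is not QUDA code; it assumes a power-of-two block
// size, dynamic shared memory of blockDim.x*sizeof(float), and a partialBuf with
// at least gridDim.x entries.

__device__ unsigned int retirementCount = 0;

__global__ void sumTwoPass(const float *in, float *partialBuf, float *out, unsigned int n)
{
  extern __shared__ float smem[];
  unsigned int tid = threadIdx.x;

  // Pass 1: grid-stride accumulation, then a block-level tree reduction.
  float sum = 0.0f;
  for (unsigned int i = blockIdx.x*blockDim.x + tid; i < n; i += gridDim.x*blockDim.x)
    sum += in[i];
  smem[tid] = sum;
  __syncthreads();
  for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
    if (tid < s) smem[tid] += smem[tid + s];
    __syncthreads();
  }
  if (tid == 0) partialBuf[blockIdx.x] = smem[0];

  // Make the partial result visible to other blocks before signalling completion.
  __threadfence();

  __shared__ bool lastBlock;
  if (tid == 0) lastBlock = (atomicInc(&retirementCount, gridDim.x) == gridDim.x - 1);
  __syncthreads();

  // Pass 2: only the last block to retire combines the per-block partials.
  if (lastBlock) {
    float total = 0.0f;
    for (unsigned int i = tid; i < gridDim.x; i += blockDim.x) total += partialBuf[i];
    smem[tid] = total;
    __syncthreads();
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
      if (tid < s) smem[tid] += smem[tid + s];
      __syncthreads();
    }
    if (tid == 0) { *out = smem[0]; retirementCount = 0; }
  }
}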
template <int N, typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW,
          typename SpinorV, typename Reducer>
void multiReduceLaunch(doubleN result[],
                       MultiReduceArg<N,ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> &arg,
                       const TuneParam &tp, const cudaStream_t &stream) {

  // ...
  LAUNCH_KERNEL(multiReduceKernel, tp, stream, arg, N, ReduceType, ReduceSimpleType, FloatN, M);

#if (defined(_MSC_VER) && defined(_WIN64) || defined(__LP64__))
  // ... (zero-copy path: the host just waits for the kernel to signal completion)
  cudaEventRecord(reduceEnd, stream);
  while (cudaSuccess != cudaEventQuery(reduceEnd)) { }
  // ...
#endif
  // ... (fallback path: copy the completed results back from device memory)
  { cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType)*N, cudaMemcpyDeviceToHost); }
  // ...

  // Convert the N completed ReduceType values into the caller's doubleN results.
  memset(result, 0, N*sizeof(doubleN));
  for (int i=0; i<N; ++i) result[i] += ((ReduceType*)h_reduce)[i];

  // ...
  const int Nreduce = N*(sizeof(doubleN)/sizeof(double));
  // ...
}
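// Nreduce flattens the results into plain doubles for the cross-process sum: for
// example, with doubleN = double3 and N = 4 reductions there are 12 doubles to
// combine, presumably via the elided reduceDoubleArray((double*)result, Nreduce)
// call that follows in the full source.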
template <int N, typename doubleN, typename ReduceType, typename ReduceSimpleType,
          typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename SpinorV, typename Reducer>
class MultiReduceCuda : public Tunable {

  // ...
  mutable MultiReduceArg<N,ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg;
  // ...

  // Host backups of the field (and norm) data, used to restore state around tuning runs.
  char *X_h[N], *Y_h[N], *Z_h[N], *W_h[N], *V_h[N];
  char *Xnorm_h[N], *Ynorm_h[N], *Znorm_h[N], *Wnorm_h[N], *Vnorm_h[N];

  unsigned int sharedBytesPerThread() const { return sizeof(ReduceType); }

  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    // ...
    return 2*warpSize*sizeof(ReduceType);
  }

  virtual bool advanceSharedBytes(TuneParam &param) const {
    // ...
    TuneParam next(param);
    advanceBlockDim(next); // determine the next block size the tuner will try
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
                         sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
    // ...
  }
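  // The tuner reserves one ReduceType of shared memory per thread, with a floor of
  // two warps' worth per block (sharedBytesPerBlock) so the warp-level reduction
  // always has a full buffer; advanceSharedBytes keeps shared_bytes consistent with
  // whichever block size the tuner tries next.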
public:
  MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[],
                  SpinorZ Z[], SpinorW W[], SpinorV V[], Reducer &r, int length)
    : arg(X, Y, Z, W, V, r, (ReduceType*)d_reduce, (ReduceType*)hd_reduce, length), result(result) {
    for (int i=0; i<N; ++i) {
      // ... (initialize the host backup pointers for field i)
    }
  }

  virtual ~MultiReduceCuda() { }

  inline TuneKey tuneKey() const {
    return TuneKey(blasStrings.vol_str, typeid(arg.r).name(), blasStrings.aux_str);
  }

  void apply(const cudaStream_t &stream) {
    // ... (obtain launch parameters tp from the autotuner)
    multiReduceLaunch<N,doubleN,ReduceType,ReduceSimpleType,FloatN,M>(result, arg, tp, stream);
  }
#define BYTES(X) ( arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*arg.X.Stride() )
#define NORM_BYTES(X) ( (arg.X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*arg.length : 0 )

  // ... (before a tuning run: back up every field a trial launch may overwrite)
  for (int i=0; i<N; ++i) {
    arg.X[i].save(&X_h[i], &Xnorm_h[i], BYTES(X[i]), NORM_BYTES(X[i]));
    arg.Y[i].save(&Y_h[i], &Ynorm_h[i], BYTES(Y[i]), NORM_BYTES(Y[i]));
    arg.Z[i].save(&Z_h[i], &Znorm_h[i], BYTES(Z[i]), NORM_BYTES(Z[i]));
    arg.W[i].save(&W_h[i], &Wnorm_h[i], BYTES(W[i]), NORM_BYTES(W[i]));
    arg.V[i].save(&V_h[i], &Vnorm_h[i], BYTES(V[i]), NORM_BYTES(V[i]));
  }

  // ... (after a tuning run: restore the backed-up contents)
  for (int i=0; i<N; ++i) {
    arg.X[i].load(&X_h[i], &Xnorm_h[i], BYTES(X[i]), NORM_BYTES(X[i]));
    arg.Y[i].load(&Y_h[i], &Ynorm_h[i], BYTES(Y[i]), NORM_BYTES(Y[i]));
    arg.Z[i].load(&Z_h[i], &Znorm_h[i], BYTES(Z[i]), NORM_BYTES(Z[i]));
    arg.W[i].load(&W_h[i], &Wnorm_h[i], BYTES(W[i]), NORM_BYTES(W[i]));
    arg.V[i].load(&V_h[i], &Vnorm_h[i], BYTES(V[i]), NORM_BYTES(V[i]));
  }
  long long flops() const {
    return arg.r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*arg.length*M;
  }

  long long bytes() const {
    size_t bytes = N*arg.X[0].Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
    // ...
    return arg.r.streams()*bytes*arg.length;
  }

  int tuningIter() const { return 3; }
  // ...
};
template <int N, typename doubleN, typename ReduceType, typename ReduceSimpleType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer,
          int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
void multiReduceCuda(doubleN result[], const double2& a, const double2& b,
                     std::vector<cudaColorSpinorField*>& x, std::vector<cudaColorSpinorField*>& y,
                     std::vector<cudaColorSpinorField*>& z, std::vector<cudaColorSpinorField*>& w,
                     std::vector<cudaColorSpinorField*>& v) {

  // ... (full-parity fields are handled by recursing on their even and odd halves)
  doubleN evenResult[N];
  doubleN oddResult[N];
  std::vector<cudaColorSpinorField> xp; xp.reserve(N);
  std::vector<cudaColorSpinorField> yp; yp.reserve(N);
  std::vector<cudaColorSpinorField> zp; zp.reserve(N);
  std::vector<cudaColorSpinorField> wp; wp.reserve(N);
  std::vector<cudaColorSpinorField> vp; vp.reserve(N);

  std::vector<cudaColorSpinorField*> xpp; xpp.reserve(N);
  std::vector<cudaColorSpinorField*> ypp; ypp.reserve(N);
  std::vector<cudaColorSpinorField*> zpp; zpp.reserve(N);
  std::vector<cudaColorSpinorField*> wpp; wpp.reserve(N);
  std::vector<cudaColorSpinorField*> vpp; vpp.reserve(N);
  for (int i=0; i<N; ++i) {
    xp.push_back(x[i]->Even());
    yp.push_back(y[i]->Even());
    zp.push_back(z[i]->Even());
    wp.push_back(w[i]->Even());
    vp.push_back(v[i]->Even());

    xpp.push_back(&xp[i]);
    ypp.push_back(&yp[i]);
    zpp.push_back(&zp[i]);
    wpp.push_back(&wp[i]);
    vpp.push_back(&vp[i]);
  }

  multiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, Reducer, writeX,
                  writeY, writeZ, writeW, writeV, siteUnroll>
    (evenResult, a, b, xpp, ypp, zpp, wpp, vpp);
  for (int i=0; i<N; ++i) {
    xp.push_back(x[i]->Odd());
    yp.push_back(y[i]->Odd());
    zp.push_back(z[i]->Odd());
    wp.push_back(w[i]->Odd());
    vp.push_back(v[i]->Odd());

    xpp.push_back(&xp[i]);
    ypp.push_back(&yp[i]);
    zpp.push_back(&zp[i]);
    wpp.push_back(&wp[i]);
    vpp.push_back(&vp[i]);
  }

  multiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, Reducer, writeX,
                  writeY, writeZ, writeW, writeV, siteUnroll>
    (oddResult, a, b, xpp, ypp, zpp, wpp, vpp);

  for (int i=0; i<N; ++i) result[i] = evenResult[i] + oddResult[i];
  // ...
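// For full (even+odd parity) fields the routine therefore recurses once per parity
// and returns the sum of the two half-lattice reductions, so callers always receive
// full-lattice values; the remainder of the function handles single-parity input.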
  for (int i=0; i<N; ++i) {
    // ... (consistency checks on the i-th set of fields)
    if (!x[i]->isNative()) {
      warningQuda("Reductions on non-native fields are not supported\n");
      memset(result, 0, N*sizeof(doubleN));
      // ... (early return)
    }
  }

  memset(result, 0, N*sizeof(doubleN));

  blasStrings.vol_str = x[0]->VolString();
  blasStrings.aux_str = x[0]->AuxString();

  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();

  // ... (dispatch on x[0]->Precision(): double precision first)
  if (x[0]->Nspin() == 4) { // double precision, 4-spinor fields
    const int M = siteUnroll ? 12 : 1;
    // ... (construct Spinor<double2,...> accessors X, Y, Z, W, V for the N field sets)
    for (int i=0; i<N; ++i) {
      // ...
    }
    Reducer<ReduceType, double2, double2> r(a,b);
    MultiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, double2, M,
                    Spinor<double2, double2, double2, M, writeX>, Spinor<double2, double2, double2, M, writeY>,
                    Spinor<double2, double2, double2, M, writeZ>, Spinor<double2, double2, double2, M, writeW>,
                    Spinor<double2, double2, double2, M, writeV>, Reducer<ReduceType, double2, double2> >
      reduce(result, X, Y, Z, W, V, r, reduce_length/(2*M));
    // ... (launch the reduction)
  } else if (x[0]->Nspin() == 1) { // double precision, staggered fields
    const int M = siteUnroll ? 3 : 1;

    Spinor<double2, double2, double2, M, writeX> X[N];
    Spinor<double2, double2, double2, M, writeY> Y[N];
    Spinor<double2, double2, double2, M, writeZ> Z[N];
    Spinor<double2, double2, double2, M, writeW> W[N];
    Spinor<double2, double2, double2, M, writeV> V[N];

    for (int i=0; i<N; ++i) {
      // ... (bind each accessor to the i-th field)
    }

    Reducer<ReduceType, double2, double2> r(a,b);
    MultiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, double2, M,
                    Spinor<double2, double2, double2, M, writeX>, Spinor<double2, double2, double2, M, writeY>,
                    Spinor<double2, double2, double2, M, writeZ>, Spinor<double2, double2, double2, M, writeW>,
                    Spinor<double2, double2, double2, M, writeV>, Reducer<ReduceType, double2, double2> >
      reduce(result, X, Y, Z, W, V, r, reduce_length/(2*M));
    // ... (launch the reduction)
  } else {
    errorQuda("ERROR: nSpin=%d is not supported\n", x[0]->Nspin());
  }
  // ... (single precision)
  if (x[0]->Nspin() == 4) { // single precision, 4-spinor fields
    const int M = siteUnroll ? 6 : 1;
    // ...
    for (int i=0; i<N; ++i) {
      // ... (bind each accessor to the i-th field)
    }
    Reducer<ReduceType, float2, float4> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
    MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float4,M,
                    Spinor<float4,float4,float4,M,writeX>, Spinor<float4,float4,float4,M,writeY>,
                    Spinor<float4,float4,float4,M,writeZ>, Spinor<float4,float4,float4,M,writeW>,
                    Spinor<float4,float4,float4,M,writeV>, Reducer<ReduceType, float2, float4> >
      reduce(result, X, Y, Z, W, V, r, reduce_length/(4*M));
    // ...
  } else if (x[0]->Nspin() == 1) { // single precision, staggered fields
    const int M = siteUnroll ? 3 : 1;
    // ...
    for (int i=0; i<N; ++i) {
      // ... (bind each accessor to the i-th field)
    }
    Reducer<ReduceType, float2, float2> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
    MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float2,M,
                    Spinor<float2,float2,float2,M,writeX>, Spinor<float2,float2,float2,M,writeY>,
                    Spinor<float2,float2,float2,M,writeZ>, Spinor<float2,float2,float2,M,writeW>,
                    Spinor<float2,float2,float2,M,writeV>, Reducer<ReduceType, float2, float2> >
      reduce(result, X, Y, Z, W, V, r, reduce_length/(2*M));
    // ...
  }
  // ... (half precision)
  if (x[0]->Nspin() == 4) { // half precision, 4-spinor fields
    // ...
    for (int i=0; i<N; ++i) {
      // ... (bind each accessor to the i-th field)
    }
    Reducer<ReduceType, float2, float4> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
    MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float4,6,
                    Spinor<float4,float4,short4,6,writeX>, Spinor<float4,float4,short4,6,writeY>,
                    Spinor<float4,float4,short4,6,writeZ>, Spinor<float4,float4,short4,6,writeW>,
                    Spinor<float4,float4,short4,6,writeV>, Reducer<ReduceType, float2, float4> >
      reduce(result, X, Y, Z, W, V, r, y[0]->Volume());
    // ...
  } else if (x[0]->Nspin() == 1) { // half precision, staggered fields
    // ...
    for (int i=0; i<N; ++i) {
      // ... (bind each accessor to the i-th field)
    }
    Reducer<ReduceType, float2, float2> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
    MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float2,3,
                    Spinor<float2,float2,short2,3,writeX>, Spinor<float2,float2,short2,3,writeY>,
                    Spinor<float2,float2,short2,3,writeZ>, Spinor<float2,float2,short2,3,writeW>,
                    Spinor<float2,float2,short2,3,writeV>, Reducer<ReduceType, float2, float2> >
      reduce(result, X, Y, Z, W, V, r, y[0]->Volume());
    // ...
  } else {
    errorQuda("ERROR: nSpin=%d is not supported\n", x[0]->Nspin());
  }
  // Accumulate flop and byte counts for this call.
  for (int i=0; i<N; ++i) {
    // ...
    blas_flops += Reducer<ReduceType,double2,double2>::flops()*(unsigned long long)x[i]->RealLength();
    // ... (blas_bytes is updated similarly from the fields' byte counts)
  }
  // ...
}
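// The Reducer template parameter supplies the actual arithmetic.  From the code above
// it must provide a constructor taking the two double2 coefficients, an operator()
// that folds one element of x, y, z, w, v into the running sum, and flops()/streams()
// counts used for performance accounting.  A minimal hypothetical example (not a QUDA
// functor) for a real dot product of x and y, assuming a two-component FloatN and a
// plain floating-point ReduceType:

template <typename ReduceType, typename Float2, typename FloatN>
struct DotSketch {
  DotSketch(const Float2 &, const Float2 &) { }   // coefficients unused for a plain dot product

  // Accumulate re(x)*re(y) + im(x)*im(y) for one element; z, w, v are ignored here.
  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
                                      FloatN &z, FloatN &w, FloatN &v)
  { sum += (ReduceType)(x.x*y.x + x.y*y.y); }

  static int flops() { return 4; }    // floating-point operations per element, used for blas_flops
  static int streams() { return 2; }  // fields streamed (x and y), used by bytes()
};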