QUDA v0.4.0: A library for QCD on GPUs
quda/lib/reduce_core.h
// Zero-initialize reduction accumulators.
__host__ __device__ void zero(double &x) { x = 0.0; }
__host__ __device__ void zero(double2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(double3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }

// Stage a per-thread partial sum into shared memory.  Vector components are
// stored a full block apart, so each component occupies a contiguous stretch.
__device__ void copytoshared(double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(volatile double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(volatile double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }

// Read a partial sum back out of shared memory.
__device__ void copyfromshared(double &x, const double *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(double2 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(double3 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

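/*
  Layout note: for a double3 reduction with block size B the shared array is
  laid out struct-of-arrays style, [x0..x_{B-1} | y0..y_{B-1} | z0..z_{B-1}],
  rather than array-of-structs, so each component stays contiguous.  A minimal
  round-trip sketch (illustrative only, not part of this header; launch with
  3*blockDim.x*sizeof(double) bytes of dynamic shared memory):

    __global__ void roundtrip(double3 *out) {
      extern __shared__ double sdata[];
      double3 v = make_double3(1.0, 2.0, 3.0);
      copytoshared(sdata, threadIdx.x, v, blockDim.x);  // scatter components
      __syncthreads();
      double3 w;
      copyfromshared(w, sdata, threadIdx.x, blockDim.x); // gather them back
      if (threadIdx.x == 0) out[blockIdx.x] = w;
    }
*/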

// Accumulate a partial sum from shared memory into a register accumulator.
// The unspecialized template is an intentional no-op; the supported
// (ReduceType, ReduceSimpleType) pairs are specialized below.
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceType &sum, ReduceSimpleType *s, const int i, const int block) { }
template<> __device__ void add<double,double>(double &sum, double *s, const int i, const int block)
{ sum += s[i]; }
template<> __device__ void add<double2,double>(double2 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<double3,double>(double3 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

// Pairwise in-place reduction within shared memory: s[i] += s[j], one
// component at a time.  The volatile overloads are used for the final
// intra-warp stage, where stores must not be cached in registers.
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceSimpleType *s, const int i, const int j, const int block) { }
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(volatile ReduceSimpleType *s, const int i, const int j, const int block) { }

template<> __device__ void add<double,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }
template<> __device__ void add<double,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<double2,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<double2,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<double3,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<double3,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }

// On devices without native double precision (compute capability < 1.3),
// reductions accumulate in "double-single" arithmetic: each value is
// evidently carried as a pair of floats (the .a member below).
#if (__COMPUTE_CAPABILITY__ < 130)
__host__ __device__ void zero(doublesingle &x) { x = 0.0; }
__host__ __device__ void zero(doublesingle2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(doublesingle3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle x, const int block) { s[i] = x; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
// The volatile overloads copy the float pair member by member, since the
// doublesingle assignment operator cannot be applied through volatile.
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle x, const int block) { s[i].a.x = x.a.x; s[i].a.y = x.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y;
  s[i+2*block].a.x = x.z.a.x; s[i+2*block].a.y = x.z.a.y; }
__device__ void copyfromshared(doublesingle &x, const doublesingle *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(doublesingle2 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(doublesingle3 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

template<> __device__ void add<doublesingle,doublesingle>(doublesingle &sum, doublesingle *s, const int i, const int block)
{ sum += s[i]; }
template<> __device__ void add<doublesingle2,doublesingle>(doublesingle2 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<doublesingle3,doublesingle>(doublesingle3 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

template<> __device__ void add<doublesingle,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; }
template<> __device__ void add<doublesingle,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<doublesingle2,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<doublesingle2,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<doublesingle3,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<doublesingle3,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
#endif
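/*
  For reference, the core of double-single addition is the classical
  Dekker/Knuth two-sum.  A minimal float2-based sketch of the idea (the real
  doublesingle operators are defined elsewhere in QUDA; this is illustrative
  only, assuming a (hi, lo) float-pair representation and IEEE float adds
  without reassociation):

    __host__ __device__ float2 ds_add(const float2 a, const float2 b) {
      float t1 = a.x + b.x;                  // high-order sum
      float e  = t1 - a.x;                   // what was actually added
      float t2 = ((b.x - e) + (a.x - (t1 - e))) + a.y + b.y; // rounding error + low parts
      float2 r;
      r.x = t1 + t2;                         // renormalize so |r.y| is tiny
      r.y = t2 - (r.x - t1);
      return r;
    }
*/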

// Global counter used to detect the last block to finish the first reduction
// pass, and a block-local flag broadcasting that outcome to all threads.
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;

/**
   Generic reduction kernel: reads up to five input streams, folds them into
   a per-thread partial sum with the reducer r, optionally writes back up to
   three output streams, then reduces across the block and the grid.
*/
template <int block_size, typename ReduceType, typename ReduceSimpleType,
          typename FloatN, int M, int writeX, int writeY, int writeZ,
          typename InputX, typename InputY, typename InputZ, typename InputW, typename InputV,
          typename OutputX, typename OutputY, typename OutputZ, typename Reducer>
__global__ void reduceKernel(InputX X, InputY Y, InputZ Z, InputW W, InputV V, Reducer r,
                             ReduceType *partial, ReduceType *complete,
                             OutputX XX, OutputY YY, OutputZ ZZ, int length) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  // Grid-stride loop: each thread folds every gridSize-th site into its sum.
  ReduceType sum;
  zero(sum);
  while (i < length) {
    FloatN x[M], y[M], z[M], w[M], v[M];
    X.load(x, i);
    Y.load(y, i);
    Z.load(z, i);
    W.load(w, i);
    V.load(v, i);
#pragma unroll
    for (int j=0; j<M; j++) r(sum, x[j], y[j], z[j], w[j], v[j]);

    if (writeX) XX.save(x, i);
    if (writeY) YY.save(y, i);
    if (writeZ) ZZ.save(z, i);

    i += gridSize;
  }

  // Block reduction: threads outside the first warp publish their partial
  // sums to shared memory; the first warp keeps its sums in registers.
  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;
  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
  __syncthreads();

  // now reduce using the first warp only
  if (tid < warpSize) {
    // Warp raking: each first-warp thread accumulates the partial sums of
    // the corresponding lanes of all the other warps.
#pragma unroll
    for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

    // Intra-warp reduction: a synchronization-free tree reduction through a
    // volatile pointer, relying on warp-synchronous execution.
    volatile ReduceSimpleType *sv = s;
    copytoshared(sv, 0, sum, block_size);

    if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
    if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
    if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
    if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
    if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

    // Generic warpSize-agnostic reduction: the open64 front end cannot
    // unroll this loop, only nvvm can, hence the explicit ladder above.
    //#pragma unroll
    //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
  }

  // write the result for this block to global memory
  if (tid == 0) {
    ReduceType tmp;
    copyfromshared(tmp, s, 0, block_size);
    partial[blockIdx.x] = tmp;

    __threadfence(); // make the partial result visible to all other blocks

    // increment the global block counter; atomicInc returns the old value
    unsigned int value = atomicInc(&count, gridDim.x);

    // determine whether this block is the last one to finish
    isLastBlockDone = (value == (gridDim.x-1));
  }

  __syncthreads();

  // The last block to finish reduces the per-block partial sums.
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;

    ReduceType sum;
    zero(sum);
    while (i < gridDim.x) {
      sum += partial[i];
      i += block_size;
    }

    extern __shared__ ReduceSimpleType sdata[];
    ReduceSimpleType *s = sdata + tid;
    if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
    __syncthreads();

    // now reduce using the first warp only
    if (tid < warpSize) {
      // Warp raking
#pragma unroll
      for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

      // Intra-warp reduction
      volatile ReduceSimpleType *sv = s;
      copytoshared(sv, 0, sum, block_size);

      if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
      if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
      if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
      if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
      if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

      //#pragma unroll
      //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
    }

    // write out the final reduced value and reset the counter for reuse
    if (threadIdx.x == 0) {
      ReduceType tmp;
      copyfromshared(tmp, s, 0, block_size);
      complete[0] = tmp;
      count = 0;
    }
  }

}
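/*
  The tail of reduceKernel implements the single-launch "threadfence
  reduction" pattern (cf. the CUDA SDK threadFenceReduction sample): every
  block writes its partial sum, and exactly one block, the last to pass the
  atomic counter, performs the second reduction pass.  The guard relies on
  the semantics of atomicInc, which atomically performs

    old = count; count = (old >= gridDim.x) ? 0 : old + 1; return old;

  so the block that observes old == gridDim.x - 1 knows that, thanks to the
  preceding __threadfence(), every other block's entry in partial[] is
  already visible to it.
*/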
/**
   Launch the reduction kernel for the block size selected by the autotuner,
   then return the completed reduction to the host.
*/
template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, int writeX, int writeY, int writeZ,
          typename InputX, typename InputY, typename InputZ, typename InputW, typename InputV,
          typename Reducer, typename OutputX, typename OutputY, typename OutputZ>
doubleN reduceLaunch(InputX X, InputY Y, InputZ Z, InputW W, InputV V, Reducer r,
                     OutputX XX, OutputY YY, OutputZ ZZ, int length, const TuneParam &tp,
                     const cudaStream_t &stream) {
  ReduceType *partial = (ReduceType*)d_reduce;
  ReduceType *complete = (ReduceType*)hd_reduce;

  if (tp.grid.x > REDUCE_MAX_BLOCKS)
    errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, REDUCE_MAX_BLOCKS);

  // block_size is a compile-time template parameter, so each supported block
  // size requires its own explicit instantiation.
  if (tp.block.x == 32) {
    reduceKernel<32,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 64) {
    reduceKernel<64,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 96) {
    reduceKernel<96,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 128) {
    reduceKernel<128,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 160) {
    reduceKernel<160,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 192) {
    reduceKernel<192,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 224) {
    reduceKernel<224,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 256) {
    reduceKernel<256,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 288) {
    reduceKernel<288,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 320) {
    reduceKernel<320,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 352) {
    reduceKernel<352,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 384) {
    reduceKernel<384,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 416) {
    reduceKernel<416,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 448) {
    reduceKernel<448,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 480) {
    reduceKernel<480,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 512) {
    reduceKernel<512,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } /* else if (tp.block.x == 544) {
    reduceKernel<544,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 576) {
    reduceKernel<576,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 608) {
    reduceKernel<608,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 640) {
    reduceKernel<640,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 672) {
    reduceKernel<672,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 704) {
    reduceKernel<704,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 736) {
    reduceKernel<736,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 768) {
    reduceKernel<768,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 800) {
    reduceKernel<800,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 832) {
    reduceKernel<832,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 864) {
    reduceKernel<864,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 896) {
    reduceKernel<896,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 928) {
    reduceKernel<928,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 960) {
    reduceKernel<960,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 992) {
    reduceKernel<992,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 1024) {
    reduceKernel<1024,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } */ else {
    errorQuda("Reduction not implemented for %d threads", tp.block.x);
  }

  // If the device can map host memory, the kernel wrote the result directly
  // to pinned host memory, so just spin until it is visible; otherwise copy
  // it back explicitly.
  if (deviceProp.canMapHostMemory) {
    cudaEventRecord(reduceEnd, stream);
    while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
  } else {
    cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost);
  }

  doubleN cpu_sum;
  zero(cpu_sum);
  cpu_sum += ((ReduceType*)h_reduce)[0];

  // accumulate the result across nodes when running on multiple GPUs
  const int Nreduce = sizeof(doubleN) / sizeof(double);
  reduceDoubleArray((double*)&cpu_sum, Nreduce);

  return cpu_sum;
}
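/*
  The if/else ladder above is verbose because block_size must be a
  compile-time constant: it sizes the raking loop and the shared-memory
  indexing inside reduceKernel.  One way the same dispatch could be expressed
  more compactly, purely illustrative (REDUCE_CASE is not a QUDA macro):

    #define REDUCE_CASE(sz)                                                      \
      if (tp.block.x == sz)                                                      \
        reduceKernel<sz,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ> \
          <<< tp.grid, tp.block, tp.shared_bytes, stream >>>                     \
            (X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);

  invoked once per supported size.  The cases from 544 to 1024 are commented
  out, presumably because such block sizes were not profitable (or not
  supported) on the hardware targeted by this release.
*/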


template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, int writeX, int writeY, int writeZ,
          typename InputX, typename InputY, typename InputZ, typename InputW, typename InputV,
          typename Reducer, typename OutputX, typename OutputY, typename OutputZ>
class ReduceCuda : public Tunable {

private:
  InputX &X;
  InputY &Y;
  InputZ &Z;
  InputW &W;
  InputV &V;
  OutputX &XX;
  OutputY &YY;
  OutputZ &ZZ;
  Reducer &r;
  const int length;
  doubleN &result;

  // host pointers used for backing up fields when tuning; these can't be
  // curried into the Spinors because of the Tesla argument-length restriction
  char *X_h, *Y_h, *Z_h;
  char *Xnorm_h, *Ynorm_h, *Znorm_h;

  int sharedBytesPerThread() const { return sizeof(ReduceType); }

  // when there is only one warp per block, we need to allocate two warps'
  // worth of shared memory so that we don't index shared memory out of bounds
  int sharedBytesPerBlock() const {
    int warpSize = 32; // FIXME - use device property query
    return 2*warpSize*sizeof(ReduceType);
  }

  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get the next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock() ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock();
    return false;
  }

public:
  ReduceCuda(doubleN &result, InputX &X, InputY &Y, InputZ &Z, InputW &W, InputV &V, Reducer &r,
             OutputX &XX, OutputY &YY, OutputZ &ZZ, int length) :
  result(result), X(X), Y(Y), Z(Z), W(W), V(V), r(r), XX(XX), YY(YY), ZZ(ZZ), length(length)
    { ; }
  virtual ~ReduceCuda() { }

  TuneKey tuneKey() const {
    std::stringstream vol, aux;
    vol << blasConstants.x[0] << "x";
    vol << blasConstants.x[1] << "x";
    vol << blasConstants.x[2] << "x";
    vol << blasConstants.x[3];
    aux << "stride=" << blasConstants.stride << ",prec=" << XX.Precision();
    return TuneKey(vol.str(), typeid(r).name(), aux.str());
  }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, blasTuning, verbosity);
    result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      (X, Y, Z, W, V, r, XX, YY, ZZ, length, tp, stream);
  }

  // Back up any output fields before tuning, since trial launches with
  // writeX/writeY/writeZ enabled overwrite them.
  void preTune() {
    size_t bytes = XX.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*XX.Stride();
    size_t norm_bytes = (XX.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*length : 0;
    if (writeX) XX.save(&X_h, &Xnorm_h, bytes, norm_bytes);
    if (writeY) YY.save(&Y_h, &Ynorm_h, bytes, norm_bytes);
    if (writeZ) ZZ.save(&Z_h, &Znorm_h, bytes, norm_bytes);
  }

  // Restore the output fields once tuning is complete.
  void postTune() {
    size_t bytes = XX.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*XX.Stride();
    size_t norm_bytes = (XX.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*length : 0;
    if (writeX) XX.load(&X_h, &Xnorm_h, bytes, norm_bytes);
    if (writeY) YY.load(&Y_h, &Ynorm_h, bytes, norm_bytes);
    if (writeZ) ZZ.load(&Z_h, &Znorm_h, bytes, norm_bytes);
  }

  long long flops() const { return r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*length*M; }
  long long bytes() const {
    size_t bytes = XX.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
    if (XX.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
    return r.streams()*bytes*length; }
};
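/*
  Note on the tuning lifecycle: tuneLaunch() (provided with the Tunable base
  class) presumably calls preTune(), times apply() over the candidate launch
  configurations generated by advanceBlockDim()/advanceSharedBytes(), and
  then calls postTune() before settling on the best TuneParam.  That is why
  preTune()/postTune() only need to protect the fields this kernel writes.
*/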


/**
   Front-end entry point for reductions over color-spinor fields: set up the
   precision-specific spinor accessors and reducer, then run the reduction
   through the autotuner.
*/
template <typename doubleN, typename ReduceType, typename ReduceSimpleType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer,
          int writeX, int writeY, int writeZ>
doubleN reduceCuda(const int kernel, const double2 &a, const double2 &b, cudaColorSpinorField &x,
                   cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
                   cudaColorSpinorField &v) {
  // A full field reduces as the sum over its even and odd parity halves.
  if (x.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
    doubleN even =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,writeY,writeZ>
      (kernel, a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
    doubleN odd =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,writeY,writeZ>
      (kernel, a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
    return even + odd;
  }

  checkSpinor(x, y);
  checkSpinor(x, z);
  checkSpinor(x, w);
  checkSpinor(x, v);

  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = x.X()[d];
  blasConstants.stride = x.Stride();

  doubleN value;

  Tunable *reduce = 0;
  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
    const int M = 1; // determines how much work per thread to do
    // only bind a separate texture when the field is distinct from x
    SpinorTexture<double2,double2,double2,M,0> xTex(x);
    SpinorTexture<double2,double2,double2,M,1> yTex;
    if (x.V() != y.V()) yTex = SpinorTexture<double2,double2,double2,M,1>(y);
    SpinorTexture<double2,double2,double2,M,2> zTex;
    if (x.V() != z.V()) zTex = SpinorTexture<double2,double2,double2,M,2>(z);
    Spinor<double2,double2,double2,M> X(x);
    Spinor<double2,double2,double2,M> Y(y);
    Spinor<double2,double2,double2,M> Z(z);
    Spinor<double2,double2,double2,M> W(w);
    Spinor<double2,double2,double2,M> V(v);
    Reducer<ReduceType, double2, double2> r(a,b);
    reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,writeX,writeY,writeZ,
      SpinorTexture<double2,double2,double2,M,0>, SpinorTexture<double2,double2,double2,M,1>,
      SpinorTexture<double2,double2,double2,M,2>, Spinor<double2,double2,double2,M>,
      Spinor<double2,double2,double2,M>, Reducer<ReduceType, double2, double2>,
      Spinor<double2,double2,double2,M>, Spinor<double2,double2,double2,M>, Spinor<double2,double2,double2,M> >
      (value, xTex, yTex, zTex, W, V, r, X, Y, Z, x.Length()/(2*M));
  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
    const int M = 1;
    SpinorTexture<float4,float4,float4,M,0> xTex(x);
    SpinorTexture<float4,float4,float4,M,1> yTex;
    if (x.V() != y.V()) yTex = SpinorTexture<float4,float4,float4,M,1>(y);
    SpinorTexture<float4,float4,float4,M,2> zTex;
    if (x.V() != z.V()) zTex = SpinorTexture<float4,float4,float4,M,2>(z);
    SpinorTexture<float4,float4,float4,M,3> wTex;
    if (x.V() != w.V()) wTex = SpinorTexture<float4,float4,float4,M,3>(w);
    SpinorTexture<float4,float4,float4,M,4> vTex;
    if (x.V() != v.V()) vTex = SpinorTexture<float4,float4,float4,M,4>(v);
    Spinor<float4,float4,float4,M> X(x);
    Spinor<float4,float4,float4,M> Y(y);
    Spinor<float4,float4,float4,M> Z(z);
    Spinor<float4,float4,float4,M> W(w);
    Spinor<float4,float4,float4,M> V(v);
    Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
    reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,M,writeX,writeY,writeZ,
      SpinorTexture<float4,float4,float4,M,0>, SpinorTexture<float4,float4,float4,M,1>,
      SpinorTexture<float4,float4,float4,M,2>, SpinorTexture<float4,float4,float4,M,3>,
      SpinorTexture<float4,float4,float4,M,4>, Reducer<ReduceType, float2, float4>,
      Spinor<float4,float4,float4,M>, Spinor<float4,float4,float4,M>, Spinor<float4,float4,float4,M> >
      (value, xTex, yTex, zTex, wTex, vTex, r, X, Y, Z, x.Length()/(4*M));
  } else { // half precision
    if (x.Nspin() == 4) { // Wilson
      SpinorTexture<float4,float4,short4,6,0> xTex(x);
      SpinorTexture<float4,float4,short4,6,1> yTex;
      if (x.V() != y.V()) yTex = SpinorTexture<float4,float4,short4,6,1>(y);
      SpinorTexture<float4,float4,short4,6,2> zTex;
      if (x.V() != z.V()) zTex = SpinorTexture<float4,float4,short4,6,2>(z);
      SpinorTexture<float4,float4,short4,6,3> wTex;
      if (x.V() != w.V()) wTex = SpinorTexture<float4,float4,short4,6,3>(w);
      SpinorTexture<float4,float4,short4,6,4> vTex;
      if (x.V() != v.V()) vTex = SpinorTexture<float4,float4,short4,6,4>(v);
      Spinor<float4,float4,short4,6> xOut(x);
      Spinor<float4,float4,short4,6> yOut(y);
      Spinor<float4,float4,short4,6> zOut(z);
      Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,writeX,writeY,writeZ,
        SpinorTexture<float4,float4,short4,6,0>, SpinorTexture<float4,float4,short4,6,1>,
        SpinorTexture<float4,float4,short4,6,2>, SpinorTexture<float4,float4,short4,6,3>,
        SpinorTexture<float4,float4,short4,6,4>, Reducer<ReduceType, float2, float4>,
        Spinor<float4,float4,short4,6>, Spinor<float4,float4,short4,6>, Spinor<float4,float4,short4,6> >
        (value, xTex, yTex, zTex, wTex, vTex, r, xOut, yOut, zOut, y.Volume());
    } else if (x.Nspin() == 1) { // staggered
      SpinorTexture<float2,float2,short2,3,0> xTex(x);
      SpinorTexture<float2,float2,short2,3,1> yTex;
      if (x.V() != y.V()) yTex = SpinorTexture<float2,float2,short2,3,1>(y);
      SpinorTexture<float2,float2,short2,3,2> zTex;
      if (x.V() != z.V()) zTex = SpinorTexture<float2,float2,short2,3,2>(z);
      SpinorTexture<float2,float2,short2,3,3> wTex;
      if (x.V() != w.V()) wTex = SpinorTexture<float2,float2,short2,3,3>(w);
      SpinorTexture<float2,float2,short2,3,4> vTex;
      if (x.V() != v.V()) vTex = SpinorTexture<float2,float2,short2,3,4>(v);
      Spinor<float2,float2,short2,3> xOut(x);
      Spinor<float2,float2,short2,3> yOut(y);
      Spinor<float2,float2,short2,3> zOut(z);
      Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,writeX,writeY,writeZ,
        SpinorTexture<float2,float2,short2,3,0>, SpinorTexture<float2,float2,short2,3,1>,
        SpinorTexture<float2,float2,short2,3,2>, SpinorTexture<float2,float2,short2,3,3>,
        SpinorTexture<float2,float2,short2,3,4>, Reducer<ReduceType, float2, float2>,
        Spinor<float2,float2,short2,3>, Spinor<float2,float2,short2,3>, Spinor<float2,float2,short2,3> >
        (value, xTex, yTex, zTex, wTex, vTex, r, xOut, yOut, zOut, y.Volume());
    } else { errorQuda("nSpin=%d is not supported", x.Nspin()); }
    quda::blas_bytes += Reducer<ReduceType,double2,double2>::streams()*x.Volume()*sizeof(float);
  }
  quda::blas_bytes += Reducer<ReduceType,double2,double2>::streams()*x.RealLength()*x.Precision();
  quda::blas_flops += Reducer<ReduceType,double2,double2>::flops()*x.RealLength();

  reduce->apply(*blasStream);
  delete reduce;

  checkCudaError();

  return value;
}

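/*
  For orientation, a reducer passed to reduceCuda is a functor template over
  (ReduceType, Float2, FloatN) whose operator() folds one vector element from
  each stream into the running sum.  A hypothetical squared-norm reducer,
  written for the double2 path and only for illustration (the names below are
  not QUDA's; the actual reducers are defined elsewhere in the QUDA sources):

    template <typename ReduceType, typename Float2, typename FloatN>
    struct Norm2Like {
      Norm2Like(const Float2 &a, const Float2 &b) { ; } // coefficients unused here
      __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
                                 FloatN &z, FloatN &w, FloatN &v)
      { sum += (ReduceType)x.x*x.x + (ReduceType)x.y*x.y; }
      static int streams() { return 1; } // number of fields read or written
      static int flops() { return 2; }   // flops per real number
    };

  which might be invoked along the lines of

    double nrm2 = reduceCuda<double,double,double,Norm2Like,0,0,0>
      (kernel, a, b, x, x, x, x, x);

  with the unused streams aliased to x so that no extra textures are bound.
*/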