QUDA v0.4.0
A library for QCD on GPUs
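The code below is QUDA's generic reduction support. The overloads of zero(), copytoshared(), copyfromshared() and add() let one kernel template accumulate into a plain double, a double2 or a double3. On devices of compute capability below 1.3, which lack double-precision hardware, the same overloads are provided for the doublesingle types, which carry a value as a pair of floats. For orientation only, a float-float ("double-single") addition of that kind is the classic two-sum; the helper below is a hypothetical sketch and not QUDA's doublesingle implementation, which is defined elsewhere in the library.

// Hypothetical sketch of "double-single" addition: a value is stored as a
// float2 (high word, low word) and added with an error-compensated two-sum.
// A production version would use __fadd_rn etc. to stop the compiler from
// re-associating or contracting the arithmetic.
__device__ float2 dsadd(float2 a, float2 b)
{
  float t1 = a.x + b.x;                                   // high-order sum
  float e  = t1 - a.x;
  float t2 = ((b.x - e) + (a.x - (t1 - e))) + a.y + b.y;  // rounding error plus low words
  float2 c;
  c.x = t1 + t2;                                          // renormalize into (hi, lo)
  c.y = t2 - (c.x - t1);
  return c;
}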
__host__ __device__ void zero(double &x) { x = 0.0; }
__host__ __device__ void zero(double2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(double3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }

__device__ void copytoshared(double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(volatile double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(volatile double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }

__device__ void copyfromshared(double &x, const double *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(double2 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(double3 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceType &sum, ReduceSimpleType *s, const int i, const int block) { }
template<> __device__ void add<double,double>(double &sum, double *s, const int i, const int block)
{ sum += s[i]; }
template<> __device__ void add<double2,double>(double2 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<double3,double>(double3 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceSimpleType *s, const int i, const int j, const int block) { }
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(volatile ReduceSimpleType *s, const int i, const int j, const int block) { }

template<> __device__ void add<double,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }
template<> __device__ void add<double,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<double2,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<double2,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<double3,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<double3,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }

#if (__COMPUTE_CAPABILITY__ < 130)
__host__ __device__ void zero(doublesingle &x) { x = 0.0; }
__host__ __device__ void zero(doublesingle2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(doublesingle3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }

__device__ void copytoshared(doublesingle *s, const int i, const doublesingle x, const int block) { s[i] = x; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle x, const int block)
{ s[i].a.x = x.a.x; s[i].a.y = x.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y;
  s[i+2*block].a.x = x.z.a.x; s[i+2*block].a.y = x.z.a.y; }

__device__ void copyfromshared(doublesingle &x, const doublesingle *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(doublesingle2 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(doublesingle3 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

template<> __device__ void add<doublesingle,doublesingle>(doublesingle &sum, doublesingle *s, const int i, const int block)
{ sum += s[i]; }
template<> __device__ void add<doublesingle2,doublesingle>(doublesingle2 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<doublesingle3,doublesingle>(doublesingle3 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

template<> __device__ void add<doublesingle,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; }
template<> __device__ void add<doublesingle,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<doublesingle2,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<doublesingle2,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<doublesingle3,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<doublesingle3,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
#endif

__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;

template <int block_size, typename ReduceType, typename ReduceSimpleType,
          typename FloatN, int M, int writeX, int writeY, int writeZ,
          typename InputX, typename InputY, typename InputZ, typename InputW, typename InputV,
          typename OutputX, typename OutputY, typename OutputZ, typename Reducer>
__global__ void reduceKernel(InputX X, InputY Y, InputZ Z, InputW W, InputV V, Reducer r,
                             ReduceType *partial, ReduceType *complete,
                             OutputX XX, OutputY YY, OutputZ ZZ, int length) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  ReduceType sum;
  zero(sum);
  while (i < length) {
    FloatN x[M], y[M], z[M], w[M], v[M];
    X.load(x, i);
    Y.load(y, i);
    Z.load(z, i);
    W.load(w, i);
    V.load(v, i);
#pragma unroll
    for (int j=0; j<M; j++) r(sum, x[j], y[j], z[j], w[j], v[j]);

    if (writeX) XX.save(x, i);
    if (writeY) YY.save(y, i);
    if (writeZ) ZZ.save(z, i);

    i += gridSize;
  }

  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;
  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
  __syncthreads();

  // now reduce using the first warp only
  if (tid < warpSize) {
    // Warp raking
#pragma unroll
    for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

    // Intra-warp reduction
    volatile ReduceSimpleType *sv = s;
    copytoshared(sv, 0, sum, block_size);

    if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
    if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
    if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
    if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
    if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

    // warpSize generic warp reduction - open64 can't handle it, only nvvm
    //#pragma unroll
    //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
  }

  // write result for this block to global mem
  if (tid == 0) {
    ReduceType tmp;
    copyfromshared(tmp, s, 0, block_size);
    partial[blockIdx.x] = tmp;

    __threadfence(); // flush result

    // increment global block counter
    unsigned int value = atomicInc(&count, gridDim.x);

    // Determine if this block is the last block to be done
    isLastBlockDone = (value == (gridDim.x-1));
  }

  __syncthreads();

  // Finish the reduction if last block
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;

    ReduceType sum;
    zero(sum);
    while (i < gridDim.x) {
      sum += partial[i];
      i += block_size;
    }

    extern __shared__ ReduceSimpleType sdata[];
    ReduceSimpleType *s = sdata + tid;
    if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
    __syncthreads();

    // now reduce using the first warp only
    if (tid < warpSize) {
      // Warp raking
#pragma unroll
      for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

      // Intra-warp reduction
      volatile ReduceSimpleType *sv = s;
      copytoshared(sv, 0, sum, block_size);

      if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
      if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
      if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
      if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
      if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

      //#pragma unroll
      //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
    }

    // write out the final reduced value
    if (threadIdx.x == 0) {
      ReduceType tmp;
      copyfromshared(tmp, s, 0, block_size);
      complete[0] = tmp;
      count = 0;
    }
  }
}
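reduceKernel above performs the whole reduction in a single launch: each thread accumulates a grid-stride range of elements through the Reducer functor, each block folds its threads' partial sums in shared memory (warp raking followed by an intra-warp tree), writes one per-block partial to global memory, and the last block to finish, detected via the atomically incremented counter and the __threadfence(), sums the partials into complete[0]. The writeX/writeY/writeZ template flags select which of the possibly modified fields are written back through XX/YY/ZZ. The only contract placed on the Reducer r is an operator() that folds one element from each input field into the running sum, plus flops() and streams() used later for performance accounting. A minimal functor in that style is sketched below; it is a hypothetical squared-norm reducer, not one of QUDA's actual functors, and assumes a double accumulator with a double2 element type.

// Hypothetical reducer sketch matching the calls made above:
// r(sum, x[j], y[j], z[j], w[j], v[j]) in the kernel, r.flops()/r.streams()
// in ReduceCuda further down.
template <typename ReduceType, typename Float2, typename FloatN>
struct Norm2Sketch {
  Norm2Sketch(const Float2 &a, const Float2 &b) { ; }   // scalars unused for a norm
  __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
                             FloatN &z, FloatN &w, FloatN &v)
  { sum += x.x*x.x + x.y*x.y; }                          // accumulate |x|^2, element by element
  static int streams() { return 1; }                     // reads one field, writes none
  static int flops() { return 2; }                       // per real number
};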
template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, int writeX, int writeY, int writeZ,
          typename InputX, typename InputY, typename InputZ, typename InputW, typename InputV,
          typename Reducer, typename OutputX, typename OutputY, typename OutputZ>
doubleN reduceLaunch(InputX X, InputY Y, InputZ Z, InputW W, InputV V, Reducer r,
                     OutputX XX, OutputY YY, OutputZ ZZ, int length, const TuneParam &tp,
                     const cudaStream_t &stream) {
  ReduceType *partial = (ReduceType*)d_reduce;
  ReduceType *complete = (ReduceType*)hd_reduce;

  if (tp.grid.x > REDUCE_MAX_BLOCKS)
    errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, REDUCE_MAX_BLOCKS);

  if (tp.block.x == 32) {
    reduceKernel<32,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 64) {
    reduceKernel<64,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 96) {
    reduceKernel<96,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 128) {
    reduceKernel<128,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 160) {
    reduceKernel<160,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 192) {
    reduceKernel<192,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 224) {
    reduceKernel<224,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 256) {
    reduceKernel<256,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 288) {
    reduceKernel<288,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 320) {
    reduceKernel<320,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 352) {
    reduceKernel<352,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 384) {
    reduceKernel<384,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 416) {
    reduceKernel<416,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 448) {
    reduceKernel<448,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 480) {
    reduceKernel<480,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 512) {
    reduceKernel<512,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } /*else if (tp.block.x == 544) {
    reduceKernel<544,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 576) {
    reduceKernel<576,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 608) {
    reduceKernel<608,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 640) {
    reduceKernel<640,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 672) {
    reduceKernel<672,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 704) {
    reduceKernel<704,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 736) {
    reduceKernel<736,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 768) {
    reduceKernel<768,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 800) {
    reduceKernel<800,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 832) {
    reduceKernel<832,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 864) {
    reduceKernel<864,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 896) {
    reduceKernel<896,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 928) {
    reduceKernel<928,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 960) {
    reduceKernel<960,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 992) {
    reduceKernel<992,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } else if (tp.block.x == 1024) {
    reduceKernel<1024,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, partial, complete, XX, YY, ZZ, length);
  } */ else {
    errorQuda("Reduction not implemented for %d threads", tp.block.x);
  }

  if (deviceProp.canMapHostMemory) {
    cudaEventRecord(reduceEnd, stream);
    while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
  } else {
    cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost);
  }

  doubleN cpu_sum;
  zero(cpu_sum);
  cpu_sum += ((ReduceType*)h_reduce)[0];

  const int Nreduce = sizeof(doubleN) / sizeof(double);
  reduceDoubleArray((double*)&cpu_sum, Nreduce);

  return cpu_sum;
}
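reduceLaunch dispatches on tp.block.x so that block_size is a compile-time constant inside reduceKernel, which lets the raking loop be unrolled and the unused steps of the shared-memory tree be removed. After the launch the result reaches the host either by polling an event while the kernel writes complete[0] through mapped (zero-copy) host memory, or, if the device cannot map host memory, by an explicit cudaMemcpy from hd_reduce. The mapped-memory path is the standard CUDA zero-copy pattern; the standalone sketch below is illustrative only, with hypothetical buffer and event names, and is not QUDA's actual initialization code, which lives in the blas setup routines.

// Zero-copy readback sketch: the device writes through a mapped host buffer
// and the host only polls an event, avoiding a cudaMemcpy on the result.
// Requires cudaSetDeviceFlags(cudaDeviceMapHost) before the context is created.
double *h_buf;   // host view of the result buffer
double *d_buf;   // device pointer aliasing the same memory
cudaHostAlloc((void**)&h_buf, sizeof(double), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&d_buf, h_buf, 0);

cudaEvent_t done;
cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

// ... launch a kernel that writes its final result to *d_buf ...

cudaEventRecord(done, 0);
while (cudaEventQuery(done) != cudaSuccess) { ; }  // busy-wait, as reduceLaunch does
// h_buf[0] now holds the reduced value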
template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, int writeX, int writeY, int writeZ,
          typename InputX, typename InputY, typename InputZ, typename InputW, typename InputV,
          typename Reducer, typename OutputX, typename OutputY, typename OutputZ>
class ReduceCuda : public Tunable {

private:
  InputX &X;
  InputY &Y;
  InputZ &Z;
  InputW &W;
  InputV &V;
  OutputX &XX;
  OutputY &YY;
  OutputZ &ZZ;
  Reducer &r;
  const int length;
  doubleN &result;

  // host pointers used for backing up fields when tuning
  // these can't be curried into the Spinors because of Tesla argument length restriction
  char *X_h, *Y_h, *Z_h;
  char *Xnorm_h, *Ynorm_h, *Znorm_h;

  int sharedBytesPerThread() const { return sizeof(ReduceType); }

  // when there is only one warp per block, we need to allocate two warps
  // worth of shared memory so that we don't index shared memory out of bounds
  int sharedBytesPerBlock() const {
    int warpSize = 32; // FIXME - use device property query
    return 2*warpSize*sizeof(ReduceType);
  }

  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock() ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock();
    return false;
  }

public:
  ReduceCuda(doubleN &result, InputX &X, InputY &Y, InputZ &Z, InputW &W, InputV &V, Reducer &r,
             OutputX &XX, OutputY &YY, OutputZ &ZZ, int length) :
    result(result), X(X), Y(Y), Z(Z), W(W), V(V), r(r), XX(XX), YY(YY), ZZ(ZZ), length(length)
  { ; }
  virtual ~ReduceCuda() { }

  TuneKey tuneKey() const {
    std::stringstream vol, aux;
    vol << blasConstants.x[0] << "x";
    vol << blasConstants.x[1] << "x";
    vol << blasConstants.x[2] << "x";
    vol << blasConstants.x[3];
    aux << "stride=" << blasConstants.stride << ",prec=" << XX.Precision();
    return TuneKey(vol.str(), typeid(r).name(), aux.str());
  }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, blasTuning, verbosity);
    result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M,writeX,writeY,writeZ>
      (X, Y, Z, W, V, r, XX, YY, ZZ, length, tp, stream);
  }

  void preTune() {
    size_t bytes = XX.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*XX.Stride();
    size_t norm_bytes = (XX.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*length : 0;
    if (writeX) XX.save(&X_h, &Xnorm_h, bytes, norm_bytes);
    if (writeY) YY.save(&Y_h, &Ynorm_h, bytes, norm_bytes);
    if (writeZ) ZZ.save(&Z_h, &Znorm_h, bytes, norm_bytes);
  }

  void postTune() {
    size_t bytes = XX.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*XX.Stride();
    size_t norm_bytes = (XX.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*length : 0;
    if (writeX) XX.load(&X_h, &Xnorm_h, bytes, norm_bytes);
    if (writeY) YY.load(&Y_h, &Ynorm_h, bytes, norm_bytes);
    if (writeZ) ZZ.load(&Z_h, &Znorm_h, bytes, norm_bytes);
  }

  long long flops() const { return r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*length*M; }
  long long bytes() const {
    size_t bytes = XX.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
    if (XX.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
    return r.streams()*bytes*length;
  }
};
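ReduceCuda wraps the launch in QUDA's Tunable autotuning interface: tuneLaunch() invokes apply() with candidate launch geometries identified by tuneKey(), and preTune()/postTune() back up and restore any field the kernel overwrites so that repeated trial launches do not corrupt the result. advanceSharedBytes() keeps the dynamic shared-memory request consistent with the candidate block size, never asking for less than two warps' worth of ReduceType so the intra-warp tree cannot index out of bounds when a block holds a single warp. The sizing rule amounts to the small helper below, a hypothetical free-standing restatement of sharedBytesPerThread()/sharedBytesPerBlock(), not code from this listing.

// Shared-memory request used while tuning: one ReduceType slot per thread,
// with a two-warp floor as padding for single-warp blocks.
size_t reduceSharedBytes(unsigned int block_threads, size_t reduce_type_size)
{
  const unsigned int warpSize = 32;
  size_t per_thread = block_threads * reduce_type_size;
  size_t minimum    = 2 * warpSize * reduce_type_size;
  return per_thread > minimum ? per_thread : minimum;
}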
template <typename doubleN, typename ReduceType, typename ReduceSimpleType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer,
          int writeX, int writeY, int writeZ>
doubleN reduceCuda(const int kernel, const double2 &a, const double2 &b, cudaColorSpinorField &x,
                   cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
                   cudaColorSpinorField &v) {
  if (x.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
    doubleN even =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,writeY,writeZ>
      (kernel, a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
    doubleN odd =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,writeY,writeZ>
      (kernel, a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
    return even + odd;
  }

  checkSpinor(x, y);
  checkSpinor(x, z);
  checkSpinor(x, w);
  checkSpinor(x, v);

  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = x.X()[d];
  blasConstants.stride = x.Stride();

  doubleN value;

  Tunable *reduce = 0;
  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
    const int M = 1; // determines how much work per thread to do
    SpinorTexture<double2,double2,double2,M,0> xTex(x);
    SpinorTexture<double2,double2,double2,M,1> yTex;
    if (x.V() != y.V()) yTex = SpinorTexture<double2,double2,double2,M,1>(y);
    SpinorTexture<double2,double2,double2,M,2> zTex;
    if (x.V() != z.V()) zTex = SpinorTexture<double2,double2,double2,M,2>(z);
    Spinor<double2,double2,double2,M> X(x);
    Spinor<double2,double2,double2,M> Y(y);
    Spinor<double2,double2,double2,M> Z(z);
    Spinor<double2,double2,double2,M> W(w);
    Spinor<double2,double2,double2,M> V(v);
    Reducer<ReduceType, double2, double2> r(a,b);
    reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,writeX,writeY,writeZ,
      SpinorTexture<double2,double2,double2,M,0>, SpinorTexture<double2,double2,double2,M,1>,
      SpinorTexture<double2,double2,double2,M,2>, Spinor<double2,double2,double2,M>,
      Spinor<double2,double2,double2,M>, Reducer<ReduceType, double2, double2>,
      Spinor<double2,double2,double2,M>, Spinor<double2,double2,double2,M>, Spinor<double2,double2,double2,M> >
      (value, xTex, yTex, zTex, W, V, r, X, Y, Z, x.Length()/(2*M));
  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
    const int M = 1;
    SpinorTexture<float4,float4,float4,M,0> xTex(x);
    SpinorTexture<float4,float4,float4,M,1> yTex;
    if (x.V() != y.V()) yTex = SpinorTexture<float4,float4,float4,M,1>(y);
    SpinorTexture<float4,float4,float4,M,2> zTex;
    if (x.V() != z.V()) zTex = SpinorTexture<float4,float4,float4,M,2>(z);
    SpinorTexture<float4,float4,float4,M,3> wTex;
    if (x.V() != w.V()) wTex = SpinorTexture<float4,float4,float4,M,3>(w);
    SpinorTexture<float4,float4,float4,M,4> vTex;
    if (x.V() != v.V()) vTex = SpinorTexture<float4,float4,float4,M,4>(v);
    Spinor<float4,float4,float4,M> X(x);
    Spinor<float4,float4,float4,M> Y(y);
    Spinor<float4,float4,float4,M> Z(z);
    Spinor<float4,float4,float4,M> W(w);
    Spinor<float4,float4,float4,M> V(v);
    Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
    reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,M,writeX,writeY,writeZ,
      SpinorTexture<float4,float4,float4,M,0>, SpinorTexture<float4,float4,float4,M,1>,
      SpinorTexture<float4,float4,float4,M,2>, SpinorTexture<float4,float4,float4,M,3>,
      SpinorTexture<float4,float4,float4,M,4>, Reducer<ReduceType, float2, float4>,
      Spinor<float4,float4,float4,M>, Spinor<float4,float4,float4,M>, Spinor<float4,float4,float4,M> >
      (value, xTex, yTex, zTex, wTex, vTex, r, X, Y, Z, x.Length()/(4*M));
  } else {
    if (x.Nspin() == 4) { // wilson
      SpinorTexture<float4,float4,short4,6,0> xTex(x);
      SpinorTexture<float4,float4,short4,6,1> yTex;
      if (x.V() != y.V()) yTex = SpinorTexture<float4,float4,short4,6,1>(y);
      SpinorTexture<float4,float4,short4,6,2> zTex;
      if (x.V() != z.V()) zTex = SpinorTexture<float4,float4,short4,6,2>(z);
      SpinorTexture<float4,float4,short4,6,3> wTex;
      if (x.V() != w.V()) wTex = SpinorTexture<float4,float4,short4,6,3>(w);
      SpinorTexture<float4,float4,short4,6,4> vTex;
      if (x.V() != v.V()) vTex = SpinorTexture<float4,float4,short4,6,4>(v);
      Spinor<float4,float4,short4,6> xOut(x);
      Spinor<float4,float4,short4,6> yOut(y);
      Spinor<float4,float4,short4,6> zOut(z);
      Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,writeX,writeY,writeZ,
        SpinorTexture<float4,float4,short4,6,0>, SpinorTexture<float4,float4,short4,6,1>,
        SpinorTexture<float4,float4,short4,6,2>, SpinorTexture<float4,float4,short4,6,3>,
        SpinorTexture<float4,float4,short4,6,4>, Reducer<ReduceType, float2, float4>,
        Spinor<float4,float4,short4,6>, Spinor<float4,float4,short4,6>, Spinor<float4,float4,short4,6> >
        (value, xTex, yTex, zTex, wTex, vTex, r, xOut, yOut, zOut, y.Volume());
    } else if (x.Nspin() == 1) { // staggered
      SpinorTexture<float2,float2,short2,3,0> xTex(x);
      SpinorTexture<float2,float2,short2,3,1> yTex;
      if (x.V() != y.V()) yTex = SpinorTexture<float2,float2,short2,3,1>(y);
      SpinorTexture<float2,float2,short2,3,2> zTex;
      if (x.V() != z.V()) zTex = SpinorTexture<float2,float2,short2,3,2>(z);
      SpinorTexture<float2,float2,short2,3,3> wTex;
      if (x.V() != w.V()) wTex = SpinorTexture<float2,float2,short2,3,3>(w);
      SpinorTexture<float2,float2,short2,3,4> vTex;
      if (x.V() != v.V()) vTex = SpinorTexture<float2,float2,short2,3,4>(v);
      Spinor<float2,float2,short2,3> xOut(x);
      Spinor<float2,float2,short2,3> yOut(y);
      Spinor<float2,float2,short2,3> zOut(z);
      Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      reduce = new ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,writeX,writeY,writeZ,
        SpinorTexture<float2,float2,short2,3,0>, SpinorTexture<float2,float2,short2,3,1>,
        SpinorTexture<float2,float2,short2,3,2>, SpinorTexture<float2,float2,short2,3,3>,
        SpinorTexture<float2,float2,short2,3,4>, Reducer<ReduceType, float2, float2>,
        Spinor<float2,float2,short2,3>, Spinor<float2,float2,short2,3>, Spinor<float2,float2,short2,3> >
        (value, xTex, yTex, zTex, wTex, vTex, r, xOut, yOut, zOut, y.Volume());
    } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
    quda::blas_bytes += Reducer<ReduceType,double2,double2>::streams()*x.Volume()*sizeof(float);
  }
  quda::blas_bytes += Reducer<ReduceType,double2,double2>::streams()*x.RealLength()*x.Precision();
  quda::blas_flops += Reducer<ReduceType,double2,double2>::flops()*x.RealLength();

  reduce->apply(*blasStream);
  delete reduce;

  checkCudaError();

  return value;
}
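reduceCuda is the entry point used by the blas-level reduction routines: it recurses over the even and odd parities for a full field, instantiates the precision-specific Spinor/SpinorTexture accessors and the Reducer functor, runs the autotuned kernel, and accumulates the global flop and byte counters. A hypothetical caller is sketched below; it is not taken from this listing, reuses the Norm2Sketch functor from the earlier sketch, and simply passes x for the unused field arguments, which the x.V() != y.V() checks above then leave unbound.

// Hypothetical squared-norm wrapper built on reduceCuda: double accumulator,
// no output writes (writeX = writeY = writeZ = 0), unused scalars set to zero.
double norm2Sketch(cudaColorSpinorField &x)
{
  const double2 zero2 = make_double2(0.0, 0.0);   // a and b are ignored by the functor
  return reduceCuda<double, double, double, Norm2Sketch, 0, 0, 0>
    (0 /* kernel index, forwarded as required by the signature */, zero2, zero2, x, x, x, x, x);
}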