//
// QUDA v0.3.2
// A library for QCD on GPUs
//
#if (REDUCE_TYPE == REDUCE_KAHAN)

// Double-single ("Kahan"-style) accumulation: a value is carried as a
// high/low pair (c0, c1), and dsadd() folds a new pair (a0, a1) into it
// while recovering the bits lost to rounding.
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))

// Partial-reduction kernel, double-single accumulation path.
//
// Each block reduces a grid-stride slice of the n inputs into a per-thread
// (hi, lo) pair, stages the pairs in dynamic shared memory, tree-reduces
// within the block, and writes one partial sum per block to
// g_odata[blockIdx.x]. The host wrapper finishes the reduction on the CPU.
//
// Preconditions (enforced by the host wrapper below):
//   - reduce_threads == blockDim.x, a power of two in 32..1024;
//   - dynamic shared memory holds 2*blockDim.x QudaSumFloats, doubled to
//     4*blockDim.x when blockDim.x <= 32 so the in-warp reads below stay
//     in bounds.
//
// NOTE(review): the extracted listing showed no template header, but the
// host wrapper instantiates Kernel<32>..<1024> and reduce_threads is used
// as a compile-time constant; restored here -- confirm against upstream.
template <unsigned int reduce_threads, typename Float>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  QudaSumFloat acc0 = 0;  // high word of the running sum
  QudaSumFloat acc1 = 0;  // low (compensation) word

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_AUXILIARY(i);
    DSACC(acc0, acc1, REDUCE_OPERATION(i), 0);
    i += gridSize;
  }

  // Thread t owns sdata[2t] (hi) and sdata[2t+1] (lo), so the partner at
  // thread tid+k lives at offset 2k from s.
  extern __shared__ QudaSumFloat sdata[];
  QudaSumFloat *s = sdata + 2*tid;
  s[0] = acc0;
  s[1] = acc1;

  __syncthreads();

  // Tree reduction in shared memory; reduce_threads is a compile-time
  // constant, so untaken branches are eliminated.
  if (reduce_threads >= 1024) { if (tid < 512) { DSACC(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
  if (reduce_threads >= 512) { if (tid < 256) { DSACC(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
  if (reduce_threads >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }

  // Final warp: relies on warp-synchronous execution (pre-Volta
  // assumption; NOTE(review): would need __syncwarp() on SM70+). The
  // volatile pointer forces the shared-memory traffic; EMUSYNC is a no-op
  // except under device emulation.
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
  {
    volatile QudaSumFloat *sv = s;
    if (reduce_threads >= 64) { DSACC(sv[0],sv[1],sv[64+0],sv[64+1]); EMUSYNC; }
    if (reduce_threads >= 32) { DSACC(sv[0],sv[1],sv[32+0],sv[32+1]); EMUSYNC; }
    if (reduce_threads >= 16) { DSACC(sv[0],sv[1],sv[16+0],sv[16+1]); EMUSYNC; }
    if (reduce_threads >= 8) { DSACC(sv[0],sv[1], sv[8+0], sv[8+1]); EMUSYNC; }
    if (reduce_threads >= 4) { DSACC(sv[0],sv[1], sv[4+0], sv[4+1]); EMUSYNC; }
    if (reduce_threads >= 2) { DSACC(sv[0],sv[1], sv[2+0], sv[2+1]); EMUSYNC; }
  }

  // write result for this block to global mem as a single value (hi + lo)
  if (tid == 0) g_odata[blockIdx.x] = sdata[0]+sdata[1];
}

#else // true double
// precision kernel

// Partial-reduction kernel, plain double-precision path.
//
// Each block accumulates a grid-stride slice of the n inputs into one
// QudaSumFloat per thread, tree-reduces the per-thread sums in shared
// memory, and writes one partial result per block to g_odata[blockIdx.x].
//
// Preconditions (enforced by the host wrapper below):
//   - reduce_threads == blockDim.x, a power of two in 32..1024;
//   - dynamic shared memory holds blockDim.x QudaSumFloats, doubled when
//     blockDim.x <= 32 so the in-warp reads below stay in bounds.
//
// NOTE(review): the extracted listing showed no template header, but the
// host wrapper instantiates Kernel<32>..<1024> and reduce_threads is used
// as a compile-time constant; restored here -- confirm against upstream.
template <unsigned int reduce_threads, typename Float>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  QudaSumFloat sum = 0;

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_AUXILIARY(i);
    sum += REDUCE_OPERATION(i);
    i += gridSize;
  }

  extern __shared__ QudaSumFloat sdata[];
  QudaSumFloat *s = sdata + tid;

  s[0] = sum;
  __syncthreads();

  // do reduction in shared mem; reduce_threads is a compile-time constant,
  // so untaken branches are eliminated
  if (reduce_threads >= 1024) { if (tid < 512) { s[0] += s[512]; } __syncthreads(); }
  if (reduce_threads >= 512) { if (tid < 256) { s[0] += s[256]; } __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) { s[0] += s[128]; } __syncthreads(); }
  if (reduce_threads >= 128) { if (tid < 64) { s[0] += s[ 64]; } __syncthreads(); }

  // Final warp: relies on warp-synchronous execution (pre-Volta
  // assumption; NOTE(review): would need __syncwarp() on SM70+). The
  // volatile pointer keeps the compiler from caching shared memory in
  // registers; EMUSYNC is a no-op except under device emulation.
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
  {
    volatile QudaSumFloat *sv = s;
    if (reduce_threads >= 64) { sv[0] += sv[32]; EMUSYNC; }
    if (reduce_threads >= 32) { sv[0] += sv[16]; EMUSYNC; }
    if (reduce_threads >= 16) { sv[0] += sv[ 8]; EMUSYNC; }
    if (reduce_threads >= 8) { sv[0] += sv[ 4]; EMUSYNC; }
    if (reduce_threads >= 4) { sv[0] += sv[ 2]; EMUSYNC; }
    if (reduce_threads >= 2) { sv[0] += sv[ 1]; EMUSYNC; }
  }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x] = s[0];
  }

}

#endif

// Host-side driver: launches the block-reduction kernel, copies the
// per-block partial sums back, and finishes the reduction on the CPU.
//
// Returns the sum over i in [0, n) of REDUCE_OPERATION(i) as a double.
// setBlock() chooses the launch geometry (blasGrid/blasBlock); the grid
// size is capped at REDUCE_MAX_BLOCKS because d_reduceFloat/h_reduceFloat
// hold one QudaSumFloat per block. The kernel is instantiated for each
// supported power-of-two block size, since reduce_threads must be a
// compile-time constant.
template <typename Float>
double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {
  setBlock(kernel, n, precision);

  if (blasGrid.x > REDUCE_MAX_BLOCKS) {
    errorQuda("reduce_core: grid size %d must be smaller than %d", blasGrid.x, REDUCE_MAX_BLOCKS);
  }

  // when there is only one warp per block, we need to allocate two warps
  // worth of shared memory so that we don't index shared memory out of bounds
#if (REDUCE_TYPE == REDUCE_KAHAN)
  // Kahan path stores a (hi, lo) pair per thread, hence the extra factor 2.
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 4 * sizeof(QudaSumFloat) : blasBlock.x * 2 * sizeof(QudaSumFloat);
#else
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 2 * sizeof(QudaSumFloat) : blasBlock.x * sizeof(QudaSumFloat);
#endif

  if (blasBlock.x == 32) {
    REDUCE_FUNC_NAME(Kernel)<32><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 64) {
    REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 128) {
    REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 256) {
    REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 512) {
    REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 1024) {
    REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else {
    errorQuda("Reduction not implemented for %d threads", blasBlock.x);
  }

  // copy result from device to host, and perform final reduction on CPU
  // (blocking cudaMemcpy also synchronizes with the kernel launch above)
  cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);

  // for a tuning run, let blas_test check the error condition
  if (!blasTuning) checkCudaError();

  double cpu_sum = 0;
  for (unsigned int i = 0; i < blasGrid.x; i++) cpu_sum += h_reduceFloat[i];

  return cpu_sum;
}
// (listing-generator footer: 1.7.3)