/*
   QUDA v0.3.2
   A library for QCD on GPUs
*/
/*
   Complex reduction core. This fragment is multiply-included with different
   definitions of the REDUCE_* macros, which the including translation unit
   must provide:

     REDUCE_FUNC_NAME(x)        - produces the concrete kernel/host name
     REDUCE_TYPES               - operand parameter list for kernel and host
     REDUCE_PARAMS              - matching argument list at the launch site
     REDUCE_REAL_AUXILIARY(i),
     REDUCE_IMAG_AUXILIARY(i)   - per-element setup statements
     REDUCE_REAL_OPERATION(i),
     REDUCE_IMAG_OPERATION(i)   - per-element real/imaginary summands

   Each block writes one partial QudaSumComplex to g_odata; the final sum
   over blocks is performed on the host.
*/

#if (REDUCE_TYPE == REDUCE_KAHAN)

// Double-single accumulation helpers: (c0,c1) += (a0,a1), where each pair
// holds the high/low words of an extended-precision value (see dsadd/zcadd).
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
#define ZCACC(c0, c1, a0, a1) zcadd((c0), (c1), (c0), (c1), (a0), (a1))

/**
 * Complex sum-reduction kernel with compensated (double-single) accumulation.
 *
 * Launch contract: blockDim.x == reduce_threads (power of two, <= 1024);
 * dynamic shared memory of 2*reduce_threads QudaSumComplex (4x when
 * reduce_threads <= 32, so the warp-tail's deliberately out-of-range reads
 * stay inside the allocation — see smemSize in the host driver below).
 *
 * NOTE(review): the template head was missing from the extracted listing but
 * is required by the Kernel<32>..<1024> instantiations in the host driver;
 * restored here — confirm against the original header.
 */
template <unsigned int reduce_threads, typename Float, typename Float2>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  // (sum, compensation) pairs: acc0/acc1 accumulate the real part,
  // acc2/acc3 the imaginary part.
  QudaSumFloat acc0 = 0;
  QudaSumFloat acc1 = 0;
  QudaSumFloat acc2 = 0;
  QudaSumFloat acc3 = 0;

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_REAL_AUXILIARY(i);
    REDUCE_IMAG_AUXILIARY(i);
    DSACC(acc0, acc1, REDUCE_REAL_OPERATION(i), 0);
    DSACC(acc2, acc3, REDUCE_IMAG_OPERATION(i), 0);
    i += gridSize;
  }

  // Each thread owns TWO consecutive shared-memory entries: the (sum,
  // compensation) pair, packed real in .x and imaginary in .y.
  extern __shared__ QudaSumComplex cdata[];
  QudaSumComplex *s = cdata + 2*tid;
  s[0].x = acc0;
  s[1].x = acc1;
  s[0].y = acc2;
  s[1].y = acc3;

  __syncthreads();

  // Tree reduction in shared memory. Offsets are twice those of the plain
  // kernel because every thread holds two entries (2 * 512 = 1024, etc.).
  // __syncthreads() sits outside the tid guard, so all threads reach it.
  if (reduce_threads >= 1024) { if (tid < 512) { ZCACC(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
  if (reduce_threads >=  512) { if (tid < 256) { ZCACC(s[0],s[1],s[ 512+0],s[ 512+1]); } __syncthreads(); }
  if (reduce_threads >=  256) { if (tid < 128) { ZCACC(s[0],s[1],s[ 256+0],s[ 256+1]); } __syncthreads(); }
  if (reduce_threads >=  128) { if (tid <  64) { ZCACC(s[0],s[1],s[ 128+0],s[ 128+1]); } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      // Warp-synchronous tail: volatile forces reloads from shared memory;
      // EMUSYNC is a barrier only under device emulation.
      // NOTE(review): relies on implicit intra-warp lockstep — unsafe on
      // Volta+ independent thread scheduling; would need __syncwarp() there.
      volatile QudaSumComplex *sv = s;
      if (reduce_threads >= 64) { ZCACC(sv[0],sv[1],sv[64+0],sv[64+1]); EMUSYNC; }
      if (reduce_threads >= 32) { ZCACC(sv[0],sv[1],sv[32+0],sv[32+1]); EMUSYNC; }
      if (reduce_threads >= 16) { ZCACC(sv[0],sv[1],sv[16+0],sv[16+1]); EMUSYNC; }
      if (reduce_threads >=  8) { ZCACC(sv[0],sv[1], sv[8+0], sv[8+1]); EMUSYNC; }
      if (reduce_threads >=  4) { ZCACC(sv[0],sv[1], sv[4+0], sv[4+1]); EMUSYNC; }
      if (reduce_threads >=  2) { ZCACC(sv[0],sv[1], sv[2+0], sv[2+1]); EMUSYNC; }
    }

  // Write result for this block to global mem as a single QudaSumComplex:
  // sum + compensation collapses the double-single pair.
  if (tid == 0) {
    g_odata[blockIdx.x].x = cdata[0].x+cdata[1].x;
    g_odata[blockIdx.x].y = cdata[0].y+cdata[1].y;
  }
}

#else

/**
 * Complex sum-reduction kernel, plain (uncompensated) accumulation.
 *
 * Launch contract: blockDim.x == reduce_threads (power of two, <= 1024);
 * dynamic shared memory of 2*reduce_threads QudaSumFloat (doubled when
 * reduce_threads <= 32 — see smemSize in the host driver below).
 *
 * NOTE(review): template head restored to match the Kernel<N> launch sites;
 * confirm against the original header.
 */
template <unsigned int reduce_threads, typename Float, typename Float2>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  QudaSumComplex sum;
  sum.x = 0.0;
  sum.y = 0.0;

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_REAL_AUXILIARY(i);
    REDUCE_IMAG_AUXILIARY(i);
    sum.x += REDUCE_REAL_OPERATION(i);
    sum.y += REDUCE_IMAG_OPERATION(i);
    i += gridSize;
  }

  // All real parts first, then all imaginary parts, to avoid bank conflicts.
  extern __shared__ QudaSumFloat cdata[];
  QudaSumFloat *sx = cdata + tid;
  QudaSumFloat *sy = cdata + tid + reduce_threads;

  sx[0] = sum.x;
  sy[0] = sum.y;
  __syncthreads();

  // Tree reduction in shared mem; __syncthreads() is outside the tid guard,
  // so every thread in the block reaches it.
  if (reduce_threads >= 1024) { if (tid < 512) { sx[0] += sx[512]; sy[0] += sy[512]; } __syncthreads(); }
  if (reduce_threads >=  512) { if (tid < 256) { sx[0] += sx[256]; sy[0] += sy[256]; } __syncthreads(); }
  if (reduce_threads >=  256) { if (tid < 128) { sx[0] += sx[128]; sy[0] += sy[128]; } __syncthreads(); }
  if (reduce_threads >=  128) { if (tid <  64) { sx[0] += sx[ 64]; sy[0] += sy[ 64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      // Warp-synchronous tail (see NOTE in the Kahan variant above).
      volatile QudaSumFloat *svx = sx;
      volatile QudaSumFloat *svy = sy;
      if (reduce_threads >= 64) { svx[0] += svx[32]; svy[0] += svy[32]; EMUSYNC; }
      if (reduce_threads >= 32) { svx[0] += svx[16]; svy[0] += svy[16]; EMUSYNC; }
      if (reduce_threads >= 16) { svx[0] += svx[ 8]; svy[0] += svy[ 8]; EMUSYNC; }
      if (reduce_threads >=  8) { svx[0] += svx[ 4]; svy[0] += svy[ 4]; EMUSYNC; }
      if (reduce_threads >=  4) { svx[0] += svx[ 2]; svy[0] += svy[ 2]; EMUSYNC; }
      if (reduce_threads >=  2) { svx[0] += svx[ 1]; svy[0] += svy[ 1]; EMUSYNC; }
    }

  // Write this block's partial result to global memory.
  if (tid == 0) {
    g_odata[blockIdx.x].x = sx[0];
    g_odata[blockIdx.x].y = sy[0];
  }
}

#endif

/**
 * Host driver: launches the reduction kernel sized by setBlock(), copies the
 * per-block partials back, and finishes the sum on the CPU.
 *
 * @param REDUCE_TYPES operand list (macro-provided)
 * @param n            number of elements to reduce
 * @param kernel       kernel id passed through to setBlock()
 * @param precision    operand precision, used by setBlock()
 * @return             the complex sum over all n elements
 *
 * Errors: calls errorQuda() if the grid exceeds REDUCE_MAX_BLOCKS or the
 * block size has no instantiation; checkCudaError() after the launch unless
 * this is a tuning run (blas_test then inspects the error itself).
 */
template <typename Float, typename Float2>
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {

  setBlock(kernel, n, precision);

  if (blasGrid.x > REDUCE_MAX_BLOCKS) {
    errorQuda("reduce_complex: grid size %d must be smaller than %d", blasGrid.x, REDUCE_MAX_BLOCKS);
  }

  // Shared-memory budget: the Kahan kernel stores two QudaSumComplex per
  // thread, the plain kernel one complex (two floats) per thread. For blocks
  // of <= 32 threads the allocation is doubled so the warp-tail's by-design
  // out-of-range reads remain inside the allocation.
#if (REDUCE_TYPE == REDUCE_KAHAN)
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 4 * sizeof(QudaSumComplex) : blasBlock.x * 2 * sizeof(QudaSumComplex);
#else
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 2 * sizeof(QudaSumComplex) : blasBlock.x * sizeof(QudaSumComplex);
#endif

  // Dispatch on the (compile-time) block size; Float/Float2 are deduced
  // from the REDUCE_PARAMS arguments.
  if (blasBlock.x == 32) {
    REDUCE_FUNC_NAME(Kernel)<32><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 64) {
    REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 128) {
    REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 256) {
    REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 512) {
    REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 1024) {
    REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else {
    errorQuda("Reduction not implemented for %d threads", blasBlock.x);
  }

  // Copy result from device to host, and perform final reduction on CPU.
  // The blocking cudaMemcpy also synchronizes with the kernel above.
  cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);

  // For a tuning run, let blas_test check the error condition.
  if (!blasTuning) checkCudaError();

  cuDoubleComplex gpu_result;
  gpu_result.x = 0;
  gpu_result.y = 0;
  for (unsigned int i = 0; i < blasGrid.x; i++) {
    gpu_result.x += h_reduceComplex[i].x;
    gpu_result.y += h_reduceComplex[i].y;
  }

  return gpu_result;
}
/* Listing extracted by Doxygen 1.7.3 */