/*
   QUDA v0.3.2
   A library for QCD on GPUs
*/
// Triple reduction core: computes three independent sums (x, y, z) over n
// elements in one pass. This file is a macro template -- the includer defines
// REDUCE_FUNC_NAME, REDUCE_TYPES, REDUCE_PARAMS and the REDUCE_{X,Y,Z}_*
// macros before including it, selecting the concrete operation.
//
// NOTE(review): the host launcher below instantiates the kernel as
// REDUCE_FUNC_NAME(Kernel)<32> etc., so each kernel definition must be
// preceded by a template declaration over reduce_threads (e.g.
// "template <unsigned int reduce_threads, ...>"). No such line is visible in
// this extraction -- confirm against the original reduce_triple_core.h.

#if (REDUCE_TYPE == REDUCE_KAHAN)

// Compensated (double-single) accumulation: each logical sum is carried as a
// (hi, lo) pair combined via dsadd/dsadd3, which are presumably the usual
// double-single add routines defined elsewhere -- TODO confirm.
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
#define DSACC3(c0, c1, a0, a1) dsadd3((c0), (c1), (c0), (c1), (a0), (a1))

// Kahan-variant kernel: each thread accumulates three (hi, lo) partial sums
// over a grid-stride loop, the block then tree-reduces them in shared memory,
// and thread 0 writes one partial QudaSumFloat3 per block to g_odata.
// Dynamic shared memory: two QudaSumFloat3 slots per thread (hi and lo);
// the host passes 2 * blasBlock.x * sizeof(QudaSumFloat3) (doubled again for
// blocks <= 32 -- see smemSize below).
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  // per-thread double-single accumulators: (acc0,acc1)=x, (acc2,acc3)=y,
  // (acc4,acc5)=z, each as a (hi, lo) pair
  QudaSumFloat acc0 = 0;
  QudaSumFloat acc1 = 0;
  QudaSumFloat acc2 = 0;
  QudaSumFloat acc3 = 0;
  QudaSumFloat acc4 = 0;
  QudaSumFloat acc5 = 0;

  // grid-stride loop: correctness is independent of the launch configuration
  while (i < n) {
    REDUCE_X_AUXILIARY(i);
    REDUCE_Y_AUXILIARY(i);
    REDUCE_Z_AUXILIARY(i);
    DSACC(acc0, acc1, REDUCE_X_OPERATION(i), 0);
    DSACC(acc2, acc3, REDUCE_Y_OPERATION(i), 0);
    DSACC(acc4, acc5, REDUCE_Z_OPERATION(i), 0);
    i += gridSize;
  }

  // stage the per-thread pairs in shared memory: thread tid owns slots
  // s[0] (hi parts of x,y,z) and s[1] (lo parts)
  extern __shared__ QudaSumFloat3 tdata[];
  QudaSumFloat3 *s = tdata + 2*tid;
  s[0].x = acc0;
  s[1].x = acc1;
  s[0].y = acc2;
  s[1].y = acc3;
  s[0].z = acc4;
  s[1].z = acc5;

  __syncthreads();

  // tree reduction over shared memory; reduce_threads is a compile-time
  // constant, so dead branches are eliminated. Offsets are in QudaSumFloat3
  // slots: partner thread (tid + k) lives at s[2*k] / s[2*k + 1].
  if (reduce_threads >= 1024) { if (tid < 512) { DSACC3(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
  if (reduce_threads >= 512) { if (tid < 256) { DSACC3(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) { DSACC3(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
  if (reduce_threads >= 128) { if (tid < 64) { DSACC3(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }

  // final warp: classic pre-Fermi warp-synchronous tail -- no __syncthreads(),
  // volatile pointer to force shared-memory reloads; EMUSYNC is presumably a
  // no-op on hardware and a barrier under device emulation (all threads enter
  // this scope in emulation, hence the #ifndef guard)
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      volatile QudaSumFloat3 *sv = s;
      if (reduce_threads >= 64) { DSACC3(sv[0],sv[1],sv[64+0],sv[64+1]); EMUSYNC; }
      if (reduce_threads >= 32) { DSACC3(sv[0],sv[1],sv[32+0],sv[32+1]); EMUSYNC; }
      if (reduce_threads >= 16) { DSACC3(sv[0],sv[1],sv[16+0],sv[16+1]); EMUSYNC; }
      if (reduce_threads >= 8) { DSACC3(sv[0],sv[1], sv[8+0], sv[8+1]); EMUSYNC; }
      if (reduce_threads >= 4) { DSACC3(sv[0],sv[1], sv[4+0], sv[4+1]); EMUSYNC; }
      if (reduce_threads >= 2) { DSACC3(sv[0],sv[1], sv[2+0], sv[2+1]); EMUSYNC; }
    }

  // write result for this block to global mem as single QudaSumFloat3
  // (hi + lo collapsed into a plain sum)
  if (tid == 0) {
    g_odata[blockIdx.x].x = tdata[0].x+tdata[1].x;
    g_odata[blockIdx.x].y = tdata[0].y+tdata[1].y;
    g_odata[blockIdx.x].z = tdata[0].z+tdata[1].z;
  }
}

#else

/*#define SUM_DOUBLE3(s, i, j) \
  s[i].x += s[j].x;            \
  s[i].y += s[j].y;            \
  s[i].z += s[j].z;*/

// Token-pasting variant of the (commented-out) AoS form above: expands
// "SUM_DOUBLE3(s, i, j)" into sx[i] += sx[j]; sy[i] += sy[j]; sz[i] += sz[j];
// so it works with the structure-of-arrays shared-memory layout below.
#define SUM_DOUBLE3(s, i, j) \
  s##x[i] += s##x[j];        \
  s##y[i] += s##y[j];        \
  s##z[i] += s##z[j];

// Plain-summation kernel: same structure as the Kahan variant, but with
// straight += accumulation directly in shared memory. Dynamic shared memory:
// three QudaSumFloat arrays of reduce_threads each (the host passes
// blasBlock.x * sizeof(QudaSumFloat3), doubled for blocks <= 32).
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  // all x, y then z to avoid bank conflicts
  extern __shared__ QudaSumFloat tdata[];
  QudaSumFloat *sx = tdata + tid;
  QudaSumFloat *sy = tdata + tid + reduce_threads;
  QudaSumFloat *sz = tdata + tid + 2*reduce_threads;
  sx[0] = 0;
  sy[0] = 0;
  sz[0] = 0;

  // grid-stride loop, accumulating straight into this thread's shared slots
  while (i < n) {
    REDUCE_X_AUXILIARY(i);
    REDUCE_Y_AUXILIARY(i);
    REDUCE_Z_AUXILIARY(i);
    sx[0] += REDUCE_X_OPERATION(i);
    sy[0] += REDUCE_Y_OPERATION(i);
    sz[0] += REDUCE_Z_OPERATION(i);
    i += gridSize;
  }

  __syncthreads();

  // do reduction in shared mem
  if (reduce_threads >= 1024)
    { if (tid < 512) { SUM_DOUBLE3(s, 0, 512); } __syncthreads(); }
  if (reduce_threads >= 512)
    { if (tid < 256) { SUM_DOUBLE3(s, 0, 256); } __syncthreads(); }
  if (reduce_threads >= 256)
    { if (tid < 128) { SUM_DOUBLE3(s, 0, 128); } __syncthreads(); }
  if (reduce_threads >= 128)
    { if (tid < 64) { SUM_DOUBLE3(s, 0, 64); } __syncthreads(); }

  // warp-synchronous tail (see notes on the Kahan variant above); the
  // token-pasted SUM_DOUBLE3(sv, ...) resolves to the volatile aliases
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      volatile QudaSumFloat *svx = sx;
      volatile QudaSumFloat *svy = sy;
      volatile QudaSumFloat *svz = sz;
      if (reduce_threads >= 64) { SUM_DOUBLE3(sv, 0,32); EMUSYNC; }
      if (reduce_threads >= 32) { SUM_DOUBLE3(sv, 0,16); EMUSYNC; }
      if (reduce_threads >= 16) { SUM_DOUBLE3(sv, 0,8); EMUSYNC; }
      if (reduce_threads >= 8) { SUM_DOUBLE3(sv, 0,4); EMUSYNC; }
      if (reduce_threads >= 4) { SUM_DOUBLE3(sv, 0,2); EMUSYNC; }
      if (reduce_threads >= 2) { SUM_DOUBLE3(sv, 0,1); EMUSYNC; }
    }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x].x = sx[0];
    g_odata[blockIdx.x].y = sy[0];
    g_odata[blockIdx.x].z = sz[0];
  }
}

#undef SUM_DOUBLE3

#endif

// Host launcher: picks the block size via setBlock(), launches the kernel
// instantiation matching blasBlock.x, copies one partial QudaSumFloat3 per
// block back to the host, and finishes the reduction on the CPU.
// Returns the (x, y, z) triple of sums as a double3.
template <typename Float2>
double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {

  setBlock(kernel, n, precision);

  // d_reduceFloat3/h_reduceFloat3 hold REDUCE_MAX_BLOCKS entries at most
  if (blasGrid.x > REDUCE_MAX_BLOCKS) {
    errorQuda("reduce_triple_core: grid size %d must be smaller than %d", blasGrid.x, REDUCE_MAX_BLOCKS);
  }

  // Kahan needs two QudaSumFloat3 slots per thread (hi+lo); plain needs one.
  // The <= 32 case is doubled -- presumably padding so the warp-synchronous
  // tail's reads past index reduce_threads stay inside the allocation; confirm.
#if (REDUCE_TYPE == REDUCE_KAHAN)
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 4 * sizeof(QudaSumFloat3) : blasBlock.x * 2 * sizeof(QudaSumFloat3);
#else
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 2 * sizeof(QudaSumFloat3) : blasBlock.x * sizeof(QudaSumFloat3);
#endif

  // dispatch on the runtime block size to the compile-time instantiation
  if (blasBlock.x == 32) {
    REDUCE_FUNC_NAME(Kernel)<32><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 64) {
    REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 128) {
    REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 256) {
    REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 512) {
    REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 1024) {
    REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else {
    errorQuda("Reduction not implemented for %d threads", blasBlock.x);
  }

  // copy result from device to host, and perform final reduction on CPU
  // (blocking cudaMemcpy also synchronizes with the kernel above)
  cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);

  // for a tuning run, let blas_test check the error condition
  if (!blasTuning) checkCudaError();

  double3 gpu_result;
  gpu_result.x = 0;
  gpu_result.y = 0;
  gpu_result.z = 0;
  for (unsigned int i = 0; i < blasGrid.x; i++) {
    gpu_result.x += h_reduceFloat3[i].x;
    gpu_result.y += h_reduceFloat3[i].y;
    gpu_result.z += h_reduceFloat3[i].z;
  }

  return gpu_result;
}
/* gitweb footer "1.7.3" (git version) -- page-extraction artifact, not part of the source */