/*
   QUDA v0.3.2
   A library for QCD on GPUs
*/
// Triple reduction core: computes three independent sums (x, y, z) over n
// elements in one pass. This file is a macro template -- the includer defines
// REDUCE_FUNC_NAME, REDUCE_TYPES, REDUCE_PARAMS and the REDUCE_{X,Y,Z}_*
// macros before including it, selecting the concrete operation.
//
// NOTE(review): the host launcher below instantiates the kernel as
// REDUCE_FUNC_NAME(Kernel)<32> etc., so each kernel definition must be
// preceded by a template declaration over reduce_threads (e.g.
// "template <unsigned int reduce_threads, ...>"). No such line is visible in
// this extraction -- confirm against the original reduce_triple_core.h.

#if (REDUCE_TYPE == REDUCE_KAHAN)

// Compensated (double-single) accumulation: each logical sum is carried as a
// (hi, lo) pair combined via dsadd/dsadd3, which are presumably the usual
// double-single add routines defined elsewhere -- TODO confirm.
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
#define DSACC3(c0, c1, a0, a1) dsadd3((c0), (c1), (c0), (c1), (a0), (a1))

// Kahan-variant kernel: each thread accumulates three (hi, lo) partial sums
// over a grid-stride loop, the block then tree-reduces them in shared memory,
// and thread 0 writes one partial QudaSumFloat3 per block to g_odata.
// Dynamic shared memory: two QudaSumFloat3 slots per thread (hi and lo);
// the host passes 2 * blasBlock.x * sizeof(QudaSumFloat3) (doubled again for
// blocks <= 32 -- see smemSize below).
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  // per-thread double-single accumulators: (acc0,acc1)=x, (acc2,acc3)=y,
  // (acc4,acc5)=z, each as a (hi, lo) pair
  QudaSumFloat acc0 = 0;
  QudaSumFloat acc1 = 0;
  QudaSumFloat acc2 = 0;
  QudaSumFloat acc3 = 0;
  QudaSumFloat acc4 = 0;
  QudaSumFloat acc5 = 0;

  // grid-stride loop: correctness is independent of the launch configuration
  while (i < n) {
    REDUCE_X_AUXILIARY(i);
    REDUCE_Y_AUXILIARY(i);
    REDUCE_Z_AUXILIARY(i);
    DSACC(acc0, acc1, REDUCE_X_OPERATION(i), 0);
    DSACC(acc2, acc3, REDUCE_Y_OPERATION(i), 0);
    DSACC(acc4, acc5, REDUCE_Z_OPERATION(i), 0);
    i += gridSize;
  }

  // stage the per-thread pairs in shared memory: thread tid owns slots
  // s[0] (hi parts of x,y,z) and s[1] (lo parts)
  extern __shared__ QudaSumFloat3 tdata[];
  QudaSumFloat3 *s = tdata + 2*tid;
  s[0].x = acc0;
  s[1].x = acc1;
  s[0].y = acc2;
  s[1].y = acc3;
  s[0].z = acc4;
  s[1].z = acc5;

  __syncthreads();

  // tree reduction over shared memory; reduce_threads is a compile-time
  // constant, so dead branches are eliminated. Offsets are in QudaSumFloat3
  // slots: partner thread (tid + k) lives at s[2*k] / s[2*k + 1].
  if (reduce_threads >= 1024) { if (tid < 512) { DSACC3(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
  if (reduce_threads >= 512) { if (tid < 256) { DSACC3(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) { DSACC3(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
  if (reduce_threads >= 128) { if (tid < 64) { DSACC3(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }

  // final warp: classic pre-Fermi warp-synchronous tail -- no __syncthreads(),
  // volatile pointer to force shared-memory reloads; EMUSYNC is presumably a
  // no-op on hardware and a barrier under device emulation (all threads enter
  // this scope in emulation, hence the #ifndef guard)
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      volatile QudaSumFloat3 *sv = s;
      if (reduce_threads >= 64) { DSACC3(sv[0],sv[1],sv[64+0],sv[64+1]); EMUSYNC; }
      if (reduce_threads >= 32) { DSACC3(sv[0],sv[1],sv[32+0],sv[32+1]); EMUSYNC; }
      if (reduce_threads >= 16) { DSACC3(sv[0],sv[1],sv[16+0],sv[16+1]); EMUSYNC; }
      if (reduce_threads >= 8) { DSACC3(sv[0],sv[1], sv[8+0], sv[8+1]); EMUSYNC; }
      if (reduce_threads >= 4) { DSACC3(sv[0],sv[1], sv[4+0], sv[4+1]); EMUSYNC; }
      if (reduce_threads >= 2) { DSACC3(sv[0],sv[1], sv[2+0], sv[2+1]); EMUSYNC; }
    }

  // write result for this block to global mem as single QudaSumFloat3
  // (hi + lo collapsed into a plain sum)
  if (tid == 0) {
    g_odata[blockIdx.x].x = tdata[0].x+tdata[1].x;
    g_odata[blockIdx.x].y = tdata[0].y+tdata[1].y;
    g_odata[blockIdx.x].z = tdata[0].z+tdata[1].z;
  }
}

#else

/*#define SUM_DOUBLE3(s, i, j) \
  s[i].x += s[j].x;            \
  s[i].y += s[j].y;            \
  s[i].z += s[j].z;*/

// Token-pasting variant of the (commented-out) AoS form above: expands
// "SUM_DOUBLE3(s, i, j)" into sx[i] += sx[j]; sy[i] += sy[j]; sz[i] += sz[j];
// so it works with the structure-of-arrays shared-memory layout below.
#define SUM_DOUBLE3(s, i, j) \
  s##x[i] += s##x[j];        \
  s##y[i] += s##y[j];        \
  s##z[i] += s##z[j];

// Plain-summation kernel: same structure as the Kahan variant, but with
// straight += accumulation directly in shared memory. Dynamic shared memory:
// three QudaSumFloat arrays of reduce_threads each (the host passes
// blasBlock.x * sizeof(QudaSumFloat3), doubled for blocks <= 32).
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  // all x, y then z to avoid bank conflicts
  extern __shared__ QudaSumFloat tdata[];
  QudaSumFloat *sx = tdata + tid;
  QudaSumFloat *sy = tdata + tid + reduce_threads;
  QudaSumFloat *sz = tdata + tid + 2*reduce_threads;
  sx[0] = 0;
  sy[0] = 0;
  sz[0] = 0;

  // grid-stride loop, accumulating straight into this thread's shared slots
  while (i < n) {
    REDUCE_X_AUXILIARY(i);
    REDUCE_Y_AUXILIARY(i);
    REDUCE_Z_AUXILIARY(i);
    sx[0] += REDUCE_X_OPERATION(i);
    sy[0] += REDUCE_Y_OPERATION(i);
    sz[0] += REDUCE_Z_OPERATION(i);
    i += gridSize;
  }

  __syncthreads();

  // do reduction in shared mem
  if (reduce_threads >= 1024)
    { if (tid < 512) { SUM_DOUBLE3(s, 0, 512); } __syncthreads(); }
  if (reduce_threads >= 512)
    { if (tid < 256) { SUM_DOUBLE3(s, 0, 256); } __syncthreads(); }
  if (reduce_threads >= 256)
    { if (tid < 128) { SUM_DOUBLE3(s, 0, 128); } __syncthreads(); }
  if (reduce_threads >= 128)
    { if (tid < 64) { SUM_DOUBLE3(s, 0, 64); } __syncthreads(); }

  // warp-synchronous tail (see notes on the Kahan variant above); the
  // token-pasted SUM_DOUBLE3(sv, ...) resolves to the volatile aliases
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      volatile QudaSumFloat *svx = sx;
      volatile QudaSumFloat *svy = sy;
      volatile QudaSumFloat *svz = sz;
      if (reduce_threads >= 64) { SUM_DOUBLE3(sv, 0,32); EMUSYNC; }
      if (reduce_threads >= 32) { SUM_DOUBLE3(sv, 0,16); EMUSYNC; }
      if (reduce_threads >= 16) { SUM_DOUBLE3(sv, 0,8); EMUSYNC; }
      if (reduce_threads >= 8) { SUM_DOUBLE3(sv, 0,4); EMUSYNC; }
      if (reduce_threads >= 4) { SUM_DOUBLE3(sv, 0,2); EMUSYNC; }
      if (reduce_threads >= 2) { SUM_DOUBLE3(sv, 0,1); EMUSYNC; }
    }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x].x = sx[0];
    g_odata[blockIdx.x].y = sy[0];
    g_odata[blockIdx.x].z = sz[0];
  }
}

#undef SUM_DOUBLE3

#endif

// Host launcher: picks the block size via setBlock(), launches the kernel
// instantiation matching blasBlock.x, copies one partial QudaSumFloat3 per
// block back to the host, and finishes the reduction on the CPU.
// Returns the (x, y, z) triple of sums as a double3.
template <typename Float2>
double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {

  setBlock(kernel, n, precision);

  // d_reduceFloat3/h_reduceFloat3 hold REDUCE_MAX_BLOCKS entries at most
  if (blasGrid.x > REDUCE_MAX_BLOCKS) {
    errorQuda("reduce_triple_core: grid size %d must be smaller than %d", blasGrid.x, REDUCE_MAX_BLOCKS);
  }

  // Kahan needs two QudaSumFloat3 slots per thread (hi+lo); plain needs one.
  // The <= 32 case is doubled -- presumably padding so the warp-synchronous
  // tail's reads past index reduce_threads stay inside the allocation; confirm.
#if (REDUCE_TYPE == REDUCE_KAHAN)
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 4 * sizeof(QudaSumFloat3) : blasBlock.x * 2 * sizeof(QudaSumFloat3);
#else
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 2 * sizeof(QudaSumFloat3) : blasBlock.x * sizeof(QudaSumFloat3);
#endif

  // dispatch on the runtime block size to the compile-time instantiation
  if (blasBlock.x == 32) {
    REDUCE_FUNC_NAME(Kernel)<32><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 64) {
    REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 128) {
    REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 256) {
    REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 512) {
    REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else if (blasBlock.x == 1024) {
    REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
  } else {
    errorQuda("Reduction not implemented for %d threads", blasBlock.x);
  }

  // copy result from device to host, and perform final reduction on CPU
  // (blocking cudaMemcpy also synchronizes with the kernel above)
  cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);

  // for a tuning run, let blas_test check the error condition
  if (!blasTuning) checkCudaError();

  double3 gpu_result;
  gpu_result.x = 0;
  gpu_result.y = 0;
  gpu_result.z = 0;
  for (unsigned int i = 0; i < blasGrid.x; i++) {
    gpu_result.x += h_reduceFloat3[i].x;
    gpu_result.y += h_reduceFloat3[i].y;
    gpu_result.z += h_reduceFloat3[i].z;
  }

  return gpu_result;
}
/* gitweb footer "1.7.3" (git version) -- page-extraction artifact, not part of the source */