//
// QUDA v0.3.2
// A library for QCD on GPUs
//
#if (REDUCE_TYPE == REDUCE_KAHAN)

// Double-single ("Kahan"-style) accumulation: a value is carried as a
// high/low pair (c0, c1), and dsadd() folds a new pair (a0, a1) into it
// while recovering the bits lost to rounding.
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))

// Partial-reduction kernel, double-single accumulation path.
//
// Each block reduces a grid-stride slice of the n inputs into a per-thread
// (hi, lo) pair, stages the pairs in dynamic shared memory, tree-reduces
// within the block, and writes one partial sum per block to
// g_odata[blockIdx.x]. The host wrapper finishes the reduction on the CPU.
//
// Preconditions (enforced by the host wrapper below):
//   - reduce_threads == blockDim.x, a power of two in 32..1024;
//   - dynamic shared memory holds 2*blockDim.x QudaSumFloats, doubled to
//     4*blockDim.x when blockDim.x <= 32 so the in-warp reads below stay
//     in bounds.
//
// NOTE(review): the extracted listing showed no template header, but the
// host wrapper instantiates Kernel<32>..<1024> and reduce_threads is used
// as a compile-time constant; restored here -- confirm against upstream.
template <unsigned int reduce_threads, typename Float>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  QudaSumFloat acc0 = 0;  // high word of the running sum
  QudaSumFloat acc1 = 0;  // low (compensation) word

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_AUXILIARY(i);
    DSACC(acc0, acc1, REDUCE_OPERATION(i), 0);
    i += gridSize;
  }

  // Thread t owns sdata[2t] (hi) and sdata[2t+1] (lo), so the partner at
  // thread tid+k lives at offset 2k from s.
  extern __shared__ QudaSumFloat sdata[];
  QudaSumFloat *s = sdata + 2*tid;
  s[0] = acc0;
  s[1] = acc1;

  __syncthreads();

  // Tree reduction in shared memory; reduce_threads is a compile-time
  // constant, so untaken branches are eliminated.
  if (reduce_threads >= 1024) { if (tid < 512) { DSACC(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
  if (reduce_threads >= 512) { if (tid < 256) { DSACC(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
  if (reduce_threads >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }

  // Final warp: relies on warp-synchronous execution (pre-Volta
  // assumption; NOTE(review): would need __syncwarp() on SM70+). The
  // volatile pointer forces the shared-memory traffic; EMUSYNC is a no-op
  // except under device emulation.
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
  {
    volatile QudaSumFloat *sv = s;
    if (reduce_threads >= 64) { DSACC(sv[0],sv[1],sv[64+0],sv[64+1]); EMUSYNC; }
    if (reduce_threads >= 32) { DSACC(sv[0],sv[1],sv[32+0],sv[32+1]); EMUSYNC; }
    if (reduce_threads >= 16) { DSACC(sv[0],sv[1],sv[16+0],sv[16+1]); EMUSYNC; }
    if (reduce_threads >= 8) { DSACC(sv[0],sv[1], sv[8+0], sv[8+1]); EMUSYNC; }
    if (reduce_threads >= 4) { DSACC(sv[0],sv[1], sv[4+0], sv[4+1]); EMUSYNC; }
    if (reduce_threads >= 2) { DSACC(sv[0],sv[1], sv[2+0], sv[2+1]); EMUSYNC; }
  }

  // write result for this block to global mem as a single value (hi + lo)
  if (tid == 0) g_odata[blockIdx.x] = sdata[0]+sdata[1];
}

#else // true double
// precision kernel

// Partial-reduction kernel, plain double-precision path.
//
// Each block accumulates a grid-stride slice of the n inputs into one
// QudaSumFloat per thread, tree-reduces the per-thread sums in shared
// memory, and writes one partial result per block to g_odata[blockIdx.x].
//
// Preconditions (enforced by the host wrapper below):
//   - reduce_threads == blockDim.x, a power of two in 32..1024;
//   - dynamic shared memory holds blockDim.x QudaSumFloats, doubled when
//     blockDim.x <= 32 so the in-warp reads below stay in bounds.
//
// NOTE(review): the extracted listing showed no template header, but the
// host wrapper instantiates Kernel<32>..<1024> and reduce_threads is used
// as a compile-time constant; restored here -- confirm against upstream.
template <unsigned int reduce_threads, typename Float>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  QudaSumFloat sum = 0;

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_AUXILIARY(i);
    sum += REDUCE_OPERATION(i);
    i += gridSize;
  }

  extern __shared__ QudaSumFloat sdata[];
  QudaSumFloat *s = sdata + tid;

  s[0] = sum;
  __syncthreads();

  // do reduction in shared mem; reduce_threads is a compile-time constant,
  // so untaken branches are eliminated
  if (reduce_threads >= 1024) { if (tid < 512) { s[0] += s[512]; } __syncthreads(); }
  if (reduce_threads >= 512) { if (tid < 256) { s[0] += s[256]; } __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) { s[0] += s[128]; } __syncthreads(); }
  if (reduce_threads >= 128) { if (tid < 64) { s[0] += s[ 64]; } __syncthreads(); }

  // Final warp: relies on warp-synchronous execution (pre-Volta
  // assumption; NOTE(review): would need __syncwarp() on SM70+). The
  // volatile pointer keeps the compiler from caching shared memory in
  // registers; EMUSYNC is a no-op except under device emulation.
#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
  {
    volatile QudaSumFloat *sv = s;
    if (reduce_threads >= 64) { sv[0] += sv[32]; EMUSYNC; }
    if (reduce_threads >= 32) { sv[0] += sv[16]; EMUSYNC; }
    if (reduce_threads >= 16) { sv[0] += sv[ 8]; EMUSYNC; }
    if (reduce_threads >= 8) { sv[0] += sv[ 4]; EMUSYNC; }
    if (reduce_threads >= 4) { sv[0] += sv[ 2]; EMUSYNC; }
    if (reduce_threads >= 2) { sv[0] += sv[ 1]; EMUSYNC; }
  }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x] = s[0];
  }

}

#endif

// Host-side driver: launches the block-reduction kernel, copies the
// per-block partial sums back, and finishes the reduction on the CPU.
//
// Returns the sum over i in [0, n) of REDUCE_OPERATION(i) as a double.
// setBlock() chooses the launch geometry (blasGrid/blasBlock); the grid
// size is capped at REDUCE_MAX_BLOCKS because d_reduceFloat/h_reduceFloat
// hold one QudaSumFloat per block. The kernel is instantiated for each
// supported power-of-two block size, since reduce_threads must be a
// compile-time constant.
template <typename Float>
double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {
  setBlock(kernel, n, precision);

  if (blasGrid.x > REDUCE_MAX_BLOCKS) {
    errorQuda("reduce_core: grid size %d must be smaller than %d", blasGrid.x, REDUCE_MAX_BLOCKS);
  }

  // when there is only one warp per block, we need to allocate two warps
  // worth of shared memory so that we don't index shared memory out of bounds
#if (REDUCE_TYPE == REDUCE_KAHAN)
  // Kahan path stores a (hi, lo) pair per thread, hence the extra factor 2.
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 4 * sizeof(QudaSumFloat) : blasBlock.x * 2 * sizeof(QudaSumFloat);
#else
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 2 * sizeof(QudaSumFloat) : blasBlock.x * sizeof(QudaSumFloat);
#endif

  if (blasBlock.x == 32) {
    REDUCE_FUNC_NAME(Kernel)<32><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 64) {
    REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 128) {
    REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 256) {
    REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 512) {
    REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else if (blasBlock.x == 1024) {
    REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
  } else {
    errorQuda("Reduction not implemented for %d threads", blasBlock.x);
  }

  // copy result from device to host, and perform final reduction on CPU
  // (blocking cudaMemcpy also synchronizes with the kernel launch above)
  cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);

  // for a tuning run, let blas_test check the error condition
  if (!blasTuning) checkCudaError();

  double cpu_sum = 0;
  for (unsigned int i = 0; i < blasGrid.x; i++) cpu_sum += h_reduceFloat[i];

  return cpu_sum;
}
// (listing-generator footer: 1.7.3)