/*
   QUDA v0.3.2
   A library for QCD on GPUs
*/
/*
   Complex reduction core. This fragment is multiply-included with different
   definitions of the REDUCE_* macros, which the including translation unit
   must provide:

     REDUCE_FUNC_NAME(x)        - produces the concrete kernel/host name
     REDUCE_TYPES               - operand parameter list for kernel and host
     REDUCE_PARAMS              - matching argument list at the launch site
     REDUCE_REAL_AUXILIARY(i),
     REDUCE_IMAG_AUXILIARY(i)   - per-element setup statements
     REDUCE_REAL_OPERATION(i),
     REDUCE_IMAG_OPERATION(i)   - per-element real/imaginary summands

   Each block writes one partial QudaSumComplex to g_odata; the final sum
   over blocks is performed on the host.
*/

#if (REDUCE_TYPE == REDUCE_KAHAN)

// Double-single accumulation helpers: (c0,c1) += (a0,a1), where each pair
// holds the high/low words of an extended-precision value (see dsadd/zcadd).
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
#define ZCACC(c0, c1, a0, a1) zcadd((c0), (c1), (c0), (c1), (a0), (a1))

/**
 * Complex sum-reduction kernel with compensated (double-single) accumulation.
 *
 * Launch contract: blockDim.x == reduce_threads (power of two, <= 1024);
 * dynamic shared memory of 2*reduce_threads QudaSumComplex (4x when
 * reduce_threads <= 32, so the warp-tail's deliberately out-of-range reads
 * stay inside the allocation — see smemSize in the host driver below).
 *
 * NOTE(review): the template head was missing from the extracted listing but
 * is required by the Kernel<32>..<1024> instantiations in the host driver;
 * restored here — confirm against the original header.
 */
template <unsigned int reduce_threads, typename Float, typename Float2>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  // (sum, compensation) pairs: acc0/acc1 accumulate the real part,
  // acc2/acc3 the imaginary part.
  QudaSumFloat acc0 = 0;
  QudaSumFloat acc1 = 0;
  QudaSumFloat acc2 = 0;
  QudaSumFloat acc3 = 0;

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_REAL_AUXILIARY(i);
    REDUCE_IMAG_AUXILIARY(i);
    DSACC(acc0, acc1, REDUCE_REAL_OPERATION(i), 0);
    DSACC(acc2, acc3, REDUCE_IMAG_OPERATION(i), 0);
    i += gridSize;
  }

  // Each thread owns TWO consecutive shared-memory entries: the (sum,
  // compensation) pair, packed real in .x and imaginary in .y.
  extern __shared__ QudaSumComplex cdata[];
  QudaSumComplex *s = cdata + 2*tid;
  s[0].x = acc0;
  s[1].x = acc1;
  s[0].y = acc2;
  s[1].y = acc3;

  __syncthreads();

  // Tree reduction in shared memory. Offsets are twice those of the plain
  // kernel because every thread holds two entries (2 * 512 = 1024, etc.).
  // __syncthreads() sits outside the tid guard, so all threads reach it.
  if (reduce_threads >= 1024) { if (tid < 512) { ZCACC(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
  if (reduce_threads >=  512) { if (tid < 256) { ZCACC(s[0],s[1],s[ 512+0],s[ 512+1]); } __syncthreads(); }
  if (reduce_threads >=  256) { if (tid < 128) { ZCACC(s[0],s[1],s[ 256+0],s[ 256+1]); } __syncthreads(); }
  if (reduce_threads >=  128) { if (tid <  64) { ZCACC(s[0],s[1],s[ 128+0],s[ 128+1]); } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      // Warp-synchronous tail: volatile forces reloads from shared memory;
      // EMUSYNC is a barrier only under device emulation.
      // NOTE(review): relies on implicit intra-warp lockstep — unsafe on
      // Volta+ independent thread scheduling; would need __syncwarp() there.
      volatile QudaSumComplex *sv = s;
      if (reduce_threads >= 64) { ZCACC(sv[0],sv[1],sv[64+0],sv[64+1]); EMUSYNC; }
      if (reduce_threads >= 32) { ZCACC(sv[0],sv[1],sv[32+0],sv[32+1]); EMUSYNC; }
      if (reduce_threads >= 16) { ZCACC(sv[0],sv[1],sv[16+0],sv[16+1]); EMUSYNC; }
      if (reduce_threads >=  8) { ZCACC(sv[0],sv[1], sv[8+0], sv[8+1]); EMUSYNC; }
      if (reduce_threads >=  4) { ZCACC(sv[0],sv[1], sv[4+0], sv[4+1]); EMUSYNC; }
      if (reduce_threads >=  2) { ZCACC(sv[0],sv[1], sv[2+0], sv[2+1]); EMUSYNC; }
    }

  // Write result for this block to global mem as a single QudaSumComplex:
  // sum + compensation collapses the double-single pair.
  if (tid == 0) {
    g_odata[blockIdx.x].x = cdata[0].x+cdata[1].x;
    g_odata[blockIdx.x].y = cdata[0].y+cdata[1].y;
  }
}

#else

/**
 * Complex sum-reduction kernel, plain (uncompensated) accumulation.
 *
 * Launch contract: blockDim.x == reduce_threads (power of two, <= 1024);
 * dynamic shared memory of 2*reduce_threads QudaSumFloat (doubled when
 * reduce_threads <= 32 — see smemSize in the host driver below).
 *
 * NOTE(review): template head restored to match the Kernel<N> launch sites;
 * confirm against the original header.
 */
template <unsigned int reduce_threads, typename Float, typename Float2>
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
  unsigned int gridSize = reduce_threads*gridDim.x;

  QudaSumComplex sum;
  sum.x = 0.0;
  sum.y = 0.0;

  // Grid-stride loop: correctness is independent of the launch geometry.
  while (i < n) {
    REDUCE_REAL_AUXILIARY(i);
    REDUCE_IMAG_AUXILIARY(i);
    sum.x += REDUCE_REAL_OPERATION(i);
    sum.y += REDUCE_IMAG_OPERATION(i);
    i += gridSize;
  }

  // All real parts first, then all imaginary parts, to avoid bank conflicts.
  extern __shared__ QudaSumFloat cdata[];
  QudaSumFloat *sx = cdata + tid;
  QudaSumFloat *sy = cdata + tid + reduce_threads;

  sx[0] = sum.x;
  sy[0] = sum.y;
  __syncthreads();

  // Tree reduction in shared mem; __syncthreads() is outside the tid guard,
  // so every thread in the block reaches it.
  if (reduce_threads >= 1024) { if (tid < 512) { sx[0] += sx[512]; sy[0] += sy[512]; } __syncthreads(); }
  if (reduce_threads >=  512) { if (tid < 256) { sx[0] += sx[256]; sy[0] += sy[256]; } __syncthreads(); }
  if (reduce_threads >=  256) { if (tid < 128) { sx[0] += sx[128]; sy[0] += sy[128]; } __syncthreads(); }
  if (reduce_threads >=  128) { if (tid <  64) { sx[0] += sx[ 64]; sy[0] += sy[ 64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
    {
      // Warp-synchronous tail (see NOTE in the Kahan variant above).
      volatile QudaSumFloat *svx = sx;
      volatile QudaSumFloat *svy = sy;
      if (reduce_threads >= 64) { svx[0] += svx[32]; svy[0] += svy[32]; EMUSYNC; }
      if (reduce_threads >= 32) { svx[0] += svx[16]; svy[0] += svy[16]; EMUSYNC; }
      if (reduce_threads >= 16) { svx[0] += svx[ 8]; svy[0] += svy[ 8]; EMUSYNC; }
      if (reduce_threads >=  8) { svx[0] += svx[ 4]; svy[0] += svy[ 4]; EMUSYNC; }
      if (reduce_threads >=  4) { svx[0] += svx[ 2]; svy[0] += svy[ 2]; EMUSYNC; }
      if (reduce_threads >=  2) { svx[0] += svx[ 1]; svy[0] += svy[ 1]; EMUSYNC; }
    }

  // Write this block's partial result to global memory.
  if (tid == 0) {
    g_odata[blockIdx.x].x = sx[0];
    g_odata[blockIdx.x].y = sy[0];
  }
}

#endif

/**
 * Host driver: launches the reduction kernel sized by setBlock(), copies the
 * per-block partials back, and finishes the sum on the CPU.
 *
 * @param REDUCE_TYPES operand list (macro-provided)
 * @param n            number of elements to reduce
 * @param kernel       kernel id passed through to setBlock()
 * @param precision    operand precision, used by setBlock()
 * @return             the complex sum over all n elements
 *
 * Errors: calls errorQuda() if the grid exceeds REDUCE_MAX_BLOCKS or the
 * block size has no instantiation; checkCudaError() after the launch unless
 * this is a tuning run (blas_test then inspects the error itself).
 */
template <typename Float, typename Float2>
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {

  setBlock(kernel, n, precision);

  if (blasGrid.x > REDUCE_MAX_BLOCKS) {
    errorQuda("reduce_complex: grid size %d must be smaller than %d", blasGrid.x, REDUCE_MAX_BLOCKS);
  }

  // Shared-memory budget: the Kahan kernel stores two QudaSumComplex per
  // thread, the plain kernel one complex (two floats) per thread. For blocks
  // of <= 32 threads the allocation is doubled so the warp-tail's by-design
  // out-of-range reads remain inside the allocation.
#if (REDUCE_TYPE == REDUCE_KAHAN)
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 4 * sizeof(QudaSumComplex) : blasBlock.x * 2 * sizeof(QudaSumComplex);
#else
  size_t smemSize = (blasBlock.x <= 32) ? blasBlock.x * 2 * sizeof(QudaSumComplex) : blasBlock.x * sizeof(QudaSumComplex);
#endif

  // Dispatch on the (compile-time) block size; Float/Float2 are deduced
  // from the REDUCE_PARAMS arguments.
  if (blasBlock.x == 32) {
    REDUCE_FUNC_NAME(Kernel)<32><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 64) {
    REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 128) {
    REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 256) {
    REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 512) {
    REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else if (blasBlock.x == 1024) {
    REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
  } else {
    errorQuda("Reduction not implemented for %d threads", blasBlock.x);
  }

  // Copy result from device to host, and perform final reduction on CPU.
  // The blocking cudaMemcpy also synchronizes with the kernel above.
  cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);

  // For a tuning run, let blas_test check the error condition.
  if (!blasTuning) checkCudaError();

  cuDoubleComplex gpu_result;
  gpu_result.x = 0;
  gpu_result.y = 0;
  for (unsigned int i = 0; i < blasGrid.x; i++) {
    gpu_result.x += h_reduceComplex[i].x;
    gpu_result.y += h_reduceComplex[i].y;
  }

  return gpu_result;
}
/* Listing extracted by Doxygen 1.7.3 */