QUDA v0.4.0
A library for QCD on GPUs
// The following indexing routines work for arbitrary (including odd) lattice dimensions.
// compute an index into the local volume from an index into the face (used by the face packing routines)

template <int dim, int nLayers>
static inline __device__ int indexFromFaceIndex(int face_idx, const int &face_volume,
                                                const int &face_num, const int &parity)
{
  // dimensions of the face (FIXME: optimize using constant cache)

  int face_X = X1, face_Y = X2, face_Z = X3; // face_T = X4;
  switch (dim) {
  case 0:
    face_X = nLayers;
    break;
  case 1:
    face_Y = nLayers;
    break;
  case 2:
    face_Z = nLayers;
    break;
  case 3:
    // face_T = nLayers;
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on offset of first element

  int face_parity;
  switch (dim) {
  case 0:
    face_parity = (parity + face_num * (X1 - nLayers)) & 1;
    break;
  case 1:
    face_parity = (parity + face_num * (X2 - nLayers)) & 1;
    break;
  case 2:
    face_parity = (parity + face_num * (X3 - nLayers)) & 1;
    break;
  case 3:
    face_parity = (parity + face_num * (X4 - nLayers)) & 1;
    break;
  }

  // reconstruct full face index from index into the checkerboard

  face_idx *= 2;

  if (!(face_X & 1)) { // face_X even
    //   int t = face_idx / face_XYZ;
    //   int z = (face_idx / face_XY) % face_Z;
    //   int y = (face_idx / face_X) % face_Y;
    //   face_idx += (face_parity + t + z + y) & 1;
    // equivalent to the above, but with fewer divisions/mods:
    int aux1 = face_idx / face_X;
    int aux2 = aux1 / face_Y;
    int y = aux1 - aux2 * face_Y;
    int t = aux2 / face_Z;
    int z = aux2 - t * face_Z;
    face_idx += (face_parity + t + z + y) & 1;
  } else if (!(face_Y & 1)) { // face_Y even
    int t = face_idx / face_XYZ;
    int z = (face_idx / face_XY) % face_Z;
    face_idx += (face_parity + t + z) & 1;
  } else if (!(face_Z & 1)) { // face_Z even
    int t = face_idx / face_XYZ;
    face_idx += (face_parity + t) & 1;
  } else {
    face_idx += face_parity;
  }

  // compute index into the full local volume

  int idx = face_idx;
  int gap, aux;

  switch (dim) {
  case 0:
    gap = X1 - nLayers;
    aux = face_idx / face_X;
    idx += (aux + face_num) * gap;
    break;
  case 1:
    gap = X2 - nLayers;
    aux = face_idx / face_XY;
    idx += (aux + face_num) * gap * face_X;
    break;
  case 2:
    gap = X3 - nLayers;
    aux = face_idx / face_XYZ;
    idx += (aux + face_num) * gap * face_XY;
    break;
  case 3:
    gap = X4 - nLayers;
    idx += face_num * gap * face_XYZ;
    break;
  }

  // return index into the checkerboard

  return idx >> 1;
}
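// Illustrative worked example (not from the original source; assumes a
// hypothetical local volume X1=X2=X3=X4=4): take dim=3, nLayers=1, face_num=1
// (forward T face), parity=0 and checkerboard face index face_idx=5.
//   face_parity = (0 + 1*(4-1)) & 1 = 1
//   face_idx*2 = 10; face_X=4 is even, so y=2, z=0, t=0 and
//   face_idx += (1+0+0+2) & 1 = 1  ->  11, i.e. face site (x,y,z,t) = (3,2,0,0).
//   dim==3: gap = 4-1 = 3, idx = 11 + 1*3*64 = 203, the full-volume index of
//   site (3,2,0,3) on the last time slice; the routine returns 203 >> 1 = 101.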
// compute an index into the local volume from an index into the face (used by the face packing routines)
// G.Shi: the spinor order in ghost region is different between wilson and asqtad, thus different index
// computing routine.
template <int dim, int nLayers>
static inline __device__ int indexFromFaceIndexAsqtad(int face_idx, const int &face_volume,
                                                      const int &face_num, const int &parity)
{
  // dimensions of the face (FIXME: optimize using constant cache)
  int dims[3];
  int V = 2*Vh;
  int face_X = X1, face_Y = X2, face_Z = X3; // face_T = X4;
  switch (dim) {
  case 0:
    face_X = nLayers;
    dims[0]=X2; dims[1]=X3; dims[2]=X4;
    break;
  case 1:
    face_Y = nLayers;
    dims[0]=X1; dims[1]=X3; dims[2]=X4;
    break;
  case 2:
    face_Z = nLayers;
    dims[0]=X1; dims[1]=X2; dims[2]=X4;
    break;
  case 3:
    // face_T = nLayers;
    dims[0]=X1; dims[1]=X2; dims[2]=X4;
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // intrinsic parity of the face depends on offset of first element

  int face_parity;
  switch (dim) {
  case 0:
    face_parity = (parity + face_num * (X1 - nLayers)) & 1;
    break;
  case 1:
    face_parity = (parity + face_num * (X2 - nLayers)) & 1;
    break;
  case 2:
    face_parity = (parity + face_num * (X3 - nLayers)) & 1;
    break;
  case 3:
    face_parity = (parity + face_num * (X4 - nLayers)) & 1;
    break;
  }

  // reconstruct full face index from index into the checkerboard

  face_idx *= 2;
  /* y, z, t here are face indexes in new order */
  int aux1 = face_idx / dims[0];
  int aux2 = aux1 / dims[1];
  int y = aux1 - aux2 * dims[1];
  int t = aux2 / dims[2];
  int z = aux2 - t * dims[2];
  face_idx += (face_parity + t + z + y) & 1;

  int idx = face_idx;
  int gap, aux;

  switch (dim) {
  case 0:
    gap = X1 - nLayers;
    aux = face_idx;
    idx += face_num*gap + aux*(X1-1);
    idx += idx/V*(1-V);
    break;
  case 1:
    gap = X2 - nLayers;
    aux = face_idx / face_X;
    idx += face_num * gap * face_X + aux*(X2-1)*face_X;
    idx += idx/V*(X1-V);
    break;
  case 2:
    gap = X3 - nLayers;
    aux = face_idx / face_XY;
    idx += face_num * gap * face_XY + aux*(X3-1)*face_XY;
    idx += idx/V*(X2X1-V);
    break;
  case 3:
    gap = X4 - nLayers;
    idx += face_num * gap * face_XYZ;
    break;
  }

  // return index into the checkerboard

  return idx >> 1;
}
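// Note on the staggered variant (illustrative, not from the original source):
// the asqtad ghost zone is nFace = 3 sites deep and the packed spinors are
// stored in a different site order than the Wilson ghosts (see the comment
// above), hence the extra wrap-around corrections (the idx/V*(...) terms) for
// the X, Y and Z faces.  For the T face (dim == 3) no wrap-around is needed
// and the mapping reduces to the same form as the Wilson routine:
//   idx = face_idx + face_num * (X4 - nLayers) * X1*X2*X3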
// compute full coordinates from an index into the face (used by the exterior Dslash kernels)
template <int nLayers, typename Int>
static inline __device__ void coordsFromFaceIndex(int &idx, int &cb_idx, Int &X, Int &Y, Int &Z, Int &T, int face_idx,
                                                  const int &face_volume, const int &dim, const int &face_num, const int &parity)
{
  // dimensions of the face (FIXME: optimize using constant cache)

  int face_X = X1, face_Y = X2, face_Z = X3;
  int face_parity;
  switch (dim) {
  case 0:
    face_X = nLayers;
    face_parity = (parity + face_num * (X1 - nLayers)) & 1;
    break;
  case 1:
    face_Y = nLayers;
    face_parity = (parity + face_num * (X2 - nLayers)) & 1;
    break;
  case 2:
    face_Z = nLayers;
    face_parity = (parity + face_num * (X3 - nLayers)) & 1;
    break;
  case 3:
    face_parity = (parity + face_num * (X4 - nLayers)) & 1;
    break;
  }
  int face_XYZ = face_X * face_Y * face_Z;
  int face_XY = face_X * face_Y;

  // compute coordinates from (checkerboard) face index

  face_idx *= 2;

  int x, y, z, t;

  if (!(face_X & 1)) { // face_X even
    //   t = face_idx / face_XYZ;
    //   z = (face_idx / face_XY) % face_Z;
    //   y = (face_idx / face_X) % face_Y;
    //   face_idx += (face_parity + t + z + y) & 1;
    //   x = face_idx % face_X;
    // equivalent to the above, but with fewer divisions/mods:
    int aux1 = face_idx / face_X;
    x = face_idx - aux1 * face_X;
    int aux2 = aux1 / face_Y;
    y = aux1 - aux2 * face_Y;
    t = aux2 / face_Z;
    z = aux2 - t * face_Z;
    x += (face_parity + t + z + y) & 1;
    // face_idx += (face_parity + t + z + y) & 1;
  } else if (!(face_Y & 1)) { // face_Y even
    t = face_idx / face_XYZ;
    z = (face_idx / face_XY) % face_Z;
    face_idx += (face_parity + t + z) & 1;
    y = (face_idx / face_X) % face_Y;
    x = face_idx % face_X;
  } else if (!(face_Z & 1)) { // face_Z even
    t = face_idx / face_XYZ;
    face_idx += (face_parity + t) & 1;
    z = (face_idx / face_XY) % face_Z;
    y = (face_idx / face_X) % face_Y;
    x = face_idx % face_X;
  } else {
    face_idx += face_parity;
    t = face_idx / face_XYZ;
    z = (face_idx / face_XY) % face_Z;
    y = (face_idx / face_X) % face_Y;
    x = face_idx % face_X;
  }

  //printf("Local sid %d (%d, %d, %d, %d)\n", cb_int, x, y, z, t);

  // need to convert to global coords, not face coords
  switch(dim) {
  case 0:
    x += face_num * (X1-nLayers);
    break;
  case 1:
    y += face_num * (X2-nLayers);
    break;
  case 2:
    z += face_num * (X3-nLayers);
    break;
  case 3:
    t += face_num * (X4-nLayers);
    break;
  }

  // compute index into the full local volume

  idx = X1*(X2*(X3*t + z) + y) + x;

  // compute index into the checkerboard

  cb_idx = idx >> 1;

  X = x;
  Y = y;
  Z = z;
  T = t;

  //printf("Global sid %d (%d, %d, %d, %d)\n", cb_int, x, y, z, t);
}
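// Continuing the illustrative example above (hypothetical X1=X2=X3=X4=4,
// dim=3, nLayers=1, face_num=1, parity=0, face_idx=5): this routine recovers
// the face coordinates (x,y,z,t) = (3,2,0,0), shifts t += face_num*(X4-nLayers)
// to get the global site (3,2,0,3), and returns
//   idx    = X1*(X2*(X3*t + z) + y) + x = 203
//   cb_idx = idx >> 1 = 101
// consistent with indexFromFaceIndex above.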
// compute coordinates from index into the checkerboard (used by the interior Dslash kernels)
template <typename Int>
static __device__ __forceinline__ void coordsFromIndex(int &idx, Int &X, Int &Y, Int &Z, Int &T, const int &cb_idx, const int &parity)
{

  int &LX = X1;
  int &LY = X2;
  int &LZ = X3;
  int &XYZ = X3X2X1;
  int &XY = X2X1;

  idx = 2*cb_idx;

  int x, y, z, t;

  if (!(LX & 1)) { // X even
    //   t = idx / XYZ;
    //   z = (idx / XY) % Z;
    //   y = (idx / X) % Y;
    //   idx += (parity + t + z + y) & 1;
    //   x = idx % X;
    // equivalent to the above, but with fewer divisions/mods:
    int aux1 = idx / LX;
    x = idx - aux1 * LX;
    int aux2 = aux1 / LY;
    y = aux1 - aux2 * LY;
    t = aux2 / LZ;
    z = aux2 - t * LZ;
    aux1 = (parity + t + z + y) & 1;
    x += aux1;
    idx += aux1;
  } else if (!(LY & 1)) { // Y even
    t = idx / XYZ;
    z = (idx / XY) % LZ;
    idx += (parity + t + z) & 1;
    y = (idx / LX) % LY;
    x = idx % LX;
  } else if (!(LZ & 1)) { // Z even
    t = idx / XYZ;
    idx += (parity + t) & 1;
    z = (idx / XY) % LZ;
    y = (idx / LX) % LY;
    x = idx % LX;
  } else {
    idx += parity;
    t = idx / XYZ;
    z = (idx / XY) % LZ;
    y = (idx / LX) % LY;
    x = idx % LX;
  }

  X = x;
  Y = y;
  Z = z;
  T = t;
}


// routines for packing the ghost zones (multi-GPU only)

#ifdef MULTI_GPU

#ifdef GPU_WILSON_DIRAC

// double precision
#if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
#define READ_SPINOR READ_SPINOR_DOUBLE
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_DOUBLE_TEX
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
#define SPINORTEX spinorTexDouble
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
#define SPINOR_DOUBLE
template <int dim, int dagger>
static inline __device__ void packFaceWilsonCore(double2 *out, float *outNorm, const double2 *in, const float *inNorm,
                                                 const int &idx, const int &face_idx, const int &face_volume, const int &face_num)
{
#if (__COMPUTE_CAPABILITY__ >= 130)
  if (dagger) {
#include "wilson_pack_face_dagger_core.h"
  } else {
#include "wilson_pack_face_core.h"
  }
#endif // (__COMPUTE_CAPABILITY__ >= 130)
}
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
#undef SPINOR_DOUBLE


// single precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_SINGLE
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_SINGLE_TEX
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
#define SPINORTEX spinorTexSingle
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
template <int dim, int dagger>
static inline __device__ void packFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm,
                                                 const int &idx, const int &face_idx, const int &face_volume, const int &face_num)
{
  if (dagger) {
#include "wilson_pack_face_dagger_core.h"
  } else {
#include "wilson_pack_face_core.h"
  }
}
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR


// half precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_HALF
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_HALF_TEX
#define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
#define SPINORTEX spinorTexHalf
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
template <int dim, int dagger>
static inline __device__ void packFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm,
                                                 const int &idx, const int &face_idx, const int &face_volume, const int &face_num)
{
  if (dagger) {
#include "wilson_pack_face_dagger_core.h"
  } else {
#include "wilson_pack_face_core.h"
  }
}
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
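// The three packFaceWilsonCore() overloads above share a single body:
// wilson_pack_face_core.h / wilson_pack_face_dagger_core.h are written purely
// in terms of READ_SPINOR / SPINORTEX / WRITE_HALF_SPINOR, which are
// re-#define'd per precision and access mode (direct load vs. texture fetch)
// before each #include.  A minimal sketch of the same pattern, with
// hypothetical macro and file names (illustrative only):
#if 0
#define LOAD(i)  load_element(field, i)   // hypothetical precision-specific reader
#define STORE(i) store_element(ghost, i)  // hypothetical precision-specific writer
#include "core_body.h"                    // body written only in terms of LOAD()/STORE()
#undef LOAD
#undef STORE
#endif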

template <int dim, int dagger, typename FloatN>
__global__ void packFaceWilsonKernel(FloatN *out, float *outNorm, const FloatN *in,
                                     const float *inNorm, const int parity)
{
  const int nFace = 1; // 1 face for Wilson
  const int Nint = 12; // output is spin projected
  size_t faceBytes = nFace*ghostFace[dim]*Nint*sizeof(out->x);
  if (sizeof(FloatN) == sizeof(short4)) faceBytes += nFace*ghostFace[dim]*sizeof(float);

  int face_volume = ghostFace[dim];
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;

  if (face_idx >= 2*nFace*face_volume) return;

  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
  const int face_num = (face_idx >= nFace*face_volume) ? 1 : 0;
  face_idx -= face_num*nFace*face_volume;

  // compute an index into the local volume from the index into the face
  const int idx = indexFromFaceIndex<dim, nFace>(face_idx, face_volume, face_num, parity);

  if (face_num) {
    out = (FloatN*)((char*)out + faceBytes);
    outNorm = (float*)((char*)outNorm + faceBytes);
  }

  // read spinor, spin-project, and write half spinor to face
  packFaceWilsonCore<dim, dagger>(out, outNorm, in, inNorm, idx, face_idx, face_volume, face_num);
}

#endif // GPU_WILSON_DIRAC


template <typename FloatN>
void packFaceWilson(FloatN *faces, float *facesNorm, const FloatN *in, const float *inNorm,
                    const int dim, const int dagger, const int parity,
                    const dim3 &gridDim, const dim3 &blockDim, const cudaStream_t &stream)
{
#ifdef GPU_WILSON_DIRAC
  if (dagger) {
    switch (dim) {
    case 0: packFaceWilsonKernel<0,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 1: packFaceWilsonKernel<1,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 2: packFaceWilsonKernel<2,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 3: packFaceWilsonKernel<3,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    }
  } else {
    switch (dim) {
    case 0: packFaceWilsonKernel<0,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 1: packFaceWilsonKernel<1,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 2: packFaceWilsonKernel<2,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 3: packFaceWilsonKernel<3,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    }
  }
#else
  errorQuda("Wilson face packing kernel is not built");
#endif
}
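// Thread-to-site mapping (illustrative, not from the original source): each
// thread of packFaceWilsonKernel packs one face site, and both faces of the
// partitioned dimension are covered by a single launch.  With a hypothetical
// ghostFace[dim] = 128 and nFace = 1, threads 0..127 pack the back face
// (face_num = 0) and threads 128..255 pack the forward face (face_num = 1);
// thread 130 therefore packs face site 2 of the forward face, writing into
// the output buffer advanced by faceBytes (Nint = 12 scalar components per
// site, plus a float norm zone in half precision).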
void packFaceWilson(void *ghost_buf, cudaColorSpinorField &in, const int dim, const int dagger,
                    const int parity, const cudaStream_t &stream) {
  const int nFace = 1; // 1 face for Wilson

  unsigned int threads = in.GhostFace()[dim]*nFace*2;
  dim3 blockDim(64, 1, 1); // TODO: make this a parameter for auto-tuning
  dim3 gridDim( (threads+blockDim.x-1) / blockDim.x, 1, 1);

  // compute location of norm zone
  int Nint = in.Ncolor() * in.Nspin(); // assume spin projection
  float *ghostNorm = (float*)((char*)ghost_buf + Nint*nFace*in.GhostFace()[dim]*in.Precision());

  switch(in.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    packFaceWilson((double2*)ghost_buf, ghostNorm, (double2*)in.V(), (float*)in.Norm(),
                   dim, dagger, parity, gridDim, blockDim, stream);
    break;
  case QUDA_SINGLE_PRECISION:
    packFaceWilson((float4*)ghost_buf, ghostNorm, (float4*)in.V(), (float*)in.Norm(),
                   dim, dagger, parity, gridDim, blockDim, stream);
    break;
  case QUDA_HALF_PRECISION:
    packFaceWilson((short4*)ghost_buf, ghostNorm, (short4*)in.V(), (float*)in.Norm(),
                   dim, dagger, parity, gridDim, blockDim, stream);
    break;
  }
}

#if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX)
template <typename Float2>
__device__ void packSpinor(Float2 *out, float *outNorm, int out_idx, int out_stride,
                           const Float2 *in, const float *inNorm, int in_idx, int in_stride) {
  out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
  out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
  out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
}
template<> __device__ void packSpinor(short2 *out, float *outNorm, int out_idx, int out_stride,
                                      const short2 *in, const float *inNorm, int in_idx, int in_stride) {
  out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
  out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
  out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
  outNorm[out_idx] = inNorm[in_idx];
}
#else
__device__ void packSpinor(double2 *out, float *outNorm, int out_idx, int out_stride,
                           const double2 *in, const float *inNorm, int in_idx, int in_stride) {
  out[out_idx + 0*out_stride] = fetch_double2(spinorTexDouble, in_idx + 0*in_stride);
  out[out_idx + 1*out_stride] = fetch_double2(spinorTexDouble, in_idx + 1*in_stride);
  out[out_idx + 2*out_stride] = fetch_double2(spinorTexDouble, in_idx + 2*in_stride);
}
__device__ void packSpinor(float2 *out, float *outNorm, int out_idx, int out_stride,
                           const float2 *in, const float *inNorm, int in_idx, int in_stride) {
  out[out_idx + 0*out_stride] = tex1Dfetch(spinorTexSingle2, in_idx + 0*in_stride);
  out[out_idx + 1*out_stride] = tex1Dfetch(spinorTexSingle2, in_idx + 1*in_stride);
  out[out_idx + 2*out_stride] = tex1Dfetch(spinorTexSingle2, in_idx + 2*in_stride);
}
__device__ void packSpinor(short2 *out, float *outNorm, int out_idx, int out_stride,
                           const short2 *in, const float *inNorm, int in_idx, int in_stride) {
  out[out_idx + 0*out_stride] = float22short2(1.0f, tex1Dfetch(spinorTexHalf2, in_idx + 0*in_stride));
  out[out_idx + 1*out_stride] = float22short2(1.0f, tex1Dfetch(spinorTexHalf2, in_idx + 1*in_stride));
  out[out_idx + 2*out_stride] = float22short2(1.0f, tex1Dfetch(spinorTexHalf2, in_idx + 2*in_stride));
  outNorm[out_idx] = tex1Dfetch(spinorTexHalfNorm, in_idx);
}
#endif

//
// TODO: add support for textured reads
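// Ghost-buffer layout (illustrative, not from the original source):
// packSpinor() copies the three color components of one staggered spinor from
// the local field (stride in_stride) into the ghost buffer (stride
// out_stride), so each component occupies a contiguous block of out_stride
// elements in the ghost zone; the short2 overloads also copy the per-site
// norm.  Reads go through textures unless DIRECT_ACCESS_PACK or
// FERMI_NO_DBLE_TEX is defined.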
#ifdef GPU_STAGGERED_DIRAC
template <int dim, int ishalf, typename Float2>
__global__ void packFaceAsqtadKernel(Float2 *out, float *outNorm, const Float2 *in,
                                     const float *inNorm, const int parity)
{
  const int nFace = 3; // 3 faces for asqtad
  const int Nint = 6;  // number of internal degrees of freedom
  size_t faceBytes = nFace*ghostFace[dim]*Nint*sizeof(out->x);
  if (ishalf) faceBytes += nFace*ghostFace[dim]*sizeof(float);

  int face_volume = ghostFace[dim];
  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;

  if (face_idx >= 2*nFace*face_volume) return;

  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
  const int face_num = (face_idx >= nFace*face_volume) ? 1 : 0;
  face_idx -= face_num*nFace*face_volume;

  // compute an index into the local volume from the index into the face
  const int idx = indexFromFaceIndexAsqtad<dim, nFace>(face_idx, face_volume, face_num, parity);

  if (face_num) {
    out = (Float2*)((char*)out + faceBytes);
    outNorm = (float*)((char*)outNorm + faceBytes);
  }

  packSpinor(out, outNorm, face_idx, nFace*face_volume, in, inNorm, idx, sp_stride);

  /* Float2 tmp1 = in[idx + 0*sp_stride];
     Float2 tmp2 = in[idx + 1*sp_stride];
     Float2 tmp3 = in[idx + 2*sp_stride];

     out[face_idx + 0*nFace*face_volume] = tmp1;
     out[face_idx + 1*nFace*face_volume] = tmp2;
     out[face_idx + 2*nFace*face_volume] = tmp3;

     if (ishalf) outNorm[face_idx] = inNorm[idx]; */
}
#endif // GPU_STAGGERED_DIRAC


template <typename Float2>
void packFaceAsqtad(Float2 *faces, float *facesNorm, const Float2 *in, const float *inNorm, int dim,
                    const int parity, const dim3 &gridDim, const dim3 &blockDim,
                    const cudaStream_t &stream)
{
#ifdef GPU_STAGGERED_DIRAC
  if (typeid(Float2) != typeid(short2)) {
    switch (dim) {
    case 0: packFaceAsqtadKernel<0,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 1: packFaceAsqtadKernel<1,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 2: packFaceAsqtadKernel<2,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 3: packFaceAsqtadKernel<3,0><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    }
  } else {
    switch (dim) {
    case 0: packFaceAsqtadKernel<0,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 1: packFaceAsqtadKernel<1,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 2: packFaceAsqtadKernel<2,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    case 3: packFaceAsqtadKernel<3,1><<<gridDim, blockDim, 0, stream>>>(faces, facesNorm, in, inNorm, parity); break;
    }
  }
#else
  errorQuda("Asqtad face packing kernel is not built");
#endif
}
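// Dispatch note (illustrative, not from the original source): the
// half-precision path is selected at compile time via the ishalf template
// parameter; packFaceAsqtad() maps Float2 == short2 onto ishalf = 1 through
// the typeid comparison above, which makes the kernel account for the extra
// float norm zone when computing faceBytes.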
void packFaceAsqtad(void *ghost_buf, cudaColorSpinorField &in, const int dim, const int dagger,
                    const int parity, const cudaStream_t &stream) {
  const int nFace = 3; // 3 faces for asqtad

  unsigned int threads = in.GhostFace()[dim]*nFace*2;
  dim3 blockDim(64, 1, 1); // TODO: make this a parameter for auto-tuning
  dim3 gridDim( (threads+blockDim.x-1) / blockDim.x, 1, 1);

  // compute location of norm zone
  int Nint = 6;
  float *ghostNorm = (float*)((char*)ghost_buf + Nint*nFace*in.GhostFace()[dim]*in.Precision());

  switch(in.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    packFaceAsqtad((double2*)ghost_buf, ghostNorm, (double2*)in.V(), (float*)in.Norm(),
                   dim, parity, gridDim, blockDim, stream);
    break;
  case QUDA_SINGLE_PRECISION:
    packFaceAsqtad((float2*)ghost_buf, ghostNorm, (float2*)in.V(), (float*)in.Norm(),
                   dim, parity, gridDim, blockDim, stream);
    break;
  case QUDA_HALF_PRECISION:
    packFaceAsqtad((short2*)ghost_buf, ghostNorm, (short2*)in.V(), (float*)in.Norm(),
                   dim, parity, gridDim, blockDim, stream);
    break;
  }

}

void packFace(void *ghost_buf, cudaColorSpinorField &in, const int dim, const int dagger,
              const int parity, const cudaStream_t &stream)
{
  if (in.Nspin() == 1) {
    packFaceAsqtad(ghost_buf, in, dim, dagger, parity, stream);
  } else {
    packFaceWilson(ghost_buf, in, dim, dagger, parity, stream);
  }
}

#endif // MULTI_GPU
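// Usage sketch (illustrative, not from the original source): packFace() is
// the precision- and action-agnostic entry point; a caller would typically
// pack each partitioned dimension on its own stream before starting the halo
// exchange.  The partition flag and stream array below are hypothetical
// placeholders:
#if 0
for (int dim = 3; dim >= 0; dim--) {
  if (!dimPartitioned[dim]) continue;  // hypothetical per-dimension flag
  packFace(ghost_send_buf[dim], spinor, dim, dagger, parity, packStream[dim]);
}
#endif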