30 extern __shared__
int cache_[];
31 return reinterpret_cast<real *
>(cache_);
42 int j = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
44 for (
int i = 0; i < 2 * a.size; i++) {
45 cache()[j] = *(
reinterpret_cast<const real *
>(a.data) + i);
46 j += blockDim.z * blockDim.y * blockDim.x;
60 int j = (z * blockDim.y + y) * blockDim.x + x;
62 for (
int i = 0; i < 2 * a.size; i++) {
63 *(
reinterpret_cast<real *
>(a.data) + i) =
cache()[j];
64 j += blockDim.z * blockDim.y * blockDim.x;
// Synchronize the cache: forwards directly to __syncthreads(), the block-wide
// barrier. As with any __syncthreads() call, it must be reached by all threads
// of the block (do not call it from divergent control flow).
72 __device__
inline void sync() { __syncthreads(); }
__device__ real * cache()
This is the handle to the shared memory.
__device__ Vector load(int x, int y, int z)
Load a vector from the shared memory cache.
__device__ void sync()
Synchronize the cache.
Class which wraps around a shared memory cache for a Vector type, where each thread in the thread block stores its own Vector in the cache.
__device__ void save(const Vector &a)
Save the vector into the 3-d shared memory cache. Implicitly store the vector at coordinates given by the calling thread's threadIdx (threadIdx.x, threadIdx.y, threadIdx.z).