36 #define WARP_CONVERGED 0xffffffff
51 static const bool value =
false;
70 int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) &
WARP_MASK;
71 const T *warp_begin_src = src - warp_id;
73 const U *as_int_src = (
const U *)warp_begin_src;
75 int_store loaded = warp_load<int_store>(as_int_src, warp_id);
77 return detail::fuse<T>(loaded);
81 __device__
typename enable_if<detail::use_direct<T>::value, T>::type
88 __device__
typename enable_if<detail::use_shfl<T>::value>::type
90 int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) &
WARP_MASK;
91 T *warp_begin_dest = dest - warp_id;
93 U *as_int_dest = (U *)warp_begin_dest;
95 int_store lysed = detail::lyse<U>(data);
101 __device__
typename enable_if<detail::use_direct<T>::value>::type
113 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
117 T* base_ptr =
__shfl(src, div);
119 U* result = ((U*)(base_ptr) + mod);
142 template<
int s,
typename T>
150 update_indices<T>(div, mod);
171 template<
int s,
typename T>
176 T* dest,
int div,
int mod) {
179 update_indices<T>(div, mod);
189 T* dest,
int div,
int mod) {
198 int neighbor_idx = (warp_id == 0) ? 0 : warp_id-1;
199 const T* neighbor_ptr =
__shfl(ptr, neighbor_idx);
200 bool neighbor_contiguous = (warp_id == 0) ?
true : (ptr - neighbor_ptr ==
sizeof(T));
201 bool result = __all(neighbor_contiguous);
208 int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) &
WARP_MASK;
217 return detail::fuse<T>(loaded);
233 int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) &
WARP_MASK;
239 u_store lysed = detail::lyse<U>(data);
256 __device__ T
load(
const T* src) {
265 __device__
void store(
const T& data, T* dest) {
__device__ __forceinline__ T __shfl(const T &t, const int &i)
__device__ enable_if< use_divergent< T >::value, T >::type divergent_load(const T *src)
__device__ bool is_contiguous(int warp_id, const T *ptr)
__device__ enable_if< use_shfl< T >::value >::type store_dispatch(const T &data, T *dest)
__device__ enable_if< use_shfl< T >::value, T >::type load_dispatch(const T *src)
__device__ detail::dismember_type< T >::type * compute_address(T *src, int div, int mod)
__device__ enable_if< use_divergent< T >::value >::type divergent_store(const T &data, T *dest)
__device__ void update_indices(int &div, int &mod)
__device__ void store(const T &data, T *dest)
__device__ enable_if< detail::use_shfl< T >::value, T >::type load_warp_contiguous(const T *src)
__device__ bool warp_converged()
__host__ __device__ void warp_store(const Array &t, typename Array::head_type *ptr, int offset, int stride=32)
__device__ enable_if< detail::use_shfl< T >::value >::type store_warp_contiguous(const T &data, T *dest)
__device__ T load(const T *src)
__device__ void r2c_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
__device__ void c2r_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
static const int mod_offset
static const int div_offset
detail::dismember_type< T >::type U
detail::dismember_type< T >::type U
static __device__ array< U, 1 > impl(const T *src, int div, int mod)
detail::dismember_type< T >::type U
static __device__ array< U, s > impl(const T *src, int div, int mod)
detail::dismember_type< T >::type U
static __device__ void impl(const array< U, 1 > &src, T *dest, int div, int mod)
detail::dismember_type< T >::type U
static __device__ void impl(const array< U, s > &src, T *dest, int div, int mod)
dismember_type< T >::type U