35 template<
typename Array>
38 template<
typename T,
int s>
40 __host__ __device__
static void impl(
42 T* ptr,
int offset,
int stride) {
45 d.
tail, ptr, offset + stride, stride);
51 __host__ __device__
static void impl(
53 T* ptr,
int offset,
int stride) {
58 template<
typename Array>
61 template<
typename T,
int s>
63 __host__ __device__
static void impl(
72 __host__ __device__
static void impl(
85 __host__ __device__
static void impl(
92 __host__ __device__
static void impl(
101 template<
typename Array>
104 template<
typename T,
int s>
136 template<
typename Array>
138 typename Array::head_type* ptr,
139 int offset,
int stride=32) {
143 template<
typename Array>
144 __host__ __device__ Array
warp_load(
const typename Array::head_type* ptr,
145 int offset,
int stride=32) {
149 template<
typename Array>
151 const volatile typename Array::head_type* ptr,
152 int offset,
int stride=32) {
156 template<
typename Array>
158 typename Array::head_type* ptr,
163 template<
typename Array>
165 volatile typename Array::head_type* ptr,
__host__ __device__ void uncoalesced_store(const Array &t, typename Array::head_type *ptr, int stride=1)
__host__ __device__ void warp_store(const Array &t, typename Array::head_type *ptr, int offset, int stride=32)
__host__ __device__ Array warp_load(const typename Array::head_type *ptr, int offset, int stride=32)
__host__ static __device__ void impl(const array< T, 1 > &d, volatile T *ptr, int offset=0, int stride=1)
__host__ static __device__ void impl(const array< T, 1 > &d, T *ptr, int offset=0, int stride=1)
__host__ static __device__ void impl(const array< T, s > &d, volatile T *ptr, int offset=0, int stride=1)
__host__ static __device__ void impl(const array< T, s > &d, T *ptr, int offset=0, int stride=1)
__host__ static __device__ array< T, 1 > impl(const volatile T *ptr, int offset, int stride=32)
__host__ static __device__ array< T, 1 > impl(const T *ptr, int offset, int stride=32)
__host__ static __device__ array< T, s > impl(const volatile T *ptr, int offset, int stride=32)
__host__ static __device__ array< T, s > impl(const T *ptr, int offset, int stride=32)
__host__ static __device__ void impl(const array< T, 1 > &d, T *ptr, int offset, int stride)
__host__ static __device__ void impl(const array< T, s > &d, T *ptr, int offset, int stride)