quda-ref/v1.1.0/aos_8h_source.html

 /*

 Copyright (c) 2013, NVIDIA Corporation

 All rights reserved.


 Redistribution and use in source and binary forms, with or without

 modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright

       notice, this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright

       notice, this list of conditions and the following disclaimer in the

       documentation and/or other materials provided with the distribution.

     * Neither the name of the <organization> nor the

       names of its contributors may be used to endorse or promote products

       derived from this software without specific prior written permission.


 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY

 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */


 #pragma once

 #include <trove/detail/dismember.h>

 #include <trove/detail/fallback.h>

 #include <trove/warp.h>

 #include <trove/transpose.h>

 #include <trove/utility.h>

 #include <trove/memory.h>


 // Participation mask with all 32 bits set.  It is handed to __shfl_sync in
 // this file at points where the whole warp is taken to be executing together.
 #define WARP_CONVERGED 0xffffffff


 namespace trove {


 namespace detail {


 // Compile-time predicate on the word count of T.
 //   U     : word type selected by dismember_type<T>
 //   size  : aliased_size<T, U>::value (presumably the number of U words
 //           that cover one T -- confirm in dismember.h)
 //   value : true only when 2 <= size <= 63
 template<typename T>
 struct size_in_range {
     typedef typename dismember_type<T>::type U;
     static const int size = aliased_size<T, U>::value;
     static const bool value = (size >= 2) && (size <= 63);
 };


 // Decides whether T takes the shuffle-based path.
 // value is true exactly when both defaulted predicates are true:
 //   s : size_multiple_power_of_two<T, 2>::value
 //   r : size_in_range<T>::value
 // Any other combination of the two flags yields false.
 template<typename T,
          bool s = size_multiple_power_of_two<T, 2>::value,
          bool r = size_in_range<T>::value>
 struct use_shfl {
     static const bool value = s && r;
 };


 // Logical complement of use_shfl: value is true exactly when
 // use_shfl<T>::value is false.
 template<typename T>
 struct use_direct {
     static const bool value = use_shfl<T>::value ? false : true;
 };


 }


 // Shuffle-path overload, enabled when detail::use_shfl<T>::value is true.
 //
 // src is treated as the calling lane's element in a run of T objects laid
 // out one per lane: the pointer is rewound by the lane index, the run is
 // read through warp_load as words of type U, and r2c_warp_transpose followed
 // by detail::fuse rebuilds this lane's T.
 //
 // NOTE(review): nothing here checks that the per-lane pointers really are
 // consecutive or that all lanes participate -- presumably a caller
 // precondition; confirm before using.
 template<typename T>
 __device__ typename enable_if<detail::use_shfl<T>::value, T>::type
 load_warp_contiguous(const T* src) {
   typedef typename detail::dismember_type<T>::type U;
   typedef array<U, detail::aliased_size<T, U>::value> int_store;

   // Flatten the 3-D thread index and keep only the bits selected by WARP_MASK.
   const int lane =
       ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;

   // Step back to where lane 0's element sits and view that memory as words.
   const U* words = (const U*)(src - lane);

   int_store fragments = warp_load<int_store>(words, lane);
   r2c_warp_transpose(fragments);
   return detail::fuse<T>(fragments);
 }


 // Direct-path overload, enabled when detail::use_direct<T>::value is true
 // (i.e. detail::use_shfl<T>::value is false).  It performs no warp
 // cooperation of its own and simply forwards src to detail::divergent_load.
 template<typename T>

 __device__ typename enable_if<detail::use_direct<T>::value, T>::type

 load_warp_contiguous(const T* src) {

     return detail::divergent_load(src);

 }


 // Shuffle-path overload, enabled when detail::use_shfl<T>::value is true.
 //
 // data is split into words of type U with detail::lyse, rearranged with
 // c2r_warp_transpose, and written through warp_store starting at the
 // address obtained by rewinding dest by the lane index.
 //
 // NOTE(review): as with the matching load, dest is treated as this lane's
 // slot in a run of consecutive T objects; that is not verified here.
 template<typename T>
 __device__ typename enable_if<detail::use_shfl<T>::value>::type
 store_warp_contiguous(const T& data, T* dest) {
   typedef typename detail::dismember_type<T>::type U;
   typedef array<U, detail::aliased_size<T, U>::value> int_store;

   // Flatten the 3-D thread index and keep only the bits selected by WARP_MASK.
   const int lane =
       ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;

   int_store fragments = detail::lyse<U>(data);
   c2r_warp_transpose(fragments);

   // Word-typed view of the location lane 0's element occupies.
   U* words = (U*)(dest - lane);
   warp_store(fragments, words, lane);
 }


 // Direct-path overload, enabled when detail::use_direct<T>::value is true
 // (i.e. detail::use_shfl<T>::value is false).  It simply forwards data and
 // dest to detail::divergent_store.
 template<typename T>

 __device__ typename enable_if<detail::use_direct<T>::value>::type

 store_warp_contiguous(const T& data, T* dest) {

     detail::divergent_store(data, dest);

 }


 namespace detail {


 // Returns a pointer to word number `mod` (word type U = dismember_type<T>::type)
 // counted from the src pointer held by lane `div`.  Every lane supplies its
 // own src; the one belonging to lane `div` is fetched with a warp shuffle.
 //
 // On CUDA 9 and later the shuffle uses the full-warp mask WARP_CONVERGED, so
 // the warp is expected to reach this call together.
 template<typename T>
 __device__ typename detail::dismember_type<T>::type*
 compute_address(T* src, int div, int mod) {
     typedef typename detail::dismember_type<T>::type U;
 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
     // Warp convergence has already been asserted upstream, hence the full mask.
     T* const lane_base = __shfl_sync(WARP_CONVERGED, src, div);
 #else
     T* const lane_base = __shfl(src, div);
 #endif
     return (U*)(lane_base) + mod;
 }


 // Per-type constants used when a (div, mod) word index is advanced by one
 // warp width:
 //   m          = aliased_size<T, U>::value
 //   div_offset = WARP_SIZE / m   (quotient)
 //   mod_offset = WARP_SIZE % m   (remainder)
 template<typename T>
 struct address_constants {
     typedef typename detail::dismember_type<T>::type U;
     static const int m = aliased_size<T, U>::value;
     static const int div_offset = WARP_SIZE / m;
     static const int mod_offset = WARP_SIZE % m;
 };


 // Advances the pair (div, mod), read as the position div * m + mod, by
 // WARP_SIZE words, where m = address_constants<T>::m.  The remainder part is
 // added to mod; if that reaches m it wraps and carries one into div, and the
 // quotient part is then added to div.  Both arguments are updated in place.
 template<typename T>
 __device__ void update_indices(int& div, int& mod) {
     typedef address_constants<T> consts;

     const int advanced = mod + consts::mod_offset;
     const bool wrapped = advanced >= consts::m;

     mod = wrapped ? advanced - consts::m : advanced;
     div += consts::div_offset + (wrapped ? 1 : 0);
 }


 // Recursive case: reads s words, one per recursion level.
 // Each level reads the word addressed by (div, mod) via compute_address,
 // advances the indices with update_indices, and prepends the word to the
 // array produced by the (s-1)-word load.
 template<int s, typename T>
 struct indexed_load {
     typedef typename detail::dismember_type<T>::type U;

     __device__
     static array<U, s> impl(const T* src, int div, int mod) {
         // Read this level's word before the indices move on.
         U head_word = *compute_address(src, div, mod);
         update_indices<T>(div, mod);

         typedef indexed_load<s - 1, T> rest;
         return array<U, s>(head_word, rest::impl(src, div, mod));
     }
 };


 // Terminal case of the recursion: reads the single word addressed by
 // (div, mod) and wraps it in a one-element array.  No index update is
 // needed because nothing follows.
 template<typename T>
 struct indexed_load<1, T> {
     typedef typename detail::dismember_type<T>::type U;

     __device__
     static array<U, 1> impl(const T* src, int div, int mod) {
         U last_word = *compute_address(src, div, mod);
         return array<U, 1>(last_word);
     }
 };


 // Recursive case: writes s words, one per recursion level.
 // Each level stores src.head at the address given by compute_address for
 // (div, mod), advances the indices with update_indices, and hands src.tail
 // to the (s-1)-word store.
 template<int s, typename T>
 struct indexed_store {
     typedef typename detail::dismember_type<T>::type U;

     __device__
     static void impl(const array<U, s>& src,
                      T* dest, int div, int mod) {
         *compute_address(dest, div, mod) = src.head;
         update_indices<T>(div, mod);

         typedef indexed_store<s - 1, T> rest;
         rest::impl(src.tail, dest, div, mod);
     }
 };


 // Terminal case of the recursion: stores the one remaining word, src.head,
 // at the address compute_address returns for (div, mod).
 template<typename T>
 struct indexed_store<1, T> {
     typedef typename detail::dismember_type<T>::type U;

     __device__
     static void impl(const array<U, 1>& src,
                      T* dest, int div, int mod) {
         *compute_address(dest, div, mod) = src.head;
     }
 };


 // Reports whether the pointers held by the lanes of a warp form one
 // consecutive run of T objects, i.e. each lane's ptr is exactly one element
 // past the ptr of the lane before it.
 //
 //   warp_id : index of the calling lane within its warp
 //   ptr     : this lane's element pointer
 //   returns : the same answer on every lane; true only if every lane other
 //             than lane 0 sits one element after its predecessor
 //
 // On CUDA 9 and later the full-warp mask WARP_CONVERGED is used, matching
 // compute_address, so the whole warp must call this together.
 template<typename T>
 __device__
 bool is_contiguous(int warp_id, const T* ptr) {
     // Lane 0 has no predecessor; it reads its own pointer and is exempted below.
     int neighbor_idx = (warp_id == 0) ? 0 : warp_id-1;
 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
     const T* neighbor_ptr = __shfl_sync(WARP_CONVERGED, ptr, neighbor_idx);
 #else
     const T* neighbor_ptr = __shfl(ptr, neighbor_idx);
 #endif
     // Subtracting two T pointers yields a count of elements, not bytes, so
     // adjacent elements differ by exactly 1.
     bool neighbor_contiguous = (warp_id == 0) ? true : (ptr - neighbor_ptr == 1);
 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
     bool result = (__all_sync(WARP_CONVERGED, neighbor_contiguous) != 0);
 #else
     bool result = (__all(neighbor_contiguous) != 0);
 #endif
     return result;
 }


 // Shuffle-path overload, enabled when use_shfl<T>::value is true.
 //
 // Reads aliased_size<T, U>::value words through indexed_load, starting at
 // the (div, mod) pair derived from the lane index and address_constants<T>::m,
 // then applies r2c_warp_transpose and fuses the words into a T.
 template<typename T>
 __device__ typename enable_if<use_shfl<T>::value, T>::type
 load_dispatch(const T* src) {
   typedef typename detail::dismember_type<T>::type U;
   typedef address_constants<T> consts;
   typedef array<U, detail::aliased_size<T, U>::value> u_store;
   typedef detail::indexed_load<detail::aliased_size<T, U>::value, T> loader;

   // Flatten the 3-D thread index and keep only the bits selected by WARP_MASK.
   const int lane =
       ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;

   // Disabled fast path, retained for reference:
   // if (detail::is_contiguous(lane, src)) {
   //     return detail::load_warp_contiguous(src);
   // } else { <indexed path below> }

   const int first_div = lane / consts::m;
   const int first_mod = lane % consts::m;

   u_store fragments = loader::impl(src, first_div, first_mod);
   r2c_warp_transpose(fragments);
   return detail::fuse<T>(fragments);
 }


 // Direct-path overload, enabled when use_direct<T>::value is true
 // (i.e. use_shfl<T>::value is false).  Forwards src to
 // detail::divergent_load without any warp cooperation of its own.
 template<typename T>

 __device__ typename enable_if<use_direct<T>::value, T>::type

 load_dispatch(const T* src) {

     return detail::divergent_load(src);

 }


 // Shuffle-path overload, enabled when use_shfl<T>::value is true.
 //
 // Splits data into words with detail::lyse, applies c2r_warp_transpose, and
 // writes the words through indexed_store, starting at the (div, mod) pair
 // derived from the lane index and address_constants<T>::m.
 template<typename T>
 __device__ typename enable_if<use_shfl<T>::value>::type
 store_dispatch(const T& data, T* dest) {
   typedef typename detail::dismember_type<T>::type U;
   typedef address_constants<T> consts;
   typedef array<U, detail::aliased_size<T, U>::value> u_store;
   typedef detail::indexed_store<detail::aliased_size<T, U>::value, T> storer;

   // Flatten the 3-D thread index and keep only the bits selected by WARP_MASK.
   const int lane =
       ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;

   // Disabled fast path, retained for reference:
   // if (detail::is_contiguous(lane, dest)) {
   //     detail::store_warp_contiguous(data, dest);
   // } else { <indexed path below> }

   u_store fragments = detail::lyse<U>(data);
   c2r_warp_transpose(fragments);

   const int first_div = lane / consts::m;
   const int first_mod = lane % consts::m;
   storer::impl(fragments, dest, first_div, first_mod);
 }


 // Direct-path overload, enabled when use_direct<T>::value is true
 // (i.e. use_shfl<T>::value is false).  Forwards data and dest to
 // detail::divergent_store.
 template<typename T>

 __device__ typename enable_if<use_direct<T>::value>::type

 store_dispatch(const T& data, T* dest) {

     detail::divergent_store(data, dest);

 }


 }


 // Public entry point: returns the T stored at src.
 // When warp_converged() reports false the read goes through
 // detail::divergent_load; otherwise it goes through detail::load_dispatch,
 // which picks the shuffle or direct path from the type T.
 template<typename T>
 __device__ T load(const T* src) {
     if (!warp_converged()) {
         return detail::divergent_load(src);
     }
     return detail::load_dispatch(src);
 }


 // Public entry point: writes data to dest.
 // When warp_converged() reports false the write goes through
 // detail::divergent_store; otherwise it goes through detail::store_dispatch,
 // which picks the shuffle or direct path from the type T.
 template<typename T>
 __device__ void store(const T& data, T* dest) {
     if (!warp_converged()) {
         detail::divergent_store(data, dest);
         return;
     }
     detail::store_dispatch(data, dest);
 }


 }

WARP_CONVERGED
#define WARP_CONVERGED
Definition: aos.h:36

dismember.h

fallback.h

__shfl
__device__ __forceinline__ T __shfl(const T &t, const int &i)
Definition: shfl.h:214

memory.h

detail
Definition: alias.h:4

trove::detail::divergent_load
__device__ enable_if< use_divergent< T >::value, T >::type divergent_load(const T *src)
Definition: fallback.h:70

trove::detail::is_contiguous
__device__ bool is_contiguous(int warp_id, const T *ptr)
Definition: aos.h:197

trove::detail::store_dispatch
__device__ enable_if< use_shfl< T >::value >::type store_dispatch(const T &data, T *dest)
Definition: aos.h:232

trove::detail::load_dispatch
__device__ enable_if< use_shfl< T >::value, T >::type load_dispatch(const T *src)
Definition: aos.h:207

trove::detail::compute_address
__device__ detail::dismember_type< T >::type * compute_address(T *src, int div, int mod)
Definition: aos.h:111

trove::detail::divergent_store
__device__ enable_if< use_divergent< T >::value >::type divergent_store(const T &data, T *dest)
Definition: fallback.h:116

trove::detail::update_indices
__device__ void update_indices(int &div, int &mod)
Definition: aos.h:132

trove
Definition: aos.h:38

trove::store
__device__ void store(const T &data, T *dest)
Definition: aos.h:265

trove::load_warp_contiguous
__device__ enable_if< detail::use_shfl< T >::value, T >::type load_warp_contiguous(const T *src)
Definition: aos.h:69

trove::warp_converged
__device__ bool warp_converged()
Definition: warp.h:35

trove::warp_store
__host__ __device__ void warp_store(const Array &t, typename Array::head_type *ptr, int offset, int stride=32)
Definition: memory.h:137

trove::store_warp_contiguous
__device__ enable_if< detail::use_shfl< T >::value >::type store_warp_contiguous(const T &data, T *dest)
Definition: aos.h:89

trove::load
__device__ T load(const T *src)
Definition: aos.h:256

trove::r2c_warp_transpose
__device__ void r2c_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
Definition: transpose.h:655

trove::c2r_warp_transpose
__device__ void c2r_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
Definition: transpose.h:621

trove::array
Definition: array.h:33

trove::array::head
head_type head
Definition: array.h:38

trove::array::tail
tail_type tail
Definition: array.h:39

trove::detail::address_constants
Definition: aos.h:124

trove::detail::address_constants::mod_offset
static const int mod_offset
Definition: aos.h:127

trove::detail::address_constants::div_offset
static const int div_offset
Definition: aos.h:128

trove::detail::address_constants::U
detail::dismember_type< T >::type U
Definition: aos.h:125

trove::detail::address_constants::m
static const int m
Definition: aos.h:126

trove::detail::aliased_size
Definition: dismember.h:62

trove::detail::dismember_type::type
char type
Definition: dismember.h:42

trove::detail::indexed_load< 1, T >::U
detail::dismember_type< T >::type U
Definition: aos.h:161

trove::detail::indexed_load< 1, T >::impl
static __device__ array< U, 1 > impl(const T *src, int div, int mod)
Definition: aos.h:163

trove::detail::indexed_load
Definition: aos.h:143

trove::detail::indexed_load::U
detail::dismember_type< T >::type U
Definition: aos.h:144

trove::detail::indexed_load::impl
static __device__ array< U, s > impl(const T *src, int div, int mod)
Definition: aos.h:146

trove::detail::indexed_store< 1, T >::U
detail::dismember_type< T >::type U
Definition: aos.h:186

trove::detail::indexed_store< 1, T >::impl
static __device__ void impl(const array< U, 1 > &src, T *dest, int div, int mod)
Definition: aos.h:188

trove::detail::indexed_store
Definition: aos.h:172

trove::detail::indexed_store::U
detail::dismember_type< T >::type U
Definition: aos.h:173

trove::detail::indexed_store::impl
static __device__ void impl(const array< U, s > &src, T *dest, int div, int mod)
Definition: aos.h:175

trove::detail::size_in_range
Definition: aos.h:43

trove::detail::size_in_range::U
dismember_type< T >::type U
Definition: aos.h:44

trove::detail::size_in_range::size
static const int size
Definition: aos.h:45

trove::detail::size_in_range::value
static const bool value
Definition: aos.h:46

trove::detail::use_direct
Definition: aos.h:60

trove::detail::use_direct::value
static const bool value
Definition: aos.h:61

trove::detail::use_shfl
Definition: aos.h:50

trove::detail::use_shfl::value
static const bool value
Definition: aos.h:51

trove::enable_if
Definition: utility.h:149

transpose.h

utility.h

warp.h

WARP_MASK
#define WARP_MASK
Definition: warp.h:46

WARP_SIZE
#define WARP_SIZE
Definition: warp.h:45