QUDA  v1.1.0
A library for QCD on GPUs
aos.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2013, NVIDIA Corporation
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7  * Redistributions of source code must retain the above copyright
8  notice, this list of conditions and the following disclaimer.
9  * Redistributions in binary form must reproduce the above copyright
10  notice, this list of conditions and the following disclaimer in the
11  documentation and/or other materials provided with the distribution.
12  * Neither the name of the <organization> nor the
13  names of its contributors may be used to endorse or promote products
14  derived from this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #pragma once
29 #include <trove/detail/dismember.h>
30 #include <trove/detail/fallback.h>
31 #include <trove/warp.h>
32 #include <trove/transpose.h>
33 #include <trove/utility.h>
34 #include <trove/memory.h>
35 
36 #define WARP_CONVERGED 0xffffffff
37 
38 namespace trove {
39 
40 namespace detail {
41 
// Trait: true when T aliases to more than one but fewer than 64 sub-words
// of its dismembered type U — the size range for which the shuffle-based
// warp transpose is applicable.
template<typename T>
struct size_in_range {
    typedef typename dismember_type<T>::type U;
    // number of U words that alias a single T
    static const int size = aliased_size<T, U>::value;
    static const bool value = (size > 1) && (size < 64);
};
48 
// Trait selecting the shuffle-based transpose path: enabled only when T's
// size is a multiple of a power of two (s) AND its aliased size is in the
// supported range (r). The primary template covers all other combinations.
template<typename T, bool s=size_multiple_power_of_two<T, 2>::value, bool r=size_in_range<T>::value>
struct use_shfl {
    static const bool value = false;
};

// Both conditions hold: shuffle path is usable for T.
template<typename T>
struct use_shfl<T, true, true> {
    static const bool value = true;
};
58 
// Trait: complement of use_shfl — types that fall back to per-thread
// (direct/divergent) loads and stores.
template<typename T>
struct use_direct {
    static const bool value = !(use_shfl<T>::value);
};
63 
64 }
65 
66 
// Warp-cooperative load of one T per lane from a warp-contiguous region:
// lane i reads element i relative to the warp's base pointer. The warp
// loads coalesced U words and redistributes them to their owning lanes via
// a register (r2c) transpose. Precondition: the calling warp is converged.
// NOTE(review): the extracted source had dropped the int_store typedef
// line; it is reconstructed here from the aggregate type used by warp_load.
template<typename T>
__device__ typename enable_if<detail::use_shfl<T>::value, T>::type
load_warp_contiguous(const T* src) {
    int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
    const T *warp_begin_src = src - warp_id;
    typedef typename detail::dismember_type<T>::type U;
    const U *as_int_src = (const U *)warp_begin_src;
    // the array of U words that aliases one T
    typedef array<U, detail::aliased_size<T, U>::value> int_store;
    int_store loaded = warp_load<int_store>(as_int_src, warp_id);
    r2c_warp_transpose(loaded);
    return detail::fuse<T>(loaded);
}
79 
// Fallback overload for types outside the shuffle-transpose size range:
// each lane performs an ordinary (divergent) load of its own element.
template<typename T>
__device__ typename enable_if<detail::use_direct<T>::value, T>::type
load_warp_contiguous(const T* src) {
    return detail::divergent_load(src);
}
85 
86 
// Warp-cooperative store of one T per lane to a warp-contiguous region:
// the value is split (lysed) into U words, transposed (c2r) so that each
// lane holds words destined for coalesced addresses, then stored.
// Precondition: the calling warp is converged.
// NOTE(review): the extracted source had dropped the int_store typedef
// line; it is reconstructed to match the load path's aggregate type.
template<typename T>
__device__ typename enable_if<detail::use_shfl<T>::value>::type
store_warp_contiguous(const T& data, T* dest) {
    int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
    T *warp_begin_dest = dest - warp_id;
    typedef typename detail::dismember_type<T>::type U;
    U *as_int_dest = (U *)warp_begin_dest;
    // the array of U words that aliases one T
    typedef array<U, detail::aliased_size<T, U>::value> int_store;
    int_store lysed = detail::lyse<U>(data);
    c2r_warp_transpose(lysed);
    warp_store(lysed, as_int_dest, warp_id);
}
99 
// Fallback overload for types outside the shuffle-transpose size range:
// each lane performs an ordinary (divergent) store of its own element.
template<typename T>
__device__ typename enable_if<detail::use_direct<T>::value>::type
store_warp_contiguous(const T& data, T* dest) {
    detail::divergent_store(data, dest);
}
105 
106 
107 namespace detail {
108 
// Broadcast lane `div`'s source pointer to the whole warp and return the
// address of the `mod`-th U sub-word within that element. Relies on the
// project's __shfl wrapper for pointer-typed shuffles (trove/warp.h).
// Precondition: the warp is converged (callers have already checked this).
template<typename T>
__device__ typename detail::dismember_type<T>::type*
compute_address(T* src, int div, int mod) {
    typedef typename detail::dismember_type<T>::type U;
#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
// we have already asserted that we have warp convergence here so just use full warp mask
    T* base_ptr = __shfl_sync(WARP_CONVERGED, src, div);
#else
    T* base_ptr = __shfl(src, div);
#endif
    U* result = ((U*)(base_ptr) + mod);
    return result;
}
122 
// Per-type addressing constants for striding a warp through an AoS region
// in units of U: m is the number of U words per T; div/mod offsets advance
// a (element, sub-word) index pair by one full warp of U words.
// NOTE(review): the extracted source had dropped the struct declaration
// and typedef lines; reconstructed from the members referenced below and
// by update_indices.
template<typename T>
struct address_constants {
    typedef typename detail::dismember_type<T>::type U;
    static const int m = aliased_size<T, U>::value;
    static const int mod_offset = WARP_SIZE % m;
    static const int div_offset = WARP_SIZE / m;
};
130 
// Advance a (div, mod) index pair — element index and U sub-word index —
// by one warp-width of U words, carrying mod overflow into div.
// NOTE(review): the extracted source had dropped the mod-advance, mod-wrap
// and div-advance lines; reconstructed from address_constants' members.
template<typename T>
__device__ void update_indices(int& div, int& mod) {
    mod += address_constants<T>::mod_offset;
    if (mod >= address_constants<T>::m) {
        mod -= address_constants<T>::m;
        div += 1;
    }
    div += address_constants<T>::div_offset;
}
140 
141 
// Recursively load s U-words for this lane, walking the (div, mod) index
// pair forward by one warp-width of U words per step.
// NOTE(review): the extracted source had dropped the typedef line for U;
// reconstructed from the Doxygen member listing (aos.h:144).
template<int s, typename T>
struct indexed_load {
    typedef typename detail::dismember_type<T>::type U;
    __device__
    static array<U, s> impl(const T* src, int div, int mod) {
        U result;
        U* address = compute_address(src, div, mod);
        result = *address;
        update_indices<T>(div, mod);
        return array<U, s>(
            result,
            indexed_load<s-1, T>::impl(src, div, mod));
    }
};
158 
// Recursion terminator: load the final U-word; no index update needed.
// NOTE(review): the extracted source had dropped the typedef line for U;
// reconstructed from the Doxygen member listing (aos.h:161).
template<typename T>
struct indexed_load<1, T> {
    typedef typename detail::dismember_type<T>::type U;
    __device__
    static array<U, 1> impl(const T* src, int div, int mod) {
        U result;
        U* address = compute_address(src, div, mod);
        result = *address;
        return array<U, 1>(result);
    }
};
170 
// Recursively store s U-words from this lane's array, walking the
// (div, mod) index pair forward by one warp-width of U words per step.
// NOTE(review): the extracted source had dropped the struct declaration
// and typedef lines; reconstructed from the specialization below and the
// Doxygen member listing (aos.h:173).
template<int s, typename T>
struct indexed_store {
    typedef typename detail::dismember_type<T>::type U;
    __device__
    static void impl(const array<U, s>& src,
                     T* dest, int div, int mod) {
        U* address = compute_address(dest, div, mod);
        *address = src.head;
        update_indices<T>(div, mod);
        indexed_store<s-1, T>::impl(src.tail, dest, div, mod);
    }
};
183 
// Recursion terminator: store the final U-word; no index update needed.
// NOTE(review): the extracted source had dropped the typedef line for U;
// reconstructed from the Doxygen member listing (aos.h:186).
template<typename T>
struct indexed_store<1, T> {
    typedef typename detail::dismember_type<T>::type U;
    __device__
    static void impl(const array<U, 1>& src,
                     T* dest, int div, int mod) {
        U* address = compute_address(dest, div, mod);
        *address = src.head;
    }
};
194 
// Return true when every lane's pointer is exactly one element past its
// lower neighbor's, i.e. the warp addresses a contiguous AoS run.
// Uses the project's __shfl/__all wrappers (trove headers) for the
// pointer-typed shuffle and the warp vote.
// Bug fix: pointer subtraction on T* yields a count of ELEMENTS, so
// adjacency is a difference of 1, not sizeof(T). The original comparison
// `ptr - neighbor_ptr == sizeof(T)` was only correct for 1-byte types.
// NOTE(review): all call sites in this file are commented out, so this fix
// is latent until the contiguity fast path is re-enabled.
template<typename T>
__device__
bool is_contiguous(int warp_id, const T* ptr) {
    int neighbor_idx = (warp_id == 0) ? 0 : warp_id-1;
    const T* neighbor_ptr = __shfl(ptr, neighbor_idx);
    bool neighbor_contiguous = (warp_id == 0) || (ptr - neighbor_ptr == 1);
    bool result = __all(neighbor_contiguous);
    return result;
}
204 
// Shuffle-path load for a converged warp at arbitrary (non-contiguous)
// addresses: each lane gathers its m U-words via warp-broadcast addressing
// (indexed_load), then an r2c transpose hands each lane its own T.
// The contiguity fast-path check is disabled (commented) as in upstream.
// NOTE(review): the extracted source had dropped the u_store typedef and
// the head of the indexed_load call; reconstructed from the types used.
template<typename T>
__device__ typename enable_if<use_shfl<T>::value, T>::type
load_dispatch(const T* src) {
    int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
    // if (detail::is_contiguous(warp_id, src)) {
    //     return detail::load_warp_contiguous(src);
    // } else {
        typedef typename detail::dismember_type<T>::type U;
        typedef array<U, detail::aliased_size<T, U>::value> u_store;
        u_store loaded = indexed_load<detail::aliased_size<T, U>::value, T>::impl(
            src, warp_id / address_constants<T>::m, warp_id % address_constants<T>::m);
        r2c_warp_transpose(loaded);
        return detail::fuse<T>(loaded);
    // }
}
220 
221 
222 
// Direct-path load: types outside the shuffle size range fall back to a
// plain per-thread load.
template<typename T>
__device__ typename enable_if<use_direct<T>::value, T>::type
load_dispatch(const T* src) {
    return detail::divergent_load(src);
}
228 
229 
// Shuffle-path store for a converged warp at arbitrary (non-contiguous)
// addresses: lyse each lane's T into U-words, c2r-transpose so lanes hold
// words for broadcast-addressed slots, then scatter via indexed_store.
// The contiguity fast-path check is disabled (commented) as in upstream.
// NOTE(review): the extracted source had dropped the u_store typedef and
// the head of the indexed_store call; reconstructed to mirror the load path.
template<typename T>
__device__ typename enable_if<use_shfl<T>::value>::type
store_dispatch(const T& data, T* dest) {
    int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
    // if (detail::is_contiguous(warp_id, dest)) {
    //     detail::store_warp_contiguous(data, dest);
    // } else {
        typedef typename detail::dismember_type<T>::type U;
        typedef array<U, detail::aliased_size<T, U>::value> u_store;
        u_store lysed = detail::lyse<U>(data);
        c2r_warp_transpose(lysed);
        indexed_store<detail::aliased_size<T, U>::value, T>::impl(
            lysed, dest, warp_id / address_constants<T>::m,
            warp_id % address_constants<T>::m);
    // }
}
245 
// Direct-path store: types outside the shuffle size range fall back to a
// plain per-thread store.
template<typename T>
__device__ typename enable_if<use_direct<T>::value>::type
store_dispatch(const T& data, T* dest) {
    detail::divergent_store(data, dest);
}
251 
252 
253 }
254 
// Public entry point: load one T per lane. The warp-cooperative dispatch
// is only legal when the whole warp is converged; otherwise fall back to
// an independent per-thread load.
template<typename T>
__device__ T load(const T* src) {
    return warp_converged() ? detail::load_dispatch(src)
                            : detail::divergent_load(src);
}
263 
// Public entry point: store one T per lane. The warp-cooperative dispatch
// is only legal when the whole warp is converged; otherwise fall back to
// an independent per-thread store.
template<typename T>
__device__ void store(const T& data, T* dest) {
    if (!warp_converged()) {
        detail::divergent_store(data, dest);
        return;
    }
    detail::store_dispatch(data, dest);
}
272 
273 }
#define WARP_CONVERGED
Definition: aos.h:36
__device__ __forceinline__ T __shfl(const T &t, const int &i)
Definition: shfl.h:214
Definition: alias.h:4
__device__ enable_if< use_divergent< T >::value, T >::type divergent_load(const T *src)
Definition: fallback.h:70
__device__ bool is_contiguous(int warp_id, const T *ptr)
Definition: aos.h:197
__device__ enable_if< use_shfl< T >::value >::type store_dispatch(const T &data, T *dest)
Definition: aos.h:232
__device__ enable_if< use_shfl< T >::value, T >::type load_dispatch(const T *src)
Definition: aos.h:207
__device__ detail::dismember_type< T >::type * compute_address(T *src, int div, int mod)
Definition: aos.h:111
__device__ enable_if< use_divergent< T >::value >::type divergent_store(const T &data, T *dest)
Definition: fallback.h:116
__device__ void update_indices(int &div, int &mod)
Definition: aos.h:132
Definition: aos.h:38
__device__ void store(const T &data, T *dest)
Definition: aos.h:265
__device__ enable_if< detail::use_shfl< T >::value, T >::type load_warp_contiguous(const T *src)
Definition: aos.h:69
__device__ bool warp_converged()
Definition: warp.h:35
__host__ __device__ void warp_store(const Array &t, typename Array::head_type *ptr, int offset, int stride=32)
Definition: memory.h:137
__device__ enable_if< detail::use_shfl< T >::value >::type store_warp_contiguous(const T &data, T *dest)
Definition: aos.h:89
__device__ T load(const T *src)
Definition: aos.h:256
__device__ void r2c_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
Definition: transpose.h:655
__device__ void c2r_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
Definition: transpose.h:621
head_type head
Definition: array.h:38
tail_type tail
Definition: array.h:39
static const int mod_offset
Definition: aos.h:127
static const int div_offset
Definition: aos.h:128
detail::dismember_type< T >::type U
Definition: aos.h:125
static const int m
Definition: aos.h:126
detail::dismember_type< T >::type U
Definition: aos.h:161
static __device__ array< U, 1 > impl(const T *src, int div, int mod)
Definition: aos.h:163
detail::dismember_type< T >::type U
Definition: aos.h:144
static __device__ array< U, s > impl(const T *src, int div, int mod)
Definition: aos.h:146
detail::dismember_type< T >::type U
Definition: aos.h:186
static __device__ void impl(const array< U, 1 > &src, T *dest, int div, int mod)
Definition: aos.h:188
detail::dismember_type< T >::type U
Definition: aos.h:173
static __device__ void impl(const array< U, s > &src, T *dest, int div, int mod)
Definition: aos.h:175
dismember_type< T >::type U
Definition: aos.h:44
static const int size
Definition: aos.h:45
static const bool value
Definition: aos.h:46
static const bool value
Definition: aos.h:61
static const bool value
Definition: aos.h:51
#define WARP_MASK
Definition: warp.h:46
#define WARP_SIZE
Definition: warp.h:45