QUDA  v1.1.0
A library for QCD on GPUs
shfl.h
Go to the documentation of this file.
1 #pragma once
3 #include <thrust/detail/static_assert.h>
4 
5 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
6 
7 namespace detail {
8 
9 template<int s>
10 struct shuffle {
11  __device__ __forceinline__
12  static void impl(unsigned int mask, array<int, s>& d, const int& i) {
13  d.head = __shfl_sync(mask, d.head, i);
14  shuffle<s-1>::impl(mask, d.tail, i);
15  }
16 };
17 
18 template<>
19 struct shuffle<1> {
20  __device__ __forceinline__
21  static void impl(unsigned int mask, array<int, 1>& d, const int& i) {
22  d.head = __shfl_sync(mask, d.head, i);
23  }
24 };
25 
26 template<int s>
27 struct shuffle_down {
28  __device__ __forceinline__
29  static void impl(unsigned int mask, array<int, s>& d, const int& i) {
30  d.head = __shfl_down_sync(mask, d.head, i);
31  shuffle_down<s-1>::impl(mask, d.tail, i);
32  }
33 };
34 
35 template<>
36 struct shuffle_down<1> {
37  __device__ __forceinline__
38  static void impl(unsigned int mask, array<int, 1>& d, const int& i) {
39  d.head = __shfl_down_sync(mask, d.head, i);
40  }
41 };
42 
43 template<int s>
44 struct shuffle_up {
45  __device__ __forceinline__
46  static void impl(unsigned int mask, array<int, s>& d, const int& i) {
47  d.head = __shfl_up_sync(mask, d.head, i);
48  shuffle_up<s-1>::impl(mask, d.tail, i);
49  }
50 };
51 
52 template<>
53 struct shuffle_up<1> {
54  __device__ __forceinline__
55  static void impl(unsigned int mask, array<int, 1>& d, const int& i) {
56  d.head = __shfl_up_sync(mask, d.head, i);
57  }
58 };
59 
60 template<int s>
61 struct shuffle_xor {
62  __device__ __forceinline__
63  static void impl(unsigned int mask, array<int, s>& d, const int& i) {
64  d.head = __shfl_xor_sync(mask, d.head, i);
65  shuffle_xor<s-1>::impl(mask, d.tail, i);
66  }
67 };
68 
69 template<>
70 struct shuffle_xor<1> {
71  __device__ __forceinline__
72  static void impl(unsigned int mask, array<int, 1>& d, const int& i) {
73  d.head = __shfl_xor_sync(mask, d.head, i);
74  }
75 };
76 
77 
78 }
79 
80 template<typename T>
81 __device__ __forceinline__
82  T __shfl_sync(unsigned int mask, const T& t, const int& i) {
83  //X If you get a compiler error on this line, it is because
84  //X sizeof(T) is not divisible by 4, and so this type is not
85  //X supported currently.
87 
88  typedef typename detail::working_array<T>::type aliased;
89  aliased lysed = detail::lyse<int>(t);
91  return detail::fuse<T>(lysed);
92 }
93 
94 template<typename T>
95 __device__ __forceinline__
96 T __shfl_down_sync(unsigned int mask, const T& t, const int& i) {
97  //X If you get a compiler error on this line, it is because
98  //X sizeof(T) is not divisible by 4, and so this type is not
99  //X supported currently.
100  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
101 
102  typedef typename detail::working_array<T>::type aliased;
103  aliased lysed = detail::lyse<int>(t);
105  return detail::fuse<T>(lysed);
106 }
107 
108 template<typename T>
109 __device__ __forceinline__
110 T __shfl_up_sync(unsigned int mask, const T& t, const int& i) {
111  //X If you get a compiler error on this line, it is because
112  //X sizeof(T) is not divisible by 4, and so this type is not
113  //X supported currently.
114  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
115 
116  typedef typename detail::working_array<T>::type aliased;
117  aliased lysed = detail::lyse<int>(t);
119  return detail::fuse<T>(lysed);
120 }
121 
122 template<typename T>
123 __device__ __forceinline__
124 T __shfl_xor_sync(unsigned int mask, const T& t, const int& i) {
125  //X If you get a compiler error on this line, it is because
126  //X sizeof(T) is not divisible by 4, and so this type is not
127  //X supported currently.
128  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
129 
130  typedef typename detail::working_array<T>::type aliased;
131  aliased lysed = detail::lyse<int>(t);
133  return detail::fuse<T>(lysed);
134 }
135 
136 
137 #else
138 
139 namespace detail {
140 
141 template<int s>
142 struct shuffle {
143  __device__ __forceinline__
144  static void impl(array<int, s>& d, const int& i) {
145  d.head = __shfl(d.head, i);
146  shuffle<s-1>::impl(d.tail, i);
147  }
148 };
149 
150 template<>
151 struct shuffle<1> {
152  __device__ __forceinline__
153  static void impl(array<int, 1>& d, const int& i) {
154  d.head = __shfl(d.head, i);
155  }
156 };
157 
158 template<int s>
159 struct shuffle_down {
160  __device__ __forceinline__
161  static void impl(array<int, s>& d, const int& i) {
162  d.head = __shfl_down(d.head, i);
164  }
165 };
166 
167 template<>
168 struct shuffle_down<1> {
169  __device__ __forceinline__
170  static void impl(array<int, 1>& d, const int& i) {
171  d.head = __shfl_down(d.head, i);
172  }
173 };
174 
175 template<int s>
176 struct shuffle_up {
177  __device__ __forceinline__
178  static void impl(array<int, s>& d, const int& i) {
179  d.head = __shfl_up(d.head, i);
181  }
182 };
183 
184 template<>
185 struct shuffle_up<1> {
186  __device__ __forceinline__
187  static void impl(array<int, 1>& d, const int& i) {
188  d.head = __shfl_up(d.head, i);
189  }
190 };
191 
192 template<int s>
193 struct shuffle_xor {
194  __device__ __forceinline__
195  static void impl(array<int, s>& d, const int& i) {
196  d.head = __shfl_xor(d.head, i);
198  }
199 };
200 
201 template<>
202 struct shuffle_xor<1> {
203  __device__ __forceinline__
204  static void impl(array<int, 1>& d, const int& i) {
205  d.head = __shfl_xor(d.head, i);
206  }
207 };
208 
209 
210 }
211 
212 template<typename T>
213 __device__ __forceinline__
214 T __shfl(const T& t, const int& i) {
215 
216  //X If you get a compiler error on this line, it is because
217  //X sizeof(T) is not divisible by 4, and so this type is not
218  //X supported currently.
219  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
220 
221  typedef typename detail::working_array<T>::type aliased;
222  aliased lysed = detail::lyse<int>(t);
224  return detail::fuse<T>(lysed);
225 }
226 
227 template<typename T>
228 __device__ __forceinline__
229 T __shfl_down(const T& t, const int& i) {
230  //X If you get a compiler error on this line, it is because
231  //X sizeof(T) is not divisible by 4, and so this type is not
232  //X supported currently.
233  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
234 
235  typedef typename detail::working_array<T>::type aliased;
236  aliased lysed = detail::lyse<int>(t);
238  return detail::fuse<T>(lysed);
239 }
240 
241 template<typename T>
242 __device__ __forceinline__
243 T __shfl_up(const T& t, const int& i) {
244  //X If you get a compiler error on this line, it is because
245  //X sizeof(T) is not divisible by 4, and so this type is not
246  //X supported currently.
247  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
248 
249  typedef typename detail::working_array<T>::type aliased;
250  aliased lysed = detail::lyse<int>(t);
252  return detail::fuse<T>(lysed);
253 }
254 
255 template<typename T>
256 __device__ __forceinline__
257 T __shfl_xor(const T& t, const int& i) {
258 
259  //X If you get a compiler error on this line, it is because
260  //X sizeof(T) is not divisible by 4, and so this type is not
261  //X supported currently.
262  THRUST_STATIC_ASSERT((detail::size_multiple_power_of_two<T, 2>::value));
263 
264  typedef typename detail::working_array<T>::type aliased;
265  aliased lysed = detail::lyse<int>(t);
267  return detail::fuse<T>(lysed);
268 }
269 #endif
__device__ __forceinline__ T __shfl(const T &t, const int &i)
Definition: shfl.h:214
__device__ __forceinline__ T __shfl_down(const T &t, const int &i)
Definition: shfl.h:229
__device__ __forceinline__ T __shfl_xor(const T &t, const int &i)
Definition: shfl.h:257
__device__ __forceinline__ T __shfl_up(const T &t, const int &i)
Definition: shfl.h:243
Definition: alias.h:4
tail_type tail
Definition: array.h:12
head_type head
Definition: array.h:11
__device__ static __forceinline__ void impl(array< int, 1 > &d, const int &i)
Definition: shfl.h:153
__device__ static __forceinline__ void impl(array< int, 1 > &d, const int &i)
Definition: shfl.h:170
__device__ static __forceinline__ void impl(array< int, s > &d, const int &i)
Definition: shfl.h:161
__device__ static __forceinline__ void impl(array< int, 1 > &d, const int &i)
Definition: shfl.h:187
__device__ static __forceinline__ void impl(array< int, s > &d, const int &i)
Definition: shfl.h:178
__device__ static __forceinline__ void impl(array< int, 1 > &d, const int &i)
Definition: shfl.h:204
__device__ static __forceinline__ void impl(array< int, s > &d, const int &i)
Definition: shfl.h:195
__device__ static __forceinline__ void impl(array< int, s > &d, const int &i)
Definition: shfl.h:144